From b0444bb43546d71b8b8b995b47b597fde6988bb2 Mon Sep 17 00:00:00 2001 From: Jo Vandeginste Date: Thu, 11 Dec 2025 13:48:22 +0100 Subject: [PATCH] Batch requests for docker image size We have a registry with many thousands of images. The function to calculate the size of the docker images sometimes takes down the whole Artifactory instance because of the required memory. This happens even when only a few images are returned by the initial query, since the size of _all_ images is then requested. Instead of querying the size of _every_ asset on the registry, this function now batches the artifacts in slices of 40 (there are limits on the query size), and requests the size of those artifact paths per slice. It then fills in the size information for those artifacts in the internal dictionary, as before. This means the query is a lot more efficient for us: - The size query takes a few seconds instead of a few minutes. - Especially for queries returning only a few results! - No single (background) query should take down the whole server. We provide an environment variable `ARTIFACTORY_CLEANUP_DOCKER_RULE_BATCH_SIZE` to override this size. 
Signed-off-by: Jo Vandeginste --- artifactory_cleanup/rules/docker.py | 40 ++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/artifactory_cleanup/rules/docker.py b/artifactory_cleanup/rules/docker.py index 2b11d08..50865c1 100644 --- a/artifactory_cleanup/rules/docker.py +++ b/artifactory_cleanup/rules/docker.py @@ -10,6 +10,8 @@ from artifactory_cleanup.rules.base import ArtifactsList from artifactory_cleanup.rules.utils import to_masks +import os + ctx_mgr_block, ctx_mgr_test = get_context_managers() @@ -20,6 +22,7 @@ class RuleForDocker(Rule): MANIFEST_FILENAME = "manifest.json" FAT_MANIFEST_FILENAME = "list.manifest.json" + BATCH_SIZE = int(os.getenv("ARTIFACTORY_CLEANUP_DOCKER_RULE_BATCH_SIZE", 40)) def get_docker_images_list(self, docker_repo): url = f"/api/docker/{docker_repo}/v2/_catalog" @@ -60,21 +63,40 @@ def _collect_docker_size(self, artifacts): if sizes_collected: return - docker_repos = list(set(x["repo"] for x in artifacts)) - if docker_repos: + images_sizes = defaultdict(int) + print("Fetching docker image sizes...") + for i in range(0, len(artifacts), self.BATCH_SIZE): + + slice = artifacts[i : i + self.BATCH_SIZE] + paths = { + "$or": [ + { + "$and": [ + { + "repo": artifact["repo"], + "path": f"{artifact['path']}/{artifact['name']}", + } + ] + } + for artifact in slice + ] + } + aql = ArtifactoryPath(self.session.base_url, session=self.session) - args = ["items.find", {"$or": [{"repo": repo} for repo in docker_repos]}] - artifacts_list = aql.aql(*args) + args = [ + "items.find", + paths, + ] - images_sizes = defaultdict(int) + artifacts_list = aql.aql(*args) for docker_layer in artifacts_list: image_key = (docker_layer["repo"], docker_layer["path"]) images_sizes[image_key] += docker_layer["size"] - for artifact in artifacts: - image = f"{artifact['path']}/{artifact['name']}" - image_key = (artifact["repo"], image) - artifact["size"] = images_sizes.get(image_key, 0) + for artifact in artifacts: + image = 
f"{artifact['path']}/{artifact['name']}" + image_key = (artifact["repo"], image) + artifact["size"] = images_sizes.get(image_key, 0) def aql_add_filter(self, filters): filters.append({'$or': [{'name': {'$match': self.MANIFEST_FILENAME}}, {'name': {'$match': self.FAT_MANIFEST_FILENAME}}]})