PATH WALK III: Add 'git backfill' command #1820
base: api-upstream
.gitignore
@@ -20,6 +20,7 @@
/git-apply
/git-archimport
/git-archive
/git-backfill
/git-bisect
/git-blame
/git-branch
Documentation/git-backfill.txt
@@ -0,0 +1,68 @@
git-backfill(1)
===============

NAME
----
git-backfill - Download missing objects in a partial clone

SYNOPSIS
On the Git mailing list, Patrick Steinhardt wrote (reply to this):

On Fri, Dec 06, 2024 at 08:07:17PM +0000, Derrick Stolee via GitGitGadget wrote:
> From: Derrick Stolee <[email protected]>
>
> One way to significantly reduce the cost of a Git clone and later fetches is
> to use a blobless partial clone and combine that with a sparse-checkout that
> reduces the paths that need to be populated in the working directory. Not
> only does this reduce the cost of clones and fetches, the sparse-checkout
> reduces the number of objects needed to download from a promisor remote.
>
> However, history investigations can be expensie as computing blob diffs will
s/expensie/expensive
Patrick
--------
[verse]
'git backfill' [--min-batch-size=<n>] [--[no-]sparse]
DESCRIPTION

On the Git mailing list, Patrick Steinhardt wrote (reply to this):

On Fri, Dec 06, 2024 at 08:07:15PM +0000, Derrick Stolee via GitGitGadget wrote:
> diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
> index 640144187d3..0e10f066fef 100644
> --- a/Documentation/git-backfill.txt
> +++ b/Documentation/git-backfill.txt
> @@ -14,6 +14,30 @@ SYNOPSIS
> DESCRIPTION
> -----------
>
> +Blobless partial clones are created using `git clone --filter=blob:none`
> +and then configure the local repository such that the Git client avoids
> +downloading blob objects unless they are required for a local operation.
> +This initially means that the clone and later fetches download reachable
> +commits and trees but no blobs. Later operations that change the `HEAD`
> +pointer, such as `git checkout` or `git merge`, may need to download
> +missing blobs in order to complete their operation.
Okay.
> +In the worst cases, commands that compute blob diffs, such as `git blame`,
> +become very slow as they download the missing blobs in single-blob
> +requests to satisfy the missing object as the Git command needs it. This
> +leads to multiple download requests and no ability for the Git server to
> +provide delta compression across those objects.
> +
> +The `git backfill` command provides a way for the user to request that
> +Git downloads the missing blobs (with optional filters) such that the
> +missing blobs representing historical versions of files can be downloaded
> +in batches. The `backfill` command attempts to optimize the request by
> +grouping blobs that appear at the same path, hopefully leading to good
> +delta compression in the packfile sent by the server.
Hm. So we're asking the user to fix a usability issue of git-blame(1),
don't we? Ideally, git-blame(1) itself should know to transparently
batch the blobs it requires to compute its output, shouldn't it? That
usecase alone doesn't yet convince me that git-backfill(1) is a good
idea as I'd think we should rather fix the underlying issue.
So are there other usecases for git-backfill(1)? I can imagine that it
might be helpful in the context of scripts that know they'll operate on
a bunch of blobs.
> diff --git a/builtin/backfill.c b/builtin/backfill.c
> index 38e6aaeaa03..e5f2000d5e0 100644
> --- a/builtin/backfill.c
> +++ b/builtin/backfill.c
> @@ -1,16 +1,116 @@
> #include "builtin.h"
> +#include "git-compat-util.h"
> #include "config.h"
> #include "parse-options.h"
> #include "repository.h"
> +#include "commit.h"
> +#include "hex.h"
> +#include "tree.h"
> +#include "tree-walk.h"
> #include "object.h"
> +#include "object-store-ll.h"
> +#include "oid-array.h"
> +#include "oidset.h"
> +#include "promisor-remote.h"
> +#include "strmap.h"
> +#include "string-list.h"
> +#include "revision.h"
> +#include "trace2.h"
> +#include "progress.h"
> +#include "packfile.h"
> +#include "path-walk.h"
>
> static const char * const builtin_backfill_usage[] = {
> N_("git backfill [<options>]"),
> NULL
> };
>
> +struct backfill_context {
> + struct repository *repo;
> + struct oid_array current_batch;
> + size_t batch_size;
> +};
> +
> +static void clear_backfill_context(struct backfill_context *ctx)
> +{
> + oid_array_clear(&ctx->current_batch);
> +}
Nit: our style guide says that this should rather be
`backfill_context_clear()`.
> +static void download_batch(struct backfill_context *ctx)
> +{
> + promisor_remote_get_direct(ctx->repo,
> + ctx->current_batch.oid,
> + ctx->current_batch.nr);
> + oid_array_clear(&ctx->current_batch);
> +
> + /*
> + * We likely have a new packfile. Add it to the packed list to
> + * avoid possible duplicate downloads of the same objects.
> + */
> + reprepare_packed_git(ctx->repo);
> +}
> +
> +static int fill_missing_blobs(const char *path UNUSED,
> + struct oid_array *list,
> + enum object_type type,
> + void *data)
> +{
> + struct backfill_context *ctx = data;
> +
> + if (type != OBJ_BLOB)
> + return 0;
> +
> + for (size_t i = 0; i < list->nr; i++) {
> + off_t size = 0;
> + struct object_info info = OBJECT_INFO_INIT;
> + info.disk_sizep = &size;
> + if (oid_object_info_extended(ctx->repo,
> + &list->oid[i],
> + &info,
> + OBJECT_INFO_FOR_PREFETCH) ||
> + !size)
> + oid_array_append(&ctx->current_batch, &list->oid[i]);
> + }
> +
> + if (ctx->current_batch.nr >= ctx->batch_size)
> + download_batch(ctx);
Okay, so the batch size is just "best effort". If we walk a tree that
makes us exceed the batch size then we wouldn't issue a fetch during the
tree walk. Is there any specific reason for this behaviour?
In any case, as long as this is properly documented I think this should
be fine in general.
> + return 0;
> +}
> +
> +static int do_backfill(struct backfill_context *ctx)
> +{
> + struct rev_info revs;
> + struct path_walk_info info = PATH_WALK_INFO_INIT;
> + int ret;
> +
> + repo_init_revisions(ctx->repo, &revs, "");
> + handle_revision_arg("HEAD", &revs, 0, 0);
> +
> + info.blobs = 1;
> + info.tags = info.commits = info.trees = 0;
> +
> + info.revs = &revs;
> + info.path_fn = fill_missing_blobs;
> + info.path_fn_data = ctx;
> +
> + ret = walk_objects_by_path(&info);
> +
> + /* Download the objects that did not fill a batch. */
> + if (!ret)
> + download_batch(ctx);
> +
> + clear_backfill_context(ctx);
Are we leaking `revs` and `info`?
> + return ret;
> +}
> +
> int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
> {
> + struct backfill_context ctx = {
> + .repo = repo,
> + .current_batch = OID_ARRAY_INIT,
> + .batch_size = 50000,
> + };
> struct option options[] = {
> OPT_END(),
> };
> @@ -23,7 +123,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
>
> repo_config(repo, git_default_config, NULL);
>
> - die(_("not implemented"));
> -
> - return 0;
> + return do_backfill(&ctx);
> }
The current iteration only backfills blobs as far as I can see. Do we
maybe want to keep the door open for future changes in git-backfill(1)
by implementing this via a "blob" subcommand?
Patrick

On the Git mailing list, Derrick Stolee wrote (reply to this):

On 12/16/24 3:01 AM, Patrick Steinhardt wrote:
> On Fri, Dec 06, 2024 at 08:07:15PM +0000, Derrick Stolee via GitGitGadget wrote:
>> +The `git backfill` command provides a way for the user to request that
>> +Git downloads the missing blobs (with optional filters) such that the
>> +missing blobs representing historical versions of files can be downloaded
>> +in batches. The `backfill` command attempts to optimize the request by
>> +grouping blobs that appear at the same path, hopefully leading to good
>> +delta compression in the packfile sent by the server.
>
> Hm. So we're asking the user to fix a usability issue of git-blame(1),
> don't we? Ideally, git-blame(1) itself should know to transparently
> batch the blobs it requires to compute its output, shouldn't it? That
> usecase alone doesn't yet convince me that git-backfill(1) is a good
> idea as I'd think we should rather fix the underlying issue.
I've looked into making this change for 'git blame' and it is a
nontrivial change. I'm not sure on the timeline that we could expect
'git blame' to be improved.
But you're right that this is not enough justification on its own. It's
an example of a kind of command that has these problems, including 'git
log [-p|-L]'.
> So are there other usecases for git-backfill(1)? I can imagine that it
> might be helpful in the context of scripts that know they'll operate on
> a bunch of blobs.
One major motivation is that it can essentially provide a way to do
resumable clones: get the commits and trees in one go with a blobless
clone, then download the blobs in batches. If something interrupts the
'git backfill' command, then restarting it will only repeat the most
recent batch.
Finally, in a later patch we can see that the --sparse option allows a
user to operate as if they have a full clone but where they only include
blob data within their sparse-checkout, providing reduced disk space and
network time to get to that state.
>> + if (ctx->current_batch.nr >= ctx->batch_size)
>> + download_batch(ctx);
>
> Okay, so the batch size is just "best effort". If we walk a tree that
> makes us exceed the batch size then we wouldn't issue a fetch during the
> tree walk. Is there any specific reason for this behaviour?
>
> In any case, as long as this is properly documented I think this should
> be fine in general.
The main reason is that we expect the server to return a packfile that
has many delta relationships within the objects at a given path. If we
split the batch in the middle of a path batch, then we are artificially
introducing breaks in the delta chains that could be wasteful.
However, this batching pattern could be problematic if there are a
million versions of a single file and the batch is too large to download
efficiently. This "absolute max batch size" is currently left as a
future extension.
>> + clear_backfill_context(ctx);
>
> Are we leaking `revs` and `info`?
At the moment. Will fix.
>> + return ret;
>> +}
>> +
>> int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
>> {
>> + struct backfill_context ctx = {
>> + .repo = repo,
>> + .current_batch = OID_ARRAY_INIT,
>> + .batch_size = 50000,
>> + };
>> struct option options[] = {
>> OPT_END(),
>> };
>> @@ -23,7 +123,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
>>
>> repo_config(repo, git_default_config, NULL);
>>
>> - die(_("not implemented"));
>> -
>> - return 0;
>> + return do_backfill(&ctx);
>> }
>
> The current iteration only backfills blobs as far as I can see. Do we
> maybe want to keep the door open for future changes in git-backfill(1)
> by implementing this via a "blob" subcommand?
I think that one tricky part is to ask "what could be missing?". With
Git's partial clone, it seems that we could have treeless or depth-
based tree restrictions. Technically, there could also be clones that
restrict to a set of sparse-checkout patterns, but I'm not aware of any
server that will respect those kinds of clones. At the moment, the tree
walk would fault in any missing trees as they are seen, but this is
extremely inefficient.
I think that the path-walk API could be adjusted to be more careful to
check for the existence of an object before automatically loading it.
That would allow for batched downloads of missing trees, though a
second scan would be required to get the next layer of objects.
I'm not sure a subcommand is the right way to solve for this potential
future, but instead later we could adjust the logic to be better for
these treeless or tree-restricted clones.
Thanks,
-Stolee
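To make the resumable-clone pattern described in this thread concrete, a rough sketch of such a session follows (the repository URL is a placeholder; the comments describe intended behavior, not captured output):

	# Download reachable commits and trees, but no blobs.
	git clone --filter=blob:none https://example.com/large-repo.git
	cd large-repo
	# Download the missing historical blobs in batches. If this is
	# interrupted, rerunning repeats at most the current batch, since
	# completed batches already reside in local packfiles.
	git backfill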
-----------

Blobless partial clones are created using `git clone --filter=blob:none`,
which then configures the local repository such that the Git client avoids
downloading blob objects unless they are required for a local operation.
This initially means that the clone and later fetches download reachable
commits and trees but no blobs. Later operations that change the `HEAD`
pointer, such as `git checkout` or `git merge`, may need to download
missing blobs in order to complete their operation.

In the worst cases, commands that compute blob diffs, such as `git blame`,
become very slow as they download each missing blob in a single-blob
request at the moment the Git command needs it. This leads to many
download requests and no ability for the Git server to provide delta
compression across those objects.

The `git backfill` command provides a way for the user to request that
Git download the missing blobs (with optional filters) such that the
missing blobs representing historical versions of files can be downloaded
in batches. The `backfill` command attempts to optimize the request by
grouping blobs that appear at the same path, hopefully leading to good
delta compression in the packfile sent by the server.

In this way, `git backfill` provides a mechanism to break a large clone
into smaller chunks. Starting with a blobless partial clone with `git
clone --filter=blob:none` and then running `git backfill` in the local
repository provides a way to download all reachable objects in several
smaller network calls rather than downloading the entire repository at
clone time.

By default, `git backfill` downloads all blobs reachable from the `HEAD`
commit. This set can be restricted or expanded using various options.

OPTIONS
-------

--min-batch-size=<n>::
	Specify a minimum size for a batch of missing objects to request
	from the server. This size may be exceeded by the last set of
	blobs seen at a given path. The default minimum batch size is
	50,000.

--[no-]sparse::
	Only download objects if they appear at a path that matches the
	current sparse-checkout. If the sparse-checkout feature is enabled,
	then `--sparse` is assumed and can be disabled with `--no-sparse`.

SEE ALSO
--------
linkgit:git-clone[1].

GIT
---
Part of the linkgit:git[1] suite
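A brief usage sketch of the options documented above (the repository URL is a placeholder, and the batch size is chosen purely for illustration):

	git clone --filter=blob:none https://example.com/repo.git
	cd repo
	# Request batches of at least 10,000 missing blobs per fetch
	# instead of the default minimum of 50,000.
	git backfill --min-batch-size=10000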
builtin/backfill.c
@@ -0,0 +1,151 @@
/* We need this macro to access core_apply_sparse_checkout */
#define USE_THE_REPOSITORY_VARIABLE

#include "builtin.h"
#include "git-compat-util.h"
#include "config.h"
#include "parse-options.h"
#include "repository.h"
#include "commit.h"
#include "dir.h"
#include "environment.h"
#include "hex.h"
#include "tree.h"
#include "tree-walk.h"
#include "object.h"
#include "object-store-ll.h"
#include "oid-array.h"
#include "oidset.h"
#include "promisor-remote.h"
#include "strmap.h"
#include "string-list.h"
#include "revision.h"
#include "trace2.h"
#include "progress.h"
#include "packfile.h"
#include "path-walk.h"

static const char * const builtin_backfill_usage[] = {
	N_("git backfill [--min-batch-size=<n>] [--[no-]sparse]"),
	NULL
};

struct backfill_context {
	struct repository *repo;
	struct oid_array current_batch;
	size_t min_batch_size;
	int sparse;
};

static void backfill_context_clear(struct backfill_context *ctx)
{
	oid_array_clear(&ctx->current_batch);
}

static void download_batch(struct backfill_context *ctx)
{
	promisor_remote_get_direct(ctx->repo,
				   ctx->current_batch.oid,
				   ctx->current_batch.nr);
	oid_array_clear(&ctx->current_batch);

	/*
	 * We likely have a new packfile. Add it to the packed list to
	 * avoid possible duplicate downloads of the same objects.
	 */
	reprepare_packed_git(ctx->repo);
}

static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;

	if (type != OBJ_BLOB)
		return 0;

	for (size_t i = 0; i < list->nr; i++) {
		off_t size = 0;
		struct object_info info = OBJECT_INFO_INIT;
		info.disk_sizep = &size;
		/*
		 * A failed lookup or zero on-disk size means the blob is
		 * missing locally, so queue it for the next batched fetch.
		 */
		if (oid_object_info_extended(ctx->repo,
					     &list->oid[i],
					     &info,
					     OBJECT_INFO_FOR_PREFETCH) ||
		    !size)
			oid_array_append(&ctx->current_batch, &list->oid[i]);
	}

	if (ctx->current_batch.nr >= ctx->min_batch_size)
		download_batch(ctx);

	return 0;
}

static int do_backfill(struct backfill_context *ctx)
{
	struct rev_info revs;
	struct path_walk_info info = PATH_WALK_INFO_INIT;
	int ret;

	if (ctx->sparse) {
		CALLOC_ARRAY(info.pl, 1);
		if (get_sparse_checkout_patterns(info.pl)) {
			path_walk_info_clear(&info);
			return error(_("problem loading sparse-checkout"));
		}
	}

	repo_init_revisions(ctx->repo, &revs, "");
	handle_revision_arg("HEAD", &revs, 0, 0);

	info.blobs = 1;
	info.tags = info.commits = info.trees = 0;

	info.revs = &revs;
	info.path_fn = fill_missing_blobs;
	info.path_fn_data = ctx;

	ret = walk_objects_by_path(&info);

	/* Download the objects that did not fill a batch. */
	if (!ret)
		download_batch(ctx);

	backfill_context_clear(ctx);
	path_walk_info_clear(&info);
	release_revisions(&revs);
	return ret;
}

int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
{
	struct backfill_context ctx = {
		.repo = repo,
		.current_batch = OID_ARRAY_INIT,
		.min_batch_size = 50000,
		/* -1 means "unset", resolved from sparse-checkout config below. */
		.sparse = -1,
	};
	struct option options[] = {
		OPT_INTEGER(0, "min-batch-size", &ctx.min_batch_size,
			    N_("Minimum number of objects to request at a time")),
		OPT_BOOL(0, "sparse", &ctx.sparse,
			 N_("Restrict the missing objects to the current sparse-checkout")),
		OPT_END(),
	};

	if (argc == 2 && !strcmp(argv[1], "-h"))
		usage_with_options(builtin_backfill_usage, options);

	argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage,
			     0);

	repo_config(repo, git_default_config, NULL);

	if (ctx.sparse < 0)
		ctx.sparse = core_apply_sparse_checkout;

	return do_backfill(&ctx);
}
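Given the `core_apply_sparse_checkout` fallback above, a sketch of a session that backfills only blobs inside the sparse-checkout definition might look like this (the URL, branch name, and paths are hypothetical):

	git clone --filter=blob:none --no-checkout https://example.com/repo.git
	cd repo
	git sparse-checkout set src docs
	git checkout main
	# sparse-checkout is enabled, so --sparse is assumed by the
	# config fallback; pass --no-sparse to fetch blobs for all paths.
	git backfill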