[PATCH RFC v3 3/3] last-modified: use Bloom filters when available

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Our 'git last-modified' performs a revision walk, and computes a diff at
each point in the walk to figure out whether a given revision changed
any of the paths it considers interesting.

When changed-path Bloom filters are available, we can avoid computing
many such diffs. Before computing a diff, we first check if any of the
remaining paths of interest were possibly changed at a given commit by
consulting its Bloom filter. If any of them are, we are resigned to
compute the diff.

If none of those queries returned "maybe", we know that the given commit
doesn't contain any changed paths which are interesting to us. So, we
can avoid computing it in this case.

Comparing the perf test results on git.git:

Test                                        HEAD~             HEAD
------------------------------------------------------------------------------------
8020.1: top-level last-modified             4.49(4.34+0.11)   2.22(2.05+0.09) -50.6%
8020.2: top-level recursive last-modified   5.64(5.45+0.11)   5.62(5.30+0.11) -0.4%
8020.3: subdir last-modified                0.11(0.06+0.04)   0.07(0.03+0.04) -36.4%

Based-on-patch-by: Taylor Blau <me@xxxxxxxxxxxx>
Signed-off-by: Toon Claes <toon@xxxxxxxxx>
---
 last-modified.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/last-modified.c b/last-modified.c
index 4904d00d2a..2097894c6e 100644
--- a/last-modified.c
+++ b/last-modified.c
@@ -1,7 +1,10 @@
 #include "git-compat-util.h"
+#include "bloom.h"
+#include "commit-graph.h"
 #include "commit.h"
 #include "diff.h"
 #include "diffcore.h"
+#include "dir.h"
 #include "last-modified.h"
 #include "log-tree.h"
 #include "object.h"
@@ -11,6 +14,7 @@
 struct last_modified_entry {
 	struct hashmap_entry hashent;
 	struct object_id oid;
+	struct bloom_key key;
 	const char path[FLEX_ARRAY];
 };
 
@@ -27,6 +31,9 @@ static void add_path_from_diff(struct diff_queue_struct *q,
 
 		FLEX_ALLOC_STR(ent, path, path);
 		oidcpy(&ent->oid, &p->two->oid);
+		if (lm->rev.bloom_filter_settings)
+			fill_bloom_key(path, strlen(path), &ent->key,
+				       lm->rev.bloom_filter_settings);
 		hashmap_entry_init(&ent->hashent, strhash(ent->path));
 		hashmap_add(&lm->paths, &ent->hashent);
 	}
@@ -94,6 +101,13 @@ int last_modified_init(struct last_modified *lm,
 	if (setup_revisions(argc, argv, &lm->rev, NULL) > 1)
 		return error(_("unknown last-modified argument: %s"), argv[1]);
 
+	/*
+	 * We're not interested in generation numbers here,
+	 * but calling this function to prepare the commit-graph.
+	 */
+	(void)generation_numbers_enabled(lm->rev.repo);
+	lm->rev.bloom_filter_settings = get_bloom_filter_settings(lm->rev.repo);
+
 	if (populate_paths_from_revs(lm) < 0)
 		return error(_("unable to setup last-modified"));
 
@@ -102,6 +116,12 @@ int last_modified_init(struct last_modified *lm,
 
 void last_modified_release(struct last_modified *lm)
 {
+	struct hashmap_iter iter;
+	struct last_modified_entry *ent;
+
+	hashmap_for_each_entry(&lm->paths, &iter, ent, hashent)
+		clear_bloom_key(&ent->key);
+
 	hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent);
 	release_revisions(&lm->rev);
 }
@@ -136,6 +156,7 @@ static void mark_path(const char *path, const struct object_id *oid,
 		data->callback(path, data->commit, data->callback_data);
 
 	hashmap_remove(data->paths, &ent->hashent, path);
+	clear_bloom_key(&ent->key);
 	free(ent);
 }
 
@@ -179,6 +200,27 @@ static void last_modified_diff(struct diff_queue_struct *q,
 	}
 }
 
+static int maybe_changed_path(struct last_modified *lm, struct commit *origin)
+{
+	struct bloom_filter *filter;
+	struct last_modified_entry *ent;
+	struct hashmap_iter iter;
+
+	if (!lm->rev.bloom_filter_settings)
+		return 1;
+
+	filter = get_bloom_filter(lm->rev.repo, origin);
+	if (!filter)
+		return 1;
+
+	hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) {
+		if (bloom_filter_contains(filter, &ent->key,
+					  lm->rev.bloom_filter_settings))
+			return 1;
+	}
+	return 0;
+}
+
 int last_modified_run(struct last_modified *lm, last_modified_callback cb, void *cbdata)
 {
 	struct last_modified_callback_data data;
@@ -198,6 +240,9 @@ int last_modified_run(struct last_modified *lm, last_modified_callback cb, void
 		if (!data.commit)
 			break;
 
+		if (!maybe_changed_path(lm, data.commit))
+			continue;
+
 		if (data.commit->object.flags & BOUNDARY) {
 			diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
 				       &data.commit->object.oid,

-- 
2.50.0.rc0.18.gfcfe60668e





[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux