On 18/07/2025 15:38, Junio C Hamano wrote:
"Ezekiel Newren via GitGitGadget" <gitgitgadget@xxxxxxxxx> writes:
This series accelerates xdiff by 5-19%.
;-)
Do we know how much of that can be attributed to the hash algorithm
difference, and how much for languages?
That's an interesting question. The two patches below [1] switch
xdiff to use xxhash from libxxhash. On my computer the rust and
C implementations both speed up "git log --oneline --shortstat"
by 15%. Just over half of that seems to come from hoisting the
check for whitespace flags in xdl_hash_record() out of the loop
in xdl_prepare_ctx() and the rest comes from the change in hash
function. As I understand it the hash is implemented using SIMD
compiler intrinsics and the rust implementation is basically a
copy of the C code in libxxhash. I wonder how well xxhash performs
compared to our existing hash on platforms without an optimized
implementation.
Thanks
Phillip
[1] These patches are available in the xdiff-hashing-experiments
branch at https://github.com/phillipwood/git
---- 8< ----
From 06e7abdcfb9fc3f143ef84644966d6fce128d8ae Mon Sep 17 00:00:00 2001
From: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
Date: Sat, 19 Jul 2025 10:58:48 +0100
Subject: [PATCH 1/2] xdiff: refactor xdl_hash_record()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Inline the check for whitespace flags so that the compiler can hoist
it out of the loop in xdl_prepare_ctx(). This improves the performance
by 8%.
$ hyperfine --warmup=1 -L rev HEAD,HEAD^ --setup='git checkout {rev} -- :/ && make git' ': {rev}; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Benchmark 1: : HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.670 s ┬▒ 0.044 s [User: 1.473 s, System: 0.196 s]
Range (min  max): 1.619 s  1.754 s 10 runs
Benchmark 2: : HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.801 s ┬▒ 0.021 s [User: 1.605 s, System: 0.192 s]
Range (min  max): 1.766 s  1.831 s 10 runs
Summary
': HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' ran
1.08 ┬▒ 0.03 times faster than ': HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Signed-off-by: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
---
xdiff/xutils.c | 7 ++-----
xdiff/xutils.h | 10 +++++++++-
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index 444a108f87..e070ed649f 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -249,7 +249,7 @@ int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags)
return 1;
}
-static unsigned long xdl_hash_record_with_whitespace(char const **data,
+unsigned long xdl_hash_record_with_whitespace(char const **data,
char const *top, long flags) {
unsigned long ha = 5381;
char const *ptr = *data;
@@ -294,13 +294,10 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data,
return ha;
}
-unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
+unsigned long xdl_hash_record_verbatim(char const **data, char const *top) {
unsigned long ha = 5381;
char const *ptr = *data;
- if (flags & XDF_WHITESPACE_FLAGS)
- return xdl_hash_record_with_whitespace(data, top, flags);
-
for (; ptr < top && *ptr != '\n'; ptr++) {
ha += (ha << 5);
ha ^= (unsigned long) *ptr;
diff --git a/xdiff/xutils.h b/xdiff/xutils.h
index fd0bba94e8..13f6831047 100644
--- a/xdiff/xutils.h
+++ b/xdiff/xutils.h
@@ -34,7 +34,15 @@ void *xdl_cha_alloc(chastore_t *cha);
long xdl_guess_lines(mmfile_t *mf, long sample);
int xdl_blankline(const char *line, long size, long flags);
int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
-unsigned long xdl_hash_record(char const **data, char const *top, long flags);
+unsigned long xdl_hash_record_verbatim(char const **data, char const *top);
+unsigned long xdl_hash_record_with_whitespace(char const **data, char const *top, long flags);
+static inline unsigned long xdl_hash_record(char const **data, char const *top, long flags)
+{
+ if (flags & XDF_WHITESPACE_FLAGS)
+ return xdl_hash_record_with_whitespace(data, top, flags);
+ else
+ return xdl_hash_record_verbatim(data, top);
+}
unsigned int xdl_hashbits(unsigned int size);
int xdl_num_out(char *out, long val);
int xdl_emit_hunk_hdr(long s1, long c1, long s2, long c2,
--
2.49.0.897.gfad3eb7d21
From 16f3b26624dc17002f3e507cd1e260deadfe1de8 Mon Sep 17 00:00:00 2001
From: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
Date: Sat, 19 Jul 2025 14:52:48 +0100
Subject: [PATCH 2/2] xdiff: use xxhash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Using XXH3_64bits() from libxxhash to hash the input lines improves
the performance by about 6% and equals the performance of using
xxhash-rust.
$ hyperfine --warmup=1 -L rev en/xdiff-rust/v1,HEAD,HEAD^,HEAD^^ --setup='git checkout {rev} -- :/ && make git' ': {rev}; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Benchmark 1: : en/xdiff-rust/v1; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.575 s ┬▒ 0.032 s [User: 1.406 s, System: 0.168 s]
Range (min  max): 1.541 s  1.651 s 10 runs
Benchmark 2: : HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.569 s ┬▒ 0.018 s [User: 1.382 s, System: 0.185 s]
Range (min  max): 1.546 s  1.596 s 10 runs
Benchmark 3: : HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.661 s ┬▒ 0.026 s [User: 1.475 s, System: 0.186 s]
Range (min  max): 1.630 s  1.696 s 10 runs
Benchmark 4: : HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
Time (mean ┬▒ ¤â): 1.800 s ┬▒ 0.023 s [User: 1.611 s, System: 0.187 s]
Range (min  max): 1.772 s  1.837 s 10 runs
Summary
': HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' ran
1.00 ┬▒ 0.02 times faster than ': en/xdiff-rust/v1; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
1.06 ┬▒ 0.02 times faster than ': HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
1.15 ┬▒ 0.02 times faster than ': HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Signed-off-by: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
---
Makefile | 1 +
xdiff/xutils.c | 14 ++++++--------
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index 5f7dd79dfa..6de7ccdf3b 100644
--- a/Makefile
+++ b/Makefile
@@ -1390,6 +1390,7 @@ UNIT_TEST_OBJS += $(UNIT_TEST_DIR)/lib-reftable.o
# xdiff and reftable libs may in turn depend on what is in libgit.a
GITLIBS = common-main.o $(LIB_FILE) $(XDIFF_LIB) $(REFTABLE_LIB) $(LIB_FILE)
EXTLIBS =
+EXTLIBS += -lxxhash
GIT_USER_AGENT = git/$(GIT_VERSION)
diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index e070ed649f..43fce4b5b1 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -21,7 +21,7 @@
*/
#include "xinclude.h"
-
+#include <xxhash.h>
long xdl_bogosqrt(long n) {
long i;
@@ -295,14 +295,12 @@ unsigned long xdl_hash_record_with_whitespace(char const **data,
}
unsigned long xdl_hash_record_verbatim(char const **data, char const *top) {
- unsigned long ha = 5381;
- char const *ptr = *data;
+ long ha;
+ char const *eol = memchr(*data, '\n', top - *data);
+ size_t len = (eol ? eol : top) - *data;
- for (; ptr < top && *ptr != '\n'; ptr++) {
- ha += (ha << 5);
- ha ^= (unsigned long) *ptr;
- }
- *data = ptr < top ? ptr + 1: ptr;
+ ha = XXH3_64bits(*data, len);
+ *data += len + !!eol;
return ha;
}
--
2.49.0.897.gfad3eb7d21