Re: [PATCH 0/7] RFC: Accelerate xdiff and begin its rustification

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 18/07/2025 15:38, Junio C Hamano wrote:
"Ezekiel Newren via GitGitGadget" <gitgitgadget@xxxxxxxxx> writes:

This series accelerates xdiff by 5-19%.

;-)

Do we know how much of that can be attributed to the hash algorithm
difference, and how much for languages?

That's an interesting question. The two patches below [1] switch
xdiff to use xxhash from libxxhash. On my computer the rust and
C implementations both speed up "git log --oneline --shortstat"
by 15%. Just over half of that seems to come from hoisting the
check for whitespace flags in xdl_hash_record() out of the loop
in xdl_prepare_ctx() and the rest comes from the change in hash
function. As I understand it the hash is implemented using SIMD
compiler intrinsics and the rust implementation is basically a
copy of the C code in libxxhash. I wonder how well xxhash performs
compared to our existing hash on platforms without an optimized
implementation.

Thanks

Phillip

[1] These patches are available in the xdiff-hashing-experiments
    branch at https://github.com/phillipwood/git

---- 8< ----
From 06e7abdcfb9fc3f143ef84644966d6fce128d8ae Mon Sep 17 00:00:00 2001
From: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
Date: Sat, 19 Jul 2025 10:58:48 +0100
Subject: [PATCH 1/2] xdiff: refactor xdl_hash_record()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inline the check for whitespace flags so that the compiler can hoist
it out of the loop in xdl_prepare_ctx(). This improves the performance
by 8%.

$ hyperfine --warmup=1 -L rev HEAD,HEAD^  --setup='git checkout {rev} -- :/ && make git' ': {rev}; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Benchmark 1: : HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.670 s ┬▒  0.044 s    [User: 1.473 s, System: 0.196 s]
  Range (min  max):    1.619 s   1.754 s    10 runs

Benchmark 2: : HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.801 s ┬▒  0.021 s    [User: 1.605 s, System: 0.192 s]
  Range (min  max):    1.766 s   1.831 s    10 runs

Summary
  ': HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' ran
    1.08 ┬▒ 0.03 times faster than ': HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'

Signed-off-by: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
---
 xdiff/xutils.c |  7 ++-----
 xdiff/xutils.h | 10 +++++++++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index 444a108f87..e070ed649f 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -249,7 +249,7 @@ int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags)
 	return 1;
 }
-static unsigned long xdl_hash_record_with_whitespace(char const **data,
+unsigned long xdl_hash_record_with_whitespace(char const **data,
 		char const *top, long flags) {
 	unsigned long ha = 5381;
 	char const *ptr = *data;
@@ -294,13 +294,10 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data,
 	return ha;
 }
-unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
+unsigned long xdl_hash_record_verbatim(char const **data, char const *top) {
 	unsigned long ha = 5381;
 	char const *ptr = *data;
- if (flags & XDF_WHITESPACE_FLAGS)
-		return xdl_hash_record_with_whitespace(data, top, flags);
-
 	for (; ptr < top && *ptr != '\n'; ptr++) {
 		ha += (ha << 5);
 		ha ^= (unsigned long) *ptr;
diff --git a/xdiff/xutils.h b/xdiff/xutils.h
index fd0bba94e8..13f6831047 100644
--- a/xdiff/xutils.h
+++ b/xdiff/xutils.h
@@ -34,7 +34,15 @@ void *xdl_cha_alloc(chastore_t *cha);
 long xdl_guess_lines(mmfile_t *mf, long sample);
 int xdl_blankline(const char *line, long size, long flags);
 int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
-unsigned long xdl_hash_record(char const **data, char const *top, long flags);
+unsigned long xdl_hash_record_verbatim(char const **data, char const *top);
+unsigned long xdl_hash_record_with_whitespace(char const **data, char const *top, long flags);
+static inline unsigned long xdl_hash_record(char const **data, char const *top, long flags)
+{
+	if (flags & XDF_WHITESPACE_FLAGS)
+		return xdl_hash_record_with_whitespace(data, top, flags);
+	else
+		return xdl_hash_record_verbatim(data, top);
+}
 unsigned int xdl_hashbits(unsigned int size);
 int xdl_num_out(char *out, long val);
 int xdl_emit_hunk_hdr(long s1, long c1, long s2, long c2,
--
2.49.0.897.gfad3eb7d21


From 16f3b26624dc17002f3e507cd1e260deadfe1de8 Mon Sep 17 00:00:00 2001
From: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
Date: Sat, 19 Jul 2025 14:52:48 +0100
Subject: [PATCH 2/2] xdiff: use xxhash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using XXH3_64bits() from libxxhash to hash the input lines improves
the performance by about 6% and equals the performance of using
xxhash-rust.

$ hyperfine --warmup=1 -L rev en/xdiff-rust/v1,HEAD,HEAD^,HEAD^^  --setup='git checkout {rev} -- :/ && make git' ': {rev}; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
Benchmark 1: : en/xdiff-rust/v1; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.575 s ┬▒  0.032 s    [User: 1.406 s, System: 0.168 s]
  Range (min  max):    1.541 s   1.651 s    10 runs

Benchmark 2: : HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.569 s ┬▒  0.018 s    [User: 1.382 s, System: 0.185 s]
  Range (min  max):    1.546 s   1.596 s    10 runs

Benchmark 3: : HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.661 s ┬▒  0.026 s    [User: 1.475 s, System: 0.186 s]
  Range (min  max):    1.630 s   1.696 s    10 runs

Benchmark 4: : HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0
  Time (mean ┬▒ ¤â):      1.800 s ┬▒  0.023 s    [User: 1.611 s, System: 0.187 s]
  Range (min  max):    1.772 s   1.837 s    10 runs

Summary
  ': HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' ran
    1.00 ┬▒ 0.02 times faster than ': en/xdiff-rust/v1; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
    1.06 ┬▒ 0.02 times faster than ': HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'
    1.15 ┬▒ 0.02 times faster than ': HEAD^^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0'

Signed-off-by: Phillip Wood <phillip.wood@xxxxxxxxxxxxx>
---
 Makefile       |  1 +
 xdiff/xutils.c | 14 ++++++--------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 5f7dd79dfa..6de7ccdf3b 100644
--- a/Makefile
+++ b/Makefile
@@ -1390,6 +1390,7 @@ UNIT_TEST_OBJS += $(UNIT_TEST_DIR)/lib-reftable.o
 # xdiff and reftable libs may in turn depend on what is in libgit.a
 GITLIBS = common-main.o $(LIB_FILE) $(XDIFF_LIB) $(REFTABLE_LIB) $(LIB_FILE)
 EXTLIBS =
+EXTLIBS += -lxxhash
GIT_USER_AGENT = git/$(GIT_VERSION) diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index e070ed649f..43fce4b5b1 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -21,7 +21,7 @@
  */
#include "xinclude.h"
-
+#include <xxhash.h>
long xdl_bogosqrt(long n) {
 	long i;
@@ -295,14 +295,12 @@ unsigned long xdl_hash_record_with_whitespace(char const **data,
 }
unsigned long xdl_hash_record_verbatim(char const **data, char const *top) {
-	unsigned long ha = 5381;
-	char const *ptr = *data;
+	long ha;
+	char const *eol = memchr(*data, '\n', top - *data);
+	size_t len = (eol ? eol : top) - *data;
- for (; ptr < top && *ptr != '\n'; ptr++) {
-		ha += (ha << 5);
-		ha ^= (unsigned long) *ptr;
-	}
-	*data = ptr < top ? ptr + 1: ptr;
+	ha = XXH3_64bits(*data, len);
+	*data += len + !!eol;
return ha;
 }
--
2.49.0.897.gfad3eb7d21






[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux