Re: [PATCH 5/7] xdiff: separate parsing lines from hashing them

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Ezekiel

On 17/07/2025 21:32, Ezekiel Newren via GitGitGadget wrote:
From: Ezekiel Newren <ezekielnewren@xxxxxxxxx>

We want to use xxhash for faster hashing. To facilitate that
and to simplify the code. Separate the concerns of parsing
and hashing into discrete steps. This makes swapping the hash
function much easier. Since xdl_hash_record() both parses and
hashses lines, this requires some slight code restructuring.

That makes sense though unfortunately we seem to have lost some error handling in the restructuring. How much does this extra pass over the input data slow down the cases that don't end up using xxhash?

Signed-off-by: Ezekiel Newren <ezekielnewren@xxxxxxxxx>
---
  xdiff/xprepare.c | 75 ++++++++++++++++++++++++++++--------------------
  1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index 747268e4fdf7..c44005e9bbb8 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -129,13 +129,39 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
  }
+static void xdl_parse_lines(mmfile_t *mf, long narec, xdfile_t *xdf) {
+	u8 const* ptr = (u8 const*) mf->ptr;
+	usize len = (usize) mf->size;
+
+	xdf->recs = NULL;
+	xdf->nrec = 0;
+	XDL_ALLOC_ARRAY(xdf->recs, narec);

This should return error if the allocation fails like the original code. Although that does not make any difference for git a number of other projects such as libgit2 carry a copy of our xdiff code and want to be able to handle allocation failures.

+	while (len > 0) {
+		xrecord_t *rec = NULL;
+		usize length;
+		u8 const* result = memchr(ptr, '\n', len);
+		if (result) {
+			length = result - ptr + 1;
+		} else {
+			length = len;
+		}
+		if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec))
+			die("XDL_ALLOC_GROW failed");

We should return an error rather than dying here

+		rec = xdl_cha_alloc(&xdf->rcha);

We should return an error if the call fails like the original code

+		rec->ptr = ptr;
+		rec->size = length;
+		rec->ha = 0;
+		xdf->recs[xdf->nrec++] = rec;
+		ptr += length;
+		len -= length;
+	}
+
+}
+
+
  static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
  			   xdlclassifier_t *cf, xdfile_t *xdf) {
-	long nrec, bsize;
-	unsigned long hav;
-	char const *blk, *cur, *top, *prev;
-	xrecord_t *crec;
-	xrecord_t **recs;
  	unsigned long *ha;
  	char *rchg;
  	long *rindex;
@@ -143,50 +169,37 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
  	ha = NULL;
  	rindex = NULL;
  	rchg = NULL;
-	recs = NULL;
if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0)
  		goto abort;
-	if (!XDL_ALLOC_ARRAY(recs, narec))
-		goto abort;
- nrec = 0;
-	if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
-		for (top = blk + bsize; cur < top; ) {
-			prev = cur;
-			hav = xdl_hash_record(&cur, top, xpp->flags);
-			if (XDL_ALLOC_GROW(recs, nrec + 1, narec))
-				goto abort;
-			if (!(crec = xdl_cha_alloc(&xdf->rcha)))
-				goto abort;
-			crec->ptr = (u8 const*) prev;
-			crec->size = (long) (cur - prev);
-			crec->ha = hav;
-			recs[nrec++] = crec;
-			if (xdl_classify_record(pass, cf, crec) < 0)
-				goto abort;
-		}
+	xdl_parse_lines(mf, narec, xdf);
+
+	for (usize i = 0; i < (usize) xdf->nrec; i++) {
+		xrecord_t *rec = xdf->recs[i];
+		char const* dump = (char const*) rec->ptr;
+		rec->ha = xdl_hash_record(&dump, (char const*) (rec->ptr + rec->size), xpp->flags);

I think we should update xdl_hash_record() to stop updating dump and use the length from xdl_parse_lines(). Now that we parse the lines before hashing we should use that length in the hash function so we have a single definition of line length.

+		xdl_classify_record(pass, cf, rec);

We should return an error if this call fails like the original code

Thanks

Phillip

  	}
- if (!XDL_CALLOC_ARRAY(rchg, nrec + 2))
+
+	if (!XDL_CALLOC_ARRAY(rchg, xdf->nrec + 2))
  		goto abort;
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
  	    (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
-		if (!XDL_ALLOC_ARRAY(rindex, nrec + 1))
+		if (!XDL_ALLOC_ARRAY(rindex, xdf->nrec + 1))
  			goto abort;
-		if (!XDL_ALLOC_ARRAY(ha, nrec + 1))
+		if (!XDL_ALLOC_ARRAY(ha, xdf->nrec + 1))
  			goto abort;
  	}
- xdf->nrec = nrec;
-	xdf->recs = recs;
  	xdf->rchg = rchg + 1;
  	xdf->rindex = rindex;
  	xdf->nreff = 0;
  	xdf->ha = ha;
  	xdf->dstart = 0;
-	xdf->dend = nrec - 1;
+	xdf->dend = xdf->nrec - 1;
return 0; @@ -194,7 +207,7 @@ abort:
  	xdl_free(ha);
  	xdl_free(rindex);
  	xdl_free(rchg);
-	xdl_free(recs);
+	xdl_free(xdf->recs);
  	xdl_cha_free(&xdf->rcha);
  	return -1;
  }





[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux