Hi Ezekiel
On 17/07/2025 21:32, Ezekiel Newren via GitGitGadget wrote:
From: Ezekiel Newren <ezekielnewren@xxxxxxxxx>
We want to use xxhash for faster hashing. To facilitate that
and to simplify the code. Separate the concerns of parsing
and hashing into discrete steps. This makes swapping the hash
function much easier. Since xdl_hash_record() both parses and
hashses lines, this requires some slight code restructuring.
That makes sense though unfortunately we seem to have lost some error
handling in the restructuring. How much does this extra pass over the
input data slow down the cases that don't end up using xxhash?
Signed-off-by: Ezekiel Newren <ezekielnewren@xxxxxxxxx>
---
xdiff/xprepare.c | 75 ++++++++++++++++++++++++++++--------------------
1 file changed, 44 insertions(+), 31 deletions(-)
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index 747268e4fdf7..c44005e9bbb8 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -129,13 +129,39 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t
}
+static void xdl_parse_lines(mmfile_t *mf, long narec, xdfile_t *xdf) {
+ u8 const* ptr = (u8 const*) mf->ptr;
+ usize len = (usize) mf->size;
+
+ xdf->recs = NULL;
+ xdf->nrec = 0;
+ XDL_ALLOC_ARRAY(xdf->recs, narec);
This should return error if the allocation fails like the original code.
Although that does not make any difference for git a number of other
projects such as libgit2 carry a copy of our xdiff code and want to be
able to handle allocation failures.
+ while (len > 0) {
+ xrecord_t *rec = NULL;
+ usize length;
+ u8 const* result = memchr(ptr, '\n', len);
+ if (result) {
+ length = result - ptr + 1;
+ } else {
+ length = len;
+ }
+ if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec))
+ die("XDL_ALLOC_GROW failed");
We should return an error rather than dying here
+ rec = xdl_cha_alloc(&xdf->rcha);
We should return an error if the call fails like the original code
+ rec->ptr = ptr;
+ rec->size = length;
+ rec->ha = 0;
+ xdf->recs[xdf->nrec++] = rec;
+ ptr += length;
+ len -= length;
+ }
+
+}
+
+
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
xdlclassifier_t *cf, xdfile_t *xdf) {
- long nrec, bsize;
- unsigned long hav;
- char const *blk, *cur, *top, *prev;
- xrecord_t *crec;
- xrecord_t **recs;
unsigned long *ha;
char *rchg;
long *rindex;
@@ -143,50 +169,37 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
ha = NULL;
rindex = NULL;
rchg = NULL;
- recs = NULL;
if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0)
goto abort;
- if (!XDL_ALLOC_ARRAY(recs, narec))
- goto abort;
- nrec = 0;
- if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
- for (top = blk + bsize; cur < top; ) {
- prev = cur;
- hav = xdl_hash_record(&cur, top, xpp->flags);
- if (XDL_ALLOC_GROW(recs, nrec + 1, narec))
- goto abort;
- if (!(crec = xdl_cha_alloc(&xdf->rcha)))
- goto abort;
- crec->ptr = (u8 const*) prev;
- crec->size = (long) (cur - prev);
- crec->ha = hav;
- recs[nrec++] = crec;
- if (xdl_classify_record(pass, cf, crec) < 0)
- goto abort;
- }
+ xdl_parse_lines(mf, narec, xdf);
+
+ for (usize i = 0; i < (usize) xdf->nrec; i++) {
+ xrecord_t *rec = xdf->recs[i];
+ char const* dump = (char const*) rec->ptr;
+ rec->ha = xdl_hash_record(&dump, (char const*) (rec->ptr + rec->size), xpp->flags);
I think we should update xdl_hash_record() to stop updating dump and use
the length from xdl_parse_lines(). Now that we parse the lines before
hashing we should use that length in the hash function so we have a
single definition of line length.
+ xdl_classify_record(pass, cf, rec);
We should return an error if this call fails like the original code
Thanks
Phillip
}
- if (!XDL_CALLOC_ARRAY(rchg, nrec + 2))
+
+ if (!XDL_CALLOC_ARRAY(rchg, xdf->nrec + 2))
goto abort;
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
- if (!XDL_ALLOC_ARRAY(rindex, nrec + 1))
+ if (!XDL_ALLOC_ARRAY(rindex, xdf->nrec + 1))
goto abort;
- if (!XDL_ALLOC_ARRAY(ha, nrec + 1))
+ if (!XDL_ALLOC_ARRAY(ha, xdf->nrec + 1))
goto abort;
}
- xdf->nrec = nrec;
- xdf->recs = recs;
xdf->rchg = rchg + 1;
xdf->rindex = rindex;
xdf->nreff = 0;
xdf->ha = ha;
xdf->dstart = 0;
- xdf->dend = nrec - 1;
+ xdf->dend = xdf->nrec - 1;
return 0;
@@ -194,7 +207,7 @@ abort:
xdl_free(ha);
xdl_free(rindex);
xdl_free(rchg);
- xdl_free(recs);
+ xdl_free(xdf->recs);
xdl_cha_free(&xdf->rcha);
return -1;
}