[PATCH] xdiff: disable cleanup_records heuristic with --minimal

Niels Glodny <n.glodny@xxxxxxxxxxxxx> · Fri, 25 Apr 2025 17:59:51 +0200

The cleanup_records function marks some lines as changed
before running the actual diff algorithm. For most lines,
this is a good performance optimization, but it also marks
lines that are surrounded by many changed lines as changed
as well. This can cause redundant changes and longer-than-
necessary diffs.

Whether this results in better-looking diffs is subjective.
However, the --minimal flag explicitly requests the shortest
possible diff.

The performance impact of this change is negligible, and it
results in shorter diffs in about 1.3% of diffs in Git's
history.

Signed-off-by: Niels Glodny <n.glodny@xxxxxxxxxxxxx>
---
 t/meson.build           |  1 +
 t/t4071-diff-minimal.sh | 16 ++++++++++++++++
 xdiff/xprepare.c        | 22 +++++++++++++---------
 3 files changed, 30 insertions(+), 9 deletions(-)
 create mode 100755 t/t4071-diff-minimal.sh

diff --git a/t/meson.build b/t/meson.build
index bfb744e886..8f2e9d2c50 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -501,6 +501,7 @@ integration_tests = [
   't4068-diff-symmetric-merge-base.sh',
   't4069-remerge-diff.sh',
   't4070-diff-pairs.sh',
+  't4071-diff-minimal.sh',
   't4100-apply-stat.sh',
   't4101-apply-nonl.sh',
   't4102-apply-rename.sh',
diff --git a/t/t4071-diff-minimal.sh b/t/t4071-diff-minimal.sh
new file mode 100755
index 0000000000..3ad759dab4
--- /dev/null
+++ b/t/t4071-diff-minimal.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+test_description='minimal diff algorithm'
+
+. ./test-lib.sh
+
+test_expect_success 'minimal diff should not mark changes between changed lines' '
+	printf "x\nx\nx\nx\n" >pre &&
+	printf "x\nx\nx\nA\nB\nC\nD\nx\nE\nF\nG\n" >post &&
+	test_must_fail git diff --no-index \
+		--minimal pre post >diff &&
+	! grep "+x" diff &&
+	! grep "-x" diff
+'
+
+test_done
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index c84549f6c5..cb0b6c9fd6 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -61,9 +61,11 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
 			   xdlclassifier_t *cf, xdfile_t *xdf);
 static void xdl_free_ctx(xdfile_t *xdf);
 static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
-static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
+static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1,
+			       xdfile_t *xdf2, int need_min);
 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
-static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
+static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1,
+			     xdfile_t *xdf2, int need_min);
 
 
 
@@ -279,7 +281,8 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 
 	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
 	    (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
-	    xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
+	    xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2,
+			      (xpp->flags & XDF_NEED_MINIMAL) != 0) < 0) {
 
 		xdl_free_ctx(&xe->xdf2);
 		xdl_free_ctx(&xe->xdf1);
@@ -363,7 +366,8 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
  * matches on the other file. Also, lines that have multiple matches
  * might be potentially discarded if they happear in a run of discardable.
  */
-static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
+static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1,
+			       xdfile_t *xdf2, int need_min) {
 	long i, nm, nreff, mlim;
 	xrecord_t **recs;
 	xdlclass_t *rcrec;
@@ -379,7 +383,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
 	for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
 		rcrec = cf->rcrecs[(*recs)->ha];
 		nm = rcrec ? rcrec->len2 : 0;
-		dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+		dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
 	}
 
 	if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
@@ -387,7 +391,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
 	for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
 		rcrec = cf->rcrecs[(*recs)->ha];
 		nm = rcrec ? rcrec->len1 : 0;
-		dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
+		dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1;
 	}
 
 	for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
@@ -449,10 +453,10 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
 }
 
 
-static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
-
+static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1,
+	xdfile_t *xdf2, int need_min) {
 	if (xdl_trim_ends(xdf1, xdf2) < 0 ||
-	    xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
+	    xdl_cleanup_records(cf, xdf1, xdf2, need_min) < 0) {
 
 		return -1;
 	}

base-commit: f65182a99e545d2f2bc22e6c1c2da192133b16a3
-- 
2.34.1