public inbox for git@vger.kernel.org 
 help / color / mirror / Atom feed
From: "Ezekiel Newren via GitGitGadget" <gitgitgadget@gmail•com>
To: git@vger•kernel.org
Cc: Ezekiel Newren <ezekielnewren@gmail•com>,
	Ezekiel Newren <ezekielnewren@gmail•com>
Subject: [PATCH 03/10] xdiff: don't waste time guessing the number of lines
Date: Fri, 02 Jan 2026 18:52:17 +0000	[thread overview]
Message-ID: <53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2156.git.git.1767379944.gitgitgadget@gmail.com>

From: Ezekiel Newren <ezekielnewren@gmail•com>

All lines must be read anyway, so classify them after they're read in.
Also move the memset() into xdl_init_classifier().

Signed-off-by: Ezekiel Newren <ezekielnewren@gmail•com>
---
 xdiff/xprepare.c | 52 +++++++++++++++++++-----------------------------
 xdiff/xutils.c   | 20 -------------------
 xdiff/xutils.h   |  1 -
 3 files changed, 21 insertions(+), 52 deletions(-)

diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index 34c82e4f8e..96a32cc5e9 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -26,8 +26,6 @@
 #define XDL_KPDIS_RUN 4
 #define XDL_MAX_EQLIMIT 1024
 #define XDL_SIMSCAN_WINDOW 100
-#define XDL_GUESS_NLINES1 256
-#define XDL_GUESS_NLINES2 20
 
 #define DISCARD 0
 #define KEEP 1
@@ -55,6 +53,8 @@ typedef struct s_xdlclassifier {
 
 
 static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
+	memset(cf, 0, sizeof(xdlclassifier_t));
+
 	cf->flags = flags;
 
 	cf->hbits = xdl_hashbits((unsigned int) size);
@@ -134,12 +134,12 @@ static void xdl_free_ctx(xdfile_t *xdf)
 }
 
 
-static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
-			   xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) {
 	long bsize;
 	uint64_t hav;
 	uint8_t const *blk, *cur, *top, *prev;
 	xrecord_t *crec;
+	long narec = 8;
 
 	xdf->reference_index = NULL;
 	xdf->changed = NULL;
@@ -152,23 +152,21 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
 	if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
 		for (top = blk + bsize; cur < top; ) {
 			prev = cur;
-			hav = xdl_hash_record(&cur, top, xpp->flags);
+			hav = xdl_hash_record(&cur, top, flags);
 			if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
 				goto abort;
 			crec = &xdf->recs[xdf->nrec++];
 			crec->ptr = prev;
 			crec->size = cur - prev;
 			crec->line_hash = hav;
-			if (xdl_classify_record(pass, cf, crec) < 0)
-				goto abort;
 		}
 	}
 
 	if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
 		goto abort;
 
-	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
-	    (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
+	if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) &&
+	    (XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) {
 		if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
 			goto abort;
 	}
@@ -381,37 +379,29 @@ static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2
 
 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 		    xdfenv_t *xe) {
-	long enl1, enl2, sample;
 	xdlclassifier_t cf;
 
-	memset(&cf, 0, sizeof(cf));
-
-	/*
-	 * For histogram diff, we can afford a smaller sample size and
-	 * thus a poorer estimate of the number of lines, as the hash
-	 * table (rhash) won't be filled up/grown. The number of lines
-	 * (nrecs) will be updated correctly anyway by
-	 * xdl_prepare_ctx().
-	 */
-	sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
-		  ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
+	if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) {
 
-	enl1 = xdl_guess_lines(mf1, sample) + 1;
-	enl2 = xdl_guess_lines(mf2, sample) + 1;
-
-	if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
 		return -1;
+	}
+	if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) {
 
-	if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
-
-		xdl_free_classifier(&cf);
+		xdl_free_ctx(&xe->xdf1);
 		return -1;
 	}
-	if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
 
-		xdl_free_ctx(&xe->xdf1);
-		xdl_free_classifier(&cf);
+	if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0)
 		return -1;
+
+	for (size_t i = 0; i < xe->xdf1.nrec; i++) {
+		xrecord_t *rec = &xe->xdf1.recs[i];
+		xdl_classify_record(1, &cf, rec);
+	}
+
+	for (size_t i = 0; i < xe->xdf2.nrec; i++) {
+		xrecord_t *rec = &xe->xdf2.recs[i];
+		xdl_classify_record(2, &cf, rec);
 	}
 
 	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index 77ee1ad9c8..b3d51197c1 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) {
 	return data;
 }
 
-long xdl_guess_lines(mmfile_t *mf, long sample) {
-	long nl = 0, size, tsize = 0;
-	char const *data, *cur, *top;
-
-	if ((cur = data = xdl_mmfile_first(mf, &size))) {
-		for (top = data + size; nl < sample && cur < top; ) {
-			nl++;
-			if (!(cur = memchr(cur, '\n', top - cur)))
-				cur = top;
-			else
-				cur++;
-		}
-		tsize += (long) (cur - data);
-	}
-
-	if (nl && tsize)
-		nl = xdl_mmfile_size(mf) / (tsize / nl);
-
-	return nl + 1;
-}
 
 int xdl_blankline(const char *line, long size, long flags)
 {
diff --git a/xdiff/xutils.h b/xdiff/xutils.h
index 615b4a9d35..d800840dd0 100644
--- a/xdiff/xutils.h
+++ b/xdiff/xutils.h
@@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
 int xdl_cha_init(chastore_t *cha, long isize, long icount);
 void xdl_cha_free(chastore_t *cha);
 void *xdl_cha_alloc(chastore_t *cha);
-long xdl_guess_lines(mmfile_t *mf, long sample);
 int xdl_blankline(const char *line, long size, long flags);
 int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
 uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);
-- 
gitgitgadget


  parent reply	other threads:[~2026-01-02 18:52 UTC|newest]

Thread overview: 124+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-02 18:52 [PATCH 00/10] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 01/10] ivec: introduce the C side of ivec Ezekiel Newren via GitGitGadget
2026-01-04  5:32   ` Junio C Hamano
2026-01-17 16:06     ` Ezekiel Newren
2026-01-08 14:34   ` Phillip Wood
2026-01-15 15:55     ` Ezekiel Newren
2026-01-16 10:39       ` Phillip Wood
2026-01-16 20:19         ` René Scharfe
2026-01-17 13:55           ` Phillip Wood
2026-01-17 16:04             ` Ezekiel Newren
2026-01-18 14:58               ` René Scharfe
2026-01-17 16:14         ` Ezekiel Newren
2026-01-17 16:16           ` Ezekiel Newren
2026-01-17 17:40           ` Phillip Wood
2026-01-19  5:59             ` Jeff King
2026-01-19 20:21               ` Ezekiel Newren
2026-01-19 20:40                 ` Jeff King
2026-01-20  2:36                   ` D. Ben Knoble
2026-01-21 21:00                   ` Ezekiel Newren
2026-01-21 21:20                     ` Jeff King
2026-01-21 21:31                       ` Junio C Hamano
2026-01-21 21:45                         ` Ezekiel Newren
2026-01-20 13:46               ` Phillip Wood
2026-01-20 14:06       ` Phillip Wood
2026-01-21 21:39         ` Ezekiel Newren
2026-01-28 11:15           ` Phillip Wood
2026-01-16 20:19   ` René Scharfe
2026-01-17 15:58     ` Ezekiel Newren
2026-01-18 14:55       ` René Scharfe
2026-01-02 18:52 ` [PATCH 02/10] xdiff: make classic diff explicit by creating xdl_do_classic_diff() Ezekiel Newren via GitGitGadget
2026-01-20 15:01   ` Phillip Wood
2026-01-21 21:05     ` Ezekiel Newren
2026-01-02 18:52 ` Ezekiel Newren via GitGitGadget [this message]
2026-01-20 15:02   ` [PATCH 03/10] xdiff: don't waste time guessing the number of lines Phillip Wood
2026-01-21 21:12     ` Ezekiel Newren
2026-01-22 10:16       ` Phillip Wood
2026-01-02 18:52 ` [PATCH 04/10] xdiff: let patience and histogram benefit from xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 15:02   ` Phillip Wood
2026-01-21 14:49     ` Phillip Wood
2026-01-02 18:52 ` [PATCH 05/10] xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-20 16:32   ` Phillip Wood
2026-01-02 18:52 ` [PATCH 06/10] xdiff: cleanup xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 16:32   ` Phillip Wood
2026-01-02 18:52 ` [PATCH 07/10] xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start Ezekiel Newren via GitGitGadget
2026-01-20 16:32   ` Phillip Wood
2026-01-28 10:51     ` Phillip Wood
2026-01-02 18:52 ` [PATCH 08/10] xdiff: replace xdfile_t.dend with xdfenv_t.delta_end Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 09/10] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-16 20:19   ` René Scharfe
2026-01-17 16:34     ` Ezekiel Newren
2026-01-18 18:23       ` René Scharfe
2026-01-21 15:01   ` Phillip Wood
2026-01-02 18:52 ` [PATCH 10/10] xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c Ezekiel Newren via GitGitGadget
2026-01-21 15:01   ` Phillip Wood
2026-01-28 10:56     ` Phillip Wood
2026-01-04  2:44 ` [PATCH 00/10] Xdiff cleanup part 3 Junio C Hamano
2026-01-04  6:01 ` Yee Cheng Chin
2026-01-28 14:40 ` Phillip Wood
2026-03-06 23:03 ` Junio C Hamano
2026-03-09 19:06   ` Ezekiel Newren
2026-03-09 23:31     ` Junio C Hamano
2026-03-25 21:11 ` [PATCH v2 0/5] " Ezekiel Newren via GitGitGadget
2026-03-25 21:11   ` [PATCH v2 1/5] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-25 21:11   ` [PATCH v2 2/5] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-25 21:11   ` [PATCH v2 3/5] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-25 21:11   ` [PATCH v2 4/5] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-25 21:11   ` [PATCH v2 5/5] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-25 21:58     ` Junio C Hamano
2026-03-26  6:26   ` [PATCH v2 0/5] Xdiff cleanup part 3 SZEDER Gábor
2026-03-27 19:23   ` [PATCH v3 0/6] " Ezekiel Newren via GitGitGadget
2026-03-27 19:23     ` [PATCH v3 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-27 19:23     ` [PATCH v3 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-27 19:23     ` [PATCH v3 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-27 19:23     ` [PATCH v3 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-27 21:09       ` Junio C Hamano
2026-03-27 23:01         ` Junio C Hamano
2026-03-30 16:00           ` Ezekiel Newren
2026-03-30 19:59             ` Junio C Hamano
2026-03-31  1:29               ` Ezekiel Newren
2026-03-27 19:23     ` [PATCH v3 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-27 19:23     ` [PATCH v3 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-30 16:59     ` [PATCH v4 0/6] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-03-30 16:59       ` [PATCH v4 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-30 17:23         ` Ezekiel Newren
2026-03-30 22:53         ` Junio C Hamano
2026-03-30 16:59       ` [PATCH v4 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-30 22:59         ` Junio C Hamano
2026-03-30 17:00       ` [PATCH v4 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-30 17:00       ` [PATCH v4 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-31  9:44         ` Phillip Wood
2026-03-31 16:13           ` Junio C Hamano
2026-04-14 21:58           ` Ezekiel Newren
2026-04-14 22:15             ` Junio C Hamano
2026-04-15 13:54               ` Phillip Wood
2026-03-30 17:00       ` [PATCH v4 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-30 23:02         ` Junio C Hamano
2026-03-31  9:44           ` Phillip Wood
2026-03-30 17:00       ` [PATCH v4 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-31  9:43         ` Phillip Wood
2026-04-01 16:00         ` Phillip Wood
2026-03-30 23:04       ` [PATCH v4 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-03-31  9:45         ` Phillip Wood
2026-04-08 20:26       ` [PATCH v5 " Ezekiel Newren via GitGitGadget
2026-04-08 20:26         ` [PATCH v5 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-08 20:26         ` [PATCH v5 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-08 20:26         ` [PATCH v5 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-08 20:26         ` [PATCH v5 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-14 10:09           ` Phillip Wood
2026-04-08 20:26         ` [PATCH v5 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-08 20:26         ` [PATCH v5 6/6] xdiff/xdl_cleanup_records: put braces around the else clause Ezekiel Newren via GitGitGadget
2026-04-08 21:28         ` [PATCH v5 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-04-09 14:01           ` Phillip Wood
2026-04-14 10:08         ` Phillip Wood
2026-04-14 17:06           ` Junio C Hamano
2026-04-29 22:08         ` [PATCH v6 " Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-29 22:08           ` [PATCH v6 6/6] xdiff/xdl_cleanup_records: make execution of " Ezekiel Newren via GitGitGadget
2026-04-30 13:35           ` [PATCH v6 0/6] Xdiff cleanup part 3 Phillip Wood
2026-04-30 21:08             ` Ezekiel Newren
2026-05-04  0:59             ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail$(echo .)com \
    --cc=ezekielnewren@gmail$(echo .)com \
    --cc=git@vger$(echo .)kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.