From: "Ezekiel Newren via GitGitGadget" <gitgitgadget@gmail•com>
To: git@vger•kernel.org
Cc: Ezekiel Newren <ezekielnewren@gmail•com>,
Ezekiel Newren <ezekielnewren@gmail•com>
Subject: [PATCH 03/10] xdiff: don't waste time guessing the number of lines
Date: Fri, 02 Jan 2026 18:52:17 +0000 [thread overview]
Message-ID: <53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2156.git.git.1767379944.gitgitgadget@gmail.com>
From: Ezekiel Newren <ezekielnewren@gmail•com>
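Every line has to be read in anyway, so stop estimating the line count
up front with xdl_guess_lines(). Instead, read both files first and
classify the records afterwards, sizing the classifier from the actual
record counts. Also move the memset() into xdl_init_classifier().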
Signed-off-by: Ezekiel Newren <ezekielnewren@gmail•com>
---
xdiff/xprepare.c | 52 +++++++++++++++++++-----------------------------
xdiff/xutils.c | 20 -------------------
xdiff/xutils.h | 1 -
3 files changed, 21 insertions(+), 52 deletions(-)
diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c
index 34c82e4f8e..96a32cc5e9 100644
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@@ -26,8 +26,6 @@
#define XDL_KPDIS_RUN 4
#define XDL_MAX_EQLIMIT 1024
#define XDL_SIMSCAN_WINDOW 100
-#define XDL_GUESS_NLINES1 256
-#define XDL_GUESS_NLINES2 20
#define DISCARD 0
#define KEEP 1
@@ -55,6 +53,8 @@ typedef struct s_xdlclassifier {
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
+ memset(cf, 0, sizeof(xdlclassifier_t));
+
cf->flags = flags;
cf->hbits = xdl_hashbits((unsigned int) size);
@@ -134,12 +134,12 @@ static void xdl_free_ctx(xdfile_t *xdf)
}
-static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
- xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_ctx(mmfile_t *mf, xdfile_t *xdf, uint64_t flags) {
long bsize;
uint64_t hav;
uint8_t const *blk, *cur, *top, *prev;
xrecord_t *crec;
+ long narec = 8;
xdf->reference_index = NULL;
xdf->changed = NULL;
@@ -152,23 +152,21 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_
if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
for (top = blk + bsize; cur < top; ) {
prev = cur;
- hav = xdl_hash_record(&cur, top, xpp->flags);
+ hav = xdl_hash_record(&cur, top, flags);
if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
goto abort;
crec = &xdf->recs[xdf->nrec++];
crec->ptr = prev;
crec->size = cur - prev;
crec->line_hash = hav;
- if (xdl_classify_record(pass, cf, crec) < 0)
- goto abort;
}
}
if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
goto abort;
- if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
- (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
+ if ((XDF_DIFF_ALG(flags) != XDF_PATIENCE_DIFF) &&
+ (XDF_DIFF_ALG(flags) != XDF_HISTOGRAM_DIFF)) {
if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
goto abort;
}
@@ -381,37 +379,29 @@ static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
xdfenv_t *xe) {
- long enl1, enl2, sample;
xdlclassifier_t cf;
- memset(&cf, 0, sizeof(cf));
-
- /*
- * For histogram diff, we can afford a smaller sample size and
- * thus a poorer estimate of the number of lines, as the hash
- * table (rhash) won't be filled up/grown. The number of lines
- * (nrecs) will be updated correctly anyway by
- * xdl_prepare_ctx().
- */
- sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
- ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
+ if (xdl_prepare_ctx(mf1, &xe->xdf1, xpp->flags) < 0) {
- enl1 = xdl_guess_lines(mf1, sample) + 1;
- enl2 = xdl_guess_lines(mf2, sample) + 1;
-
- if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
return -1;
+ }
+ if (xdl_prepare_ctx(mf2, &xe->xdf2, xpp->flags) < 0) {
- if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
-
- xdl_free_classifier(&cf);
+ xdl_free_ctx(&xe->xdf1);
return -1;
}
- if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
- xdl_free_ctx(&xe->xdf1);
- xdl_free_classifier(&cf);
+ if (xdl_init_classifier(&cf, xe->xdf1.nrec + xe->xdf2.nrec + 1, xpp->flags) < 0)
return -1;
+
+ for (size_t i = 0; i < xe->xdf1.nrec; i++) {
+ xrecord_t *rec = &xe->xdf1.recs[i];
+ xdl_classify_record(1, &cf, rec);
+ }
+
+ for (size_t i = 0; i < xe->xdf2.nrec; i++) {
+ xrecord_t *rec = &xe->xdf2.recs[i];
+ xdl_classify_record(2, &cf, rec);
}
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
diff --git a/xdiff/xutils.c b/xdiff/xutils.c
index 77ee1ad9c8..b3d51197c1 100644
--- a/xdiff/xutils.c
+++ b/xdiff/xutils.c
@@ -118,26 +118,6 @@ void *xdl_cha_alloc(chastore_t *cha) {
return data;
}
-long xdl_guess_lines(mmfile_t *mf, long sample) {
- long nl = 0, size, tsize = 0;
- char const *data, *cur, *top;
-
- if ((cur = data = xdl_mmfile_first(mf, &size))) {
- for (top = data + size; nl < sample && cur < top; ) {
- nl++;
- if (!(cur = memchr(cur, '\n', top - cur)))
- cur = top;
- else
- cur++;
- }
- tsize += (long) (cur - data);
- }
-
- if (nl && tsize)
- nl = xdl_mmfile_size(mf) / (tsize / nl);
-
- return nl + 1;
-}
int xdl_blankline(const char *line, long size, long flags)
{
diff --git a/xdiff/xutils.h b/xdiff/xutils.h
index 615b4a9d35..d800840dd0 100644
--- a/xdiff/xutils.h
+++ b/xdiff/xutils.h
@@ -31,7 +31,6 @@ int xdl_emit_diffrec(char const *rec, long size, char const *pre, long psize,
int xdl_cha_init(chastore_t *cha, long isize, long icount);
void xdl_cha_free(chastore_t *cha);
void *xdl_cha_alloc(chastore_t *cha);
-long xdl_guess_lines(mmfile_t *mf, long sample);
int xdl_blankline(const char *line, long size, long flags);
int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags);
uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top);
--
gitgitgadget
next prev parent reply other threads:[~2026-01-02 18:52 UTC|newest]
Thread overview: 124+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-02 18:52 [PATCH 00/10] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 01/10] ivec: introduce the C side of ivec Ezekiel Newren via GitGitGadget
2026-01-04 5:32 ` Junio C Hamano
2026-01-17 16:06 ` Ezekiel Newren
2026-01-08 14:34 ` Phillip Wood
2026-01-15 15:55 ` Ezekiel Newren
2026-01-16 10:39 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 13:55 ` Phillip Wood
2026-01-17 16:04 ` Ezekiel Newren
2026-01-18 14:58 ` René Scharfe
2026-01-17 16:14 ` Ezekiel Newren
2026-01-17 16:16 ` Ezekiel Newren
2026-01-17 17:40 ` Phillip Wood
2026-01-19 5:59 ` Jeff King
2026-01-19 20:21 ` Ezekiel Newren
2026-01-19 20:40 ` Jeff King
2026-01-20 2:36 ` D. Ben Knoble
2026-01-21 21:00 ` Ezekiel Newren
2026-01-21 21:20 ` Jeff King
2026-01-21 21:31 ` Junio C Hamano
2026-01-21 21:45 ` Ezekiel Newren
2026-01-20 13:46 ` Phillip Wood
2026-01-20 14:06 ` Phillip Wood
2026-01-21 21:39 ` Ezekiel Newren
2026-01-28 11:15 ` Phillip Wood
2026-01-16 20:19 ` René Scharfe
2026-01-17 15:58 ` Ezekiel Newren
2026-01-18 14:55 ` René Scharfe
2026-01-02 18:52 ` [PATCH 02/10] xdiff: make classic diff explicit by creating xdl_do_classic_diff() Ezekiel Newren via GitGitGadget
2026-01-20 15:01 ` Phillip Wood
2026-01-21 21:05 ` Ezekiel Newren
2026-01-02 18:52 ` Ezekiel Newren via GitGitGadget [this message]
2026-01-20 15:02 ` [PATCH 03/10] xdiff: don't waste time guessing the number of lines Phillip Wood
2026-01-21 21:12 ` Ezekiel Newren
2026-01-22 10:16 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 04/10] xdiff: let patience and histogram benefit from xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 15:02 ` Phillip Wood
2026-01-21 14:49 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 05/10] xdiff: use xdfenv_t in xdl_trim_ends() and xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 06/10] xdiff: cleanup xdl_trim_ends() Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 07/10] xdiff: replace xdfile_t.dstart with xdfenv_t.delta_start Ezekiel Newren via GitGitGadget
2026-01-20 16:32 ` Phillip Wood
2026-01-28 10:51 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 08/10] xdiff: replace xdfile_t.dend with xdfenv_t.delta_end Ezekiel Newren via GitGitGadget
2026-01-02 18:52 ` [PATCH 09/10] xdiff: remove dependence on xdlclassifier from xdl_cleanup_records() Ezekiel Newren via GitGitGadget
2026-01-16 20:19 ` René Scharfe
2026-01-17 16:34 ` Ezekiel Newren
2026-01-18 18:23 ` René Scharfe
2026-01-21 15:01 ` Phillip Wood
2026-01-02 18:52 ` [PATCH 10/10] xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c Ezekiel Newren via GitGitGadget
2026-01-21 15:01 ` Phillip Wood
2026-01-28 10:56 ` Phillip Wood
2026-01-04 2:44 ` [PATCH 00/10] Xdiff cleanup part 3 Junio C Hamano
2026-01-04 6:01 ` Yee Cheng Chin
2026-01-28 14:40 ` Phillip Wood
2026-03-06 23:03 ` Junio C Hamano
2026-03-09 19:06 ` Ezekiel Newren
2026-03-09 23:31 ` Junio C Hamano
2026-03-25 21:11 ` [PATCH v2 0/5] " Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 1/5] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 2/5] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 3/5] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 4/5] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-25 21:11 ` [PATCH v2 5/5] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-25 21:58 ` Junio C Hamano
2026-03-26 6:26 ` [PATCH v2 0/5] Xdiff cleanup part 3 SZEDER Gábor
2026-03-27 19:23 ` [PATCH v3 0/6] " Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-27 21:09 ` Junio C Hamano
2026-03-27 23:01 ` Junio C Hamano
2026-03-30 16:00 ` Ezekiel Newren
2026-03-30 19:59 ` Junio C Hamano
2026-03-31 1:29 ` Ezekiel Newren
2026-03-27 19:23 ` [PATCH v3 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-27 19:23 ` [PATCH v3 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-30 16:59 ` [PATCH v4 0/6] Xdiff cleanup part 3 Ezekiel Newren via GitGitGadget
2026-03-30 16:59 ` [PATCH v4 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-03-30 17:23 ` Ezekiel Newren
2026-03-30 22:53 ` Junio C Hamano
2026-03-30 16:59 ` [PATCH v4 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-03-30 22:59 ` Junio C Hamano
2026-03-30 17:00 ` [PATCH v4 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-03-30 17:00 ` [PATCH v4 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-03-31 9:44 ` Phillip Wood
2026-03-31 16:13 ` Junio C Hamano
2026-04-14 21:58 ` Ezekiel Newren
2026-04-14 22:15 ` Junio C Hamano
2026-04-15 13:54 ` Phillip Wood
2026-03-30 17:00 ` [PATCH v4 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-03-30 23:02 ` Junio C Hamano
2026-03-31 9:44 ` Phillip Wood
2026-03-30 17:00 ` [PATCH v4 6/6] xdiff/xdl_cleanup_records: simplify INVESTIGATE handling for clarity Ezekiel Newren via GitGitGadget
2026-03-31 9:43 ` Phillip Wood
2026-04-01 16:00 ` Phillip Wood
2026-03-30 23:04 ` [PATCH v4 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-03-31 9:45 ` Phillip Wood
2026-04-08 20:26 ` [PATCH v5 " Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-14 10:09 ` Phillip Wood
2026-04-08 20:26 ` [PATCH v5 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-08 20:26 ` [PATCH v5 6/6] xdiff/xdl_cleanup_records: put braces around the else clause Ezekiel Newren via GitGitGadget
2026-04-08 21:28 ` [PATCH v5 0/6] Xdiff cleanup part 3 Junio C Hamano
2026-04-09 14:01 ` Phillip Wood
2026-04-14 10:08 ` Phillip Wood
2026-04-14 17:06 ` Junio C Hamano
2026-04-29 22:08 ` [PATCH v6 " Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 1/6] xdiff/xdl_cleanup_records: delete local recs pointer Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 2/6] xdiff: use unambiguous types in xdl_bogo_sqrt() Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 3/6] xdiff/xdl_cleanup_records: use unambiguous types Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 4/6] xdiff/xdl_cleanup_records: make limits more clear Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 5/6] xdiff/xdl_cleanup_records: make setting action easier to follow Ezekiel Newren via GitGitGadget
2026-04-29 22:08 ` [PATCH v6 6/6] xdiff/xdl_cleanup_records: make execution of " Ezekiel Newren via GitGitGadget
2026-04-30 13:35 ` [PATCH v6 0/6] Xdiff cleanup part 3 Phillip Wood
2026-04-30 21:08 ` Ezekiel Newren
2026-05-04 0:59 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=53e4840c1653772379dc8d5c883b34717b81ac43.1767379944.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail$(echo .)com \
--cc=ezekielnewren@gmail$(echo .)com \
--cc=git@vger$(echo .)kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox