* [RFC PATCH 1/3] diff: add word-diff-align line pairing
2026-05-27 4:23 [RFC PATCH 0/3] diff: pair edited lines inside moved blocks Keita Oda
@ 2026-05-27 4:24 ` Keita Oda
2026-05-27 4:24 ` [RFC PATCH 2/3] diff: render word-diff-align pairs for RFC review Keita Oda
2026-05-27 4:24 ` [RFC PATCH 3/3] t4034: cover moved-and-edited word diff alignment Keita Oda
2 siblings, 0 replies; 4+ messages in thread
From: Keita Oda @ 2026-05-27 4:24 UTC (permalink / raw)
To: git; +Cc: Keita ODA
From: Keita ODA <ainsophyao@gmail•com>
Add an opt-in --word-diff-align mode that post-processes emitted diff
symbols and tries to pair similar deleted and inserted lines inside each hunk.
This is not a replacement for the diff algorithm. The normal diff is produced
first; the new code only annotates the already-emitted symbols for review.
The candidate search is intentionally approximate. Each line is tokenized into
word-ish runs and single non-space symbols. Each token gets a hash, each line
gets a compact 64-bit fingerprint, and each changed line also gets a small
surrounding window fingerprint. Inserted-side windows are placed into bit
buckets, and deleted-side windows query those buckets to avoid a full all-pairs
scan in typical cases.
The fingerprint is only a retrieval index. Candidate pairs are scored again
using token hashes:
W = unique token overlap in the small surrounding windows
L = center-line token LCS score
S = W + 4L
For the center-line score, tokens repeated in the surrounding window carry less
weight. This is a local-IDF-like approximation: in a run of "import" or
"#define" lines, the repeated keyword is weak evidence, while the identifier
specific to the center line remains strong evidence.
For this RFC, selected pairs are exposed with a debug-style "# aligned ..."
comment. The next patch adds a small renderer to make the selected pairs more
readable.
---
diff.c | 809 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
diff.h | 1 +
2 files changed, 802 insertions(+), 8 deletions(-)
diff --git a/diff.c b/diff.c
index 397e38b41..6b8744920 100644
--- a/diff.c
+++ b/diff.c
@@ -46,6 +46,7 @@
#include "setup.h"
#include "strmap.h"
#include "ws.h"
+#include "ewah/ewok.h"
#ifdef NO_FAST_WORKING_DIRECTORY
#define FAST_WORKING_DIRECTORY 0
@@ -1358,6 +1359,774 @@ static void dim_moved_lines(struct diff_options *o)
}
}
+struct word_diff_align_line_info {
+ uint64_t bits;
+ int token_start;
+ int token_nr;
+};
+
+struct word_diff_align_item {
+ int symbol;
+ int change_pos;
+ int line_no;
+};
+
+struct word_diff_align_candidate {
+ int old_pos;
+ int new_pos;
+ int score;
+ int window_score;
+ int window_shared;
+ int line_score;
+ int line_shared;
+};
+
+#define WORD_DIFF_ALIGN_NUMBER_HASH 0x9e3779b9U
+
+#define WORD_DIFF_ALIGN_WINDOW_SIZE 2
+#define WORD_DIFF_ALIGN_FINGERPRINT_BITS 64
+#define WORD_DIFF_ALIGN_BIT_BUCKET_MAX 256
+#define WORD_DIFF_ALIGN_WINDOW_MIN_SCORE 35
+#define WORD_DIFF_ALIGN_WINDOW_MIN_SHARED 4
+#define WORD_DIFF_ALIGN_LINE_WEIGHT 4
+#define WORD_DIFF_ALIGN_SCORE_MIN 175
+
+static void word_diff_align_window_bounds(int nr, int pos,
+ int *start_p, int *end_p)
+{
+ int radius = WORD_DIFF_ALIGN_WINDOW_SIZE;
+
+ *start_p = pos > radius ? pos - radius : 0;
+ *end_p = pos + radius + 1 < nr ? pos + radius + 1 : nr;
+}
+
+static int word_diff_align_payload_len(const struct emitted_diff_symbol *e)
+{
+ int len = e->len;
+
+ if (len && e->line[len - 1] == '\n')
+ len--;
+ if (len && e->line[len - 1] == '\r')
+ len--;
+ return len;
+}
+
+static int word_diff_align_word_byte(char ch)
+{
+ return isalnum((unsigned char)ch) || ch == '_';
+}
+
+static int word_diff_align_numeric_token(const char *line, int start, int end)
+{
+ int i;
+
+ for (i = start; i < end; i++)
+ if (!isdigit((unsigned char)line[i]))
+ return 0;
+ return start < end;
+}
+
+static uint64_t word_diff_align_token_bits(unsigned int hash)
+{
+ return (1ULL << (hash & 63)) | (1ULL << ((hash >> 6) & 63));
+}
+
+static int word_diff_align_next_token(const char *line, int len, int *pos,
+ int *start_p, int *end_p)
+{
+ while (*pos < len && isspace((unsigned char)line[*pos]))
+ (*pos)++;
+ if (*pos == len)
+ return 0;
+
+ *start_p = *pos;
+ if (word_diff_align_word_byte(line[*pos]))
+ while (*pos < len && word_diff_align_word_byte(line[*pos]))
+ (*pos)++;
+ else
+ (*pos)++;
+ *end_p = *pos;
+ return 1;
+}
+
+static void word_diff_align_prepare_line(const struct emitted_diff_symbol *e,
+ struct word_diff_align_line_info *line,
+ unsigned int **tokens,
+ int *tokens_nr,
+ int *tokens_alloc)
+{
+ int len = word_diff_align_payload_len(e);
+ int pos = 0;
+ int start, end;
+
+ line->bits = 0;
+ line->token_start = *tokens_nr;
+ while (word_diff_align_next_token(e->line, len, &pos, &start, &end)) {
+ int numeric = word_diff_align_numeric_token(e->line, start, end);
+ unsigned int hash = numeric ? WORD_DIFF_ALIGN_NUMBER_HASH :
+ memhash(e->line + start, end - start);
+
+ ALLOC_GROW(*tokens, *tokens_nr + 1, *tokens_alloc);
+ (*tokens)[*tokens_nr] = hash;
+ line->bits |= word_diff_align_token_bits(hash);
+ (*tokens_nr)++;
+ }
+ line->token_nr = *tokens_nr - line->token_start;
+}
+
+static uint64_t word_diff_align_window_bits(struct word_diff_align_line_info *lines,
+ int nr, int pos)
+{
+ uint64_t bits = 0;
+ int start, end;
+ int i;
+
+ word_diff_align_window_bounds(nr, pos, &start, &end);
+ for (i = start; i < end; i++)
+ bits |= lines[i].bits;
+ return bits;
+}
+
+static int word_diff_align_filter_score(uint64_t a, uint64_t b, int *shared_p)
+{
+ int shared = ewah_bit_popcount64(a & b);
+ int total = ewah_bit_popcount64(a | b);
+
+ *shared_p = shared;
+ return total ? shared * 100 / total : 0;
+}
+
+static int word_diff_align_line_has_token(const struct word_diff_align_line_info *line,
+ const unsigned int *tokens,
+ unsigned int hash);
+
+static int word_diff_align_window_has_token(const struct word_diff_align_line_info *lines,
+ int start, int end,
+ const unsigned int *tokens,
+ unsigned int hash)
+{
+ int i;
+
+ for (i = start; i < end; i++)
+ if (word_diff_align_line_has_token(&lines[i], tokens, hash))
+ return 1;
+ return 0;
+}
+
+static int word_diff_align_window_seen_token(const struct word_diff_align_line_info *lines,
+ int start, int line_pos,
+ int token_pos,
+ const unsigned int *tokens,
+ unsigned int hash)
+{
+ int i;
+
+ for (i = start; i <= line_pos; i++) {
+ const struct word_diff_align_line_info *line = &lines[i];
+ int end = i == line_pos ? token_pos : line->token_nr;
+ int j;
+
+ for (j = 0; j < end; j++)
+ if (tokens[line->token_start + j] == hash)
+ return 1;
+ }
+ return 0;
+}
+
+static int word_diff_align_window_token_score(const struct word_diff_align_line_info *minus_lines,
+ int minus_nr, int minus_pos,
+ const unsigned int *minus_tokens,
+ const struct word_diff_align_line_info *plus_lines,
+ int plus_nr, int plus_pos,
+ const unsigned int *plus_tokens,
+ int *shared_p)
+{
+ int minus_start, minus_end, plus_start, plus_end;
+ int minus_unique = 0, plus_unique = 0, shared = 0;
+ int i;
+
+ word_diff_align_window_bounds(minus_nr, minus_pos, &minus_start,
+ &minus_end);
+ word_diff_align_window_bounds(plus_nr, plus_pos, &plus_start,
+ &plus_end);
+
+ for (i = minus_start; i < minus_end; i++) {
+ const struct word_diff_align_line_info *line = &minus_lines[i];
+ int j;
+
+ for (j = 0; j < line->token_nr; j++) {
+ unsigned int hash = minus_tokens[line->token_start + j];
+
+ if (word_diff_align_window_seen_token(minus_lines,
+ minus_start, i, j,
+ minus_tokens,
+ hash))
+ continue;
+ minus_unique++;
+ if (word_diff_align_window_has_token(plus_lines,
+ plus_start, plus_end,
+ plus_tokens, hash))
+ shared++;
+ }
+ }
+
+ for (i = plus_start; i < plus_end; i++) {
+ const struct word_diff_align_line_info *line = &plus_lines[i];
+ int j;
+
+ for (j = 0; j < line->token_nr; j++) {
+ unsigned int hash = plus_tokens[line->token_start + j];
+
+ if (word_diff_align_window_seen_token(plus_lines,
+ plus_start, i, j,
+ plus_tokens,
+ hash))
+ continue;
+ plus_unique++;
+ }
+ }
+
+ *shared_p = shared;
+ return minus_unique + plus_unique - shared ?
+ shared * 100 / (minus_unique + plus_unique - shared) : 0;
+}
+
+static int word_diff_align_line_has_token(const struct word_diff_align_line_info *line,
+ const unsigned int *tokens,
+ unsigned int hash)
+{
+ int i;
+
+ for (i = 0; i < line->token_nr; i++)
+ if (tokens[line->token_start + i] == hash)
+ return 1;
+ return 0;
+}
+
+static int word_diff_align_surrounding_line_freq(const struct word_diff_align_line_info *lines,
+ int nr, int pos,
+ const unsigned int *tokens,
+ unsigned int hash)
+{
+ int radius = WORD_DIFF_ALIGN_WINDOW_SIZE;
+ int start = pos > radius ? pos - radius : 0;
+ int end = pos + radius + 1 < nr ? pos + radius + 1 : nr;
+ int i, freq = 0;
+
+ for (i = start; i < end; i++) {
+ if (i == pos)
+ continue;
+ if (word_diff_align_line_has_token(&lines[i], tokens, hash))
+ freq++;
+ }
+ return freq;
+}
+
+static int word_diff_align_shared_token_value(const struct word_diff_align_line_info *minus_lines,
+ int minus_nr, int minus_pos,
+ const unsigned int *minus_tokens,
+ const struct word_diff_align_line_info *plus_lines,
+ int plus_nr, int plus_pos,
+ const unsigned int *plus_tokens,
+ unsigned int hash)
+{
+ int minus_freq = word_diff_align_surrounding_line_freq(minus_lines,
+ minus_nr,
+ minus_pos,
+ minus_tokens,
+ hash);
+ int plus_freq = word_diff_align_surrounding_line_freq(plus_lines,
+ plus_nr,
+ plus_pos,
+ plus_tokens,
+ hash);
+ int freq = minus_freq > plus_freq ? minus_freq : plus_freq;
+ int value = 2 * WORD_DIFF_ALIGN_WINDOW_SIZE - freq;
+
+ return value > 0 ? value : 0;
+}
+
+static int word_diff_align_unmatched_weight(const struct word_diff_align_line_info *line,
+ const unsigned int *tokens,
+ const unsigned int *other_tokens,
+ int other_start, int other_nr)
+{
+ int i, weight = 0;
+
+ for (i = 0; i < line->token_nr; i++) {
+ unsigned int hash = tokens[line->token_start + i];
+ int j, matched = 0;
+
+ for (j = 0; j < other_nr; j++) {
+ if (hash == other_tokens[other_start + j]) {
+ matched = 1;
+ break;
+ }
+ }
+ if (!matched)
+ weight++;
+ }
+ return weight;
+}
+
+static int word_diff_align_line_token_score(const struct word_diff_align_line_info *minus,
+ const struct word_diff_align_line_info *minus_lines,
+ int minus_nr, int minus_pos,
+ const unsigned int *minus_tokens,
+ const struct word_diff_align_line_info *plus,
+ const struct word_diff_align_line_info *plus_lines,
+ int plus_nr, int plus_pos,
+ const unsigned int *plus_tokens,
+ int *shared_p)
+{
+ int i, j, shared;
+ int *lcs;
+ int unmatched, total;
+
+ if (!minus->token_nr || !plus->token_nr) {
+ *shared_p = 0;
+ return 0;
+ }
+
+ CALLOC_ARRAY(lcs, plus->token_nr + 1);
+ for (i = 0; i < minus->token_nr; i++) {
+ unsigned int minus_hash = minus_tokens[minus->token_start + i];
+ int prev = 0;
+
+ for (j = 0; j < plus->token_nr; j++) {
+ unsigned int plus_hash = plus_tokens[plus->token_start + j];
+ int column = j + 1;
+ int saved;
+
+ saved = lcs[column];
+ if (minus_hash == plus_hash) {
+ int value = word_diff_align_shared_token_value(minus_lines,
+ minus_nr,
+ minus_pos,
+ minus_tokens,
+ plus_lines,
+ plus_nr,
+ plus_pos,
+ plus_tokens,
+ minus_hash);
+ int best = prev + value;
+
+ if (lcs[column] > best)
+ best = lcs[column];
+ if (lcs[column - 1] > best)
+ best = lcs[column - 1];
+ lcs[column] = best;
+ }
+ else if (lcs[column - 1] > lcs[column])
+ lcs[column] = lcs[column - 1];
+ prev = saved;
+ }
+ }
+
+ shared = lcs[plus->token_nr];
+ free(lcs);
+ unmatched = word_diff_align_unmatched_weight(minus, minus_tokens,
+ plus_tokens,
+ plus->token_start,
+ plus->token_nr) +
+ word_diff_align_unmatched_weight(plus, plus_tokens,
+ minus_tokens,
+ minus->token_start,
+ minus->token_nr);
+ total = 2 * shared + unmatched;
+ *shared_p = shared;
+ return total ? shared * 200 / total : 0;
+}
+
+static int word_diff_align_pair_score(int window_score,
+ const struct word_diff_align_line_info *minus,
+ const struct word_diff_align_line_info *minus_lines,
+ int minus_nr, int minus_pos,
+ const unsigned int *minus_tokens,
+ const struct word_diff_align_line_info *plus,
+ const struct word_diff_align_line_info *plus_lines,
+ int plus_nr, int plus_pos,
+ const unsigned int *plus_tokens,
+ int *line_score_p,
+ int *line_shared_p)
+{
+ int line_score, line_shared;
+
+ line_score = word_diff_align_line_token_score(minus, minus_lines,
+ minus_nr, minus_pos,
+ minus_tokens,
+ plus, plus_lines,
+ plus_nr, plus_pos,
+ plus_tokens, &line_shared);
+ *line_score_p = line_score;
+ *line_shared_p = line_shared;
+
+ /*
+ * The bit fingerprint is only a retrieval approximation. The window
+ * score passed here is computed from exact token hashes in the 5-line
+ * windows, while the line score gives the center line an additional
+ * identity bonus.
+ */
+ return window_score + WORD_DIFF_ALIGN_LINE_WEIGHT * line_score;
+}
+
+static int word_diff_align_candidate_cmp(const void *va, const void *vb)
+{
+ const struct word_diff_align_candidate *a = va;
+ const struct word_diff_align_candidate *b = vb;
+
+ if (a->score != b->score)
+ return b->score - a->score;
+ if (a->line_score != b->line_score)
+ return b->line_score - a->line_score;
+ if (a->window_score != b->window_score)
+ return b->window_score - a->window_score;
+ if (a->line_shared != b->line_shared)
+ return b->line_shared - a->line_shared;
+ if (a->window_shared != b->window_shared)
+ return b->window_shared - a->window_shared;
+ if (a->old_pos != b->old_pos)
+ return a->old_pos - b->old_pos;
+ return a->new_pos - b->new_pos;
+}
+
+static void word_diff_align_add_candidate(struct word_diff_align_candidate **candidates,
+ int *candidates_nr,
+ int *candidates_alloc,
+ int old_pos, int new_pos,
+ int score, int window_score,
+ int window_shared, int line_score,
+ int line_shared)
+{
+ struct word_diff_align_candidate *candidate;
+
+ ALLOC_GROW(*candidates, *candidates_nr + 1, *candidates_alloc);
+ candidate = &(*candidates)[(*candidates_nr)++];
+ candidate->old_pos = old_pos;
+ candidate->new_pos = new_pos;
+ candidate->score = score;
+ candidate->window_score = window_score;
+ candidate->window_shared = window_shared;
+ candidate->line_score = line_score;
+ candidate->line_shared = line_shared;
+}
+
+static void word_diff_align_debug_append_comment(struct emitted_diff_symbol *line,
+ const struct strbuf *suffix)
+{
+ struct strbuf out = STRBUF_INIT;
+ char *old_line = (char *)line->line;
+ size_t new_len;
+
+ if (line->len && line->line[line->len - 1] == '\n') {
+ strbuf_add(&out, line->line, line->len - 1);
+ strbuf_addbuf(&out, suffix);
+ strbuf_addch(&out, '\n');
+ } else {
+ strbuf_add(&out, line->line, line->len);
+ strbuf_addbuf(&out, suffix);
+ }
+ line->line = strbuf_detach(&out, &new_len);
+ line->len = (int)new_len;
+ free(old_line);
+}
+
+static void word_diff_align_debug_mark_pair(struct emitted_diff_symbol *minus_line,
+ struct emitted_diff_symbol *plus_line,
+ int minus_lineno, int plus_lineno,
+ int changed, int moved,
+ int window_score,
+ int line_score,
+ int pair_score)
+{
+ struct strbuf suffix = STRBUF_INIT;
+
+ if (moved) {
+ minus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
+ plus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
+ }
+
+ strbuf_addf(&suffix,
+ " # aligned from %d to %d, %s, W=%d L=%d S=%d",
+ minus_lineno, plus_lineno,
+ changed ? "edited" : "unchanged",
+ window_score, line_score, pair_score);
+ word_diff_align_debug_append_comment(minus_line, &suffix);
+ word_diff_align_debug_append_comment(plus_line, &suffix);
+ strbuf_release(&suffix);
+}
+
+static void word_diff_align_add_item(struct word_diff_align_item **items,
+ int *items_nr, int *items_alloc,
+ int symbol, int change_pos, int line_no)
+{
+ ALLOC_GROW(*items, *items_nr + 1, *items_alloc);
+ (*items)[*items_nr].symbol = symbol;
+ (*items)[*items_nr].change_pos = change_pos;
+ (*items)[*items_nr].line_no = line_no;
+ (*items_nr)++;
+}
+
+static void word_diff_align_hunk_start(const struct emitted_diff_symbols *e,
+ int start, int *old_lineno,
+ int *new_lineno)
+{
+ const char *line;
+ char *endp;
+
+ *old_lineno = 0;
+ *new_lineno = 0;
+ if (start <= 0 || e->buf[start - 1].s != DIFF_SYMBOL_CONTEXT_FRAGINFO)
+ return;
+ line = e->buf[start - 1].line;
+ if (!starts_with(line, "@@ -"))
+ return;
+ line += 4;
+ if (!isdigit((unsigned char)*line))
+ return;
+ *old_lineno = strtol(line, &endp, 10);
+ line = endp;
+ if (*line == ',') {
+ line++;
+ strtol(line, &endp, 10);
+ line = endp;
+ }
+ if (!starts_with(line, " +"))
+ return;
+ line += 2;
+ if (!isdigit((unsigned char)*line))
+ return;
+ *new_lineno = strtol(line, &endp, 10);
+}
+
+static int word_diff_align_local_pair(int old_pos, int new_pos)
+{
+ return old_pos == new_pos;
+}
+
+static void word_diff_align_hunk(struct emitted_diff_symbols *e, int start, int end)
+{
+ struct word_diff_align_item *old_items = NULL, *new_items = NULL;
+ int old_items_nr = 0, old_items_alloc = 0;
+ int new_items_nr = 0, new_items_alloc = 0;
+ int minus_nr = 0, plus_nr = 0;
+ struct word_diff_align_line_info *old_lines = NULL, *new_lines = NULL;
+ unsigned int *old_tokens = NULL, *new_tokens = NULL;
+ int old_tokens_nr = 0, old_tokens_alloc = 0;
+ int new_tokens_nr = 0, new_tokens_alloc = 0;
+ uint64_t *new_contexts = NULL;
+ int *bucket_heads = NULL, *bucket_next = NULL, *bucket_counts = NULL;
+ int *candidate_counts = NULL, *candidate_touched = NULL;
+ char *used_old = NULL, *used_plus = NULL;
+ struct word_diff_align_candidate *candidates = NULL;
+ int candidates_nr = 0, candidates_alloc = 0;
+ int old_lineno, new_lineno;
+ int i, j;
+
+ word_diff_align_hunk_start(e, start, &old_lineno, &new_lineno);
+ for (i = start; i < end; i++) {
+ if (e->buf[i].s == DIFF_SYMBOL_MINUS) {
+ word_diff_align_add_item(&old_items, &old_items_nr,
+ &old_items_alloc, i, minus_nr++,
+ old_lineno++);
+ } else if (e->buf[i].s == DIFF_SYMBOL_PLUS) {
+ word_diff_align_add_item(&new_items, &new_items_nr,
+ &new_items_alloc, i, plus_nr++,
+ new_lineno++);
+ } else if (e->buf[i].s == DIFF_SYMBOL_CONTEXT) {
+ word_diff_align_add_item(&old_items, &old_items_nr,
+ &old_items_alloc, i, -1,
+ old_lineno++);
+ word_diff_align_add_item(&new_items, &new_items_nr,
+ &new_items_alloc, i, -1,
+ new_lineno++);
+ }
+ }
+
+ if (!minus_nr || !plus_nr)
+ goto cleanup;
+
+ CALLOC_ARRAY(old_lines, old_items_nr);
+ CALLOC_ARRAY(new_lines, new_items_nr);
+ CALLOC_ARRAY(new_contexts, new_items_nr);
+ CALLOC_ARRAY(bucket_heads, WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+ CALLOC_ARRAY(bucket_counts, WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+ CALLOC_ARRAY(bucket_next, new_items_nr * WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+ CALLOC_ARRAY(candidate_counts, new_items_nr);
+ CALLOC_ARRAY(candidate_touched, new_items_nr);
+ CALLOC_ARRAY(used_old, minus_nr);
+ CALLOC_ARRAY(used_plus, plus_nr);
+
+ for (i = 0; i < old_items_nr; i++)
+ word_diff_align_prepare_line(&e->buf[old_items[i].symbol], &old_lines[i],
+ &old_tokens, &old_tokens_nr,
+ &old_tokens_alloc);
+ for (i = 0; i < new_items_nr; i++)
+ word_diff_align_prepare_line(&e->buf[new_items[i].symbol], &new_lines[i],
+ &new_tokens, &new_tokens_nr,
+ &new_tokens_alloc);
+ for (i = 0; i < new_items_nr; i++)
+ new_contexts[i] = word_diff_align_window_bits(new_lines, new_items_nr, i);
+
+ for (i = 0; i < new_items_nr; i++) {
+ if (new_items[i].change_pos < 0)
+ continue;
+ for (j = 0; j < WORD_DIFF_ALIGN_FINGERPRINT_BITS; j++) {
+ uint64_t bit = 1ULL << j;
+
+ if (!(new_contexts[i] & bit))
+ continue;
+ bucket_counts[j]++;
+ bucket_next[i * WORD_DIFF_ALIGN_FINGERPRINT_BITS + j] = bucket_heads[j];
+ bucket_heads[j] = i + 1;
+ }
+ }
+
+ for (i = 0; i < old_items_nr; i++) {
+ int touched_nr = 0;
+ uint64_t old_context;
+
+ if (old_items[i].change_pos < 0)
+ continue;
+ old_context = word_diff_align_window_bits(old_lines, old_items_nr, i);
+ for (j = 0; j < WORD_DIFF_ALIGN_FINGERPRINT_BITS; j++) {
+ uint64_t bit = 1ULL << j;
+ int item;
+
+ if (!(old_context & bit))
+ continue;
+ if (bucket_counts[j] > WORD_DIFF_ALIGN_BIT_BUCKET_MAX)
+ continue;
+ for (item = bucket_heads[j]; item; item = bucket_next[(item - 1) * WORD_DIFF_ALIGN_FINGERPRINT_BITS + j]) {
+ int new_pos = item - 1;
+
+ if (!candidate_counts[new_pos])
+ candidate_touched[touched_nr++] = new_pos;
+ candidate_counts[new_pos]++;
+ }
+ }
+
+ for (j = 0; j < touched_nr; j++) {
+ int new_pos = candidate_touched[j];
+ int filter_score, filter_shared;
+ int window_score, window_shared;
+ int line_score, line_shared, pair_score;
+
+ candidate_counts[new_pos] = 0;
+ if (new_items[new_pos].change_pos < 0)
+ continue;
+ filter_score = word_diff_align_filter_score(old_context,
+ new_contexts[new_pos],
+ &filter_shared);
+ if (filter_score < WORD_DIFF_ALIGN_WINDOW_MIN_SCORE ||
+ filter_shared < WORD_DIFF_ALIGN_WINDOW_MIN_SHARED)
+ continue;
+ window_score = word_diff_align_window_token_score(old_lines,
+ old_items_nr,
+ i,
+ old_tokens,
+ new_lines,
+ new_items_nr,
+ new_pos,
+ new_tokens,
+ &window_shared);
+ if (window_score < WORD_DIFF_ALIGN_WINDOW_MIN_SCORE)
+ continue;
+ pair_score = word_diff_align_pair_score(window_score,
+ &old_lines[i],
+ old_lines,
+ old_items_nr,
+ i,
+ old_tokens,
+ &new_lines[new_pos],
+ new_lines,
+ new_items_nr,
+ new_pos,
+ new_tokens,
+ &line_score,
+ &line_shared);
+ if (pair_score < WORD_DIFF_ALIGN_SCORE_MIN)
+ continue;
+ word_diff_align_add_candidate(&candidates, &candidates_nr,
+ &candidates_alloc, i, new_pos,
+ pair_score, window_score,
+ window_shared, line_score,
+ line_shared);
+ }
+ }
+
+ QSORT(candidates, candidates_nr, word_diff_align_candidate_cmp);
+ for (i = 0; i < candidates_nr; i++) {
+ int old_pos = candidates[i].old_pos;
+ int new_pos = candidates[i].new_pos;
+ int minus_pos = old_items[old_pos].change_pos;
+ int plus_pos = new_items[new_pos].change_pos;
+ int old_len, new_len, edited, moved;
+
+ if (used_old[minus_pos] || used_plus[plus_pos])
+ continue;
+ used_old[minus_pos] = 1;
+ used_plus[plus_pos] = 1;
+ old_len = word_diff_align_payload_len(&e->buf[old_items[old_pos].symbol]);
+ new_len = word_diff_align_payload_len(&e->buf[new_items[new_pos].symbol]);
+ edited = old_len != new_len ||
+ memcmp(e->buf[old_items[old_pos].symbol].line,
+ e->buf[new_items[new_pos].symbol].line,
+ old_len);
+ moved = !word_diff_align_local_pair(old_pos, new_pos);
+ word_diff_align_debug_mark_pair(&e->buf[old_items[old_pos].symbol],
+ &e->buf[new_items[new_pos].symbol],
+ old_items[old_pos].line_no,
+ new_items[new_pos].line_no,
+ edited, moved,
+ candidates[i].window_score,
+ candidates[i].line_score,
+ candidates[i].score);
+ }
+
+cleanup:
+ free(old_items);
+ free(new_items);
+ free(old_lines);
+ free(new_lines);
+ free(old_tokens);
+ free(new_tokens);
+ free(new_contexts);
+ free(bucket_heads);
+ free(bucket_next);
+ free(bucket_counts);
+ free(candidate_counts);
+ free(candidate_touched);
+ free(used_old);
+ free(used_plus);
+ free(candidates);
+}
+
+static void mark_word_diff_align(struct diff_options *o)
+{
+ int i, hunk_start = -1;
+ struct emitted_diff_symbols *e = o->emitted_symbols;
+
+ for (i = 0; i < e->nr; i++) {
+ if (e->buf[i].s == DIFF_SYMBOL_CONTEXT_FRAGINFO) {
+ if (hunk_start >= 0)
+ word_diff_align_hunk(e, hunk_start, i);
+ hunk_start = i + 1;
+ } else if (hunk_start >= 0 &&
+ e->buf[i].s != DIFF_SYMBOL_CONTEXT &&
+ e->buf[i].s != DIFF_SYMBOL_CONTEXT_INCOMPLETE &&
+ e->buf[i].s != DIFF_SYMBOL_CONTEXT_MARKER &&
+ e->buf[i].s != DIFF_SYMBOL_PLUS &&
+ e->buf[i].s != DIFF_SYMBOL_MINUS) {
+ word_diff_align_hunk(e, hunk_start, i);
+ hunk_start = -1;
+ }
+ }
+
+ if (hunk_start >= 0)
+ word_diff_align_hunk(e, hunk_start, e->nr);
+}
+
static void emit_line_ws_markup(struct diff_options *o,
const char *set_sign, const char *set,
const char *reset,
@@ -5245,6 +6014,10 @@ void diff_setup_done(struct diff_options *options)
die(_("options '%s' and '%s' cannot be used together, use '%s' with '%s' and '%s'"),
"--pickaxe-all", "--find-object", "--pickaxe-all", "-G", "-S");
+ if (options->word_diff_align && options->word_diff)
+ die(_("options '%s' and '%s' cannot be used together"),
+ "--word-diff-align", "--word-diff");
+
/*
* Most of the time we can say "there are changes"
* only by checking if there are changed paths, but
@@ -5971,6 +6744,18 @@ static int diff_opt_word_diff_regex(const struct option *opt,
return 0;
}
+static int diff_opt_word_diff_align(const struct option *opt,
+ const char *arg, int unset)
+{
+ struct diff_options *options = opt->value;
+
+ BUG_ON_OPT_ARG(arg);
+ options->word_diff_align = !unset;
+ if (!unset)
+ enable_patch_output(&options->output_format);
+ return 0;
+}
+
static int diff_opt_rotate_to(const struct option *opt, const char *arg, int unset)
{
struct diff_options *options = opt->value;
@@ -6205,6 +6990,9 @@ struct option *add_diff_options(const struct option *opts,
OPT_CALLBACK_F(0, "word-diff-regex", options, N_("<regex>"),
N_("use <regex> to decide what a word is"),
PARSE_OPT_NONEG, diff_opt_word_diff_regex),
+ OPT_CALLBACK_F(0, "word-diff-align", options, NULL,
+ N_("annotate similar moved lines"),
+ PARSE_OPT_NOARG, diff_opt_word_diff_align),
OPT_CALLBACK_F(0, "color-words", options, N_("<regex>"),
N_("equivalent to --word-diff=color --word-diff-regex=<regex>"),
PARSE_OPT_NONEG | PARSE_OPT_OPTARG, diff_opt_color_words),
@@ -7076,7 +7864,7 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
if (WSEH_NEW & WS_RULE_MASK)
BUG("WS rules bit mask overlaps with diff symbol flags");
- if (o->color_moved && want_color(o->use_color))
+ if ((o->color_moved && want_color(o->use_color)) || o->word_diff_align)
o->emitted_symbols = &esm;
if (o->additional_path_headers)
@@ -7092,14 +7880,19 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
struct mem_pool entry_pool;
struct moved_entry_list *entry_list;
- mem_pool_init(&entry_pool, 1024 * 1024);
- entry_list = add_lines_to_move_detection(o, &entry_pool);
- mark_color_as_moved(o, entry_list);
- if (o->color_moved == COLOR_MOVED_ZEBRA_DIM)
- dim_moved_lines(o);
+ if (o->word_diff_align)
+ mark_word_diff_align(o);
+
+ if (o->color_moved && want_color(o->use_color)) {
+ mem_pool_init(&entry_pool, 1024 * 1024);
+ entry_list = add_lines_to_move_detection(o, &entry_pool);
+ mark_color_as_moved(o, entry_list);
+ if (o->color_moved == COLOR_MOVED_ZEBRA_DIM)
+ dim_moved_lines(o);
- mem_pool_discard(&entry_pool, 0);
- free(entry_list);
+ mem_pool_discard(&entry_pool, 0);
+ free(entry_list);
+ }
for (i = 0; i < esm.nr; i++)
emit_diff_symbol_from_struct(o, &esm.buf[i]);
diff --git a/diff.h b/diff.h
index 7eb84aadf..b9a20b7f2 100644
--- a/diff.h
+++ b/diff.h
@@ -351,6 +351,7 @@ struct diff_options {
int stat_count;
const char *word_regex;
enum diff_words_type word_diff;
+ int word_diff_align;
enum diff_submodule_format submodule_format;
struct oidset *objfind;
--
2.39.3 (Apple Git-146)
^ permalink raw reply related [flat|nested] 4+ messages in thread* [RFC PATCH 2/3] diff: render word-diff-align pairs for RFC review
2026-05-27 4:23 [RFC PATCH 0/3] diff: pair edited lines inside moved blocks Keita Oda
2026-05-27 4:24 ` [RFC PATCH 1/3] diff: add word-diff-align line pairing Keita Oda
@ 2026-05-27 4:24 ` Keita Oda
2026-05-27 4:24 ` [RFC PATCH 3/3] t4034: cover moved-and-edited word diff alignment Keita Oda
2 siblings, 0 replies; 4+ messages in thread
From: Keita Oda @ 2026-05-27 4:24 UTC (permalink / raw)
To: git; +Cc: Keita ODA
From: Keita ODA <ainsophyao@gmail•com>
Teach the RFC prototype to render selected --word-diff-align pairs with
word-diff-like markers.
This renderer is deliberately small and local to the RFC. It exists to make
the recovered line pairs inspectable in review output. It is not meant to be
the final UI. A production version should likely reuse the existing word-diff
machinery once the line-pairing question is settled.
The renderer computes a token LCS for the selected pair and marks the unmatched
spans with the familiar plain word-diff delimiters:
[-old-]
{+new+}
Moved selected pairs are also marked with DIFF_SYMBOL_MOVED_LINE so that the
current moved-line coloring can show that the pair came from a moved region.
---
diff.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 200 insertions(+), 13 deletions(-)
diff --git a/diff.c b/diff.c
index 6b8744920..8629d4670 100644
--- a/diff.c
+++ b/diff.c
@@ -1811,6 +1811,104 @@ static void word_diff_align_add_candidate(struct word_diff_align_candidate **can
candidate->line_shared = line_shared;
}
+/*
+ * RFC-only formatter for exposing the selected line pairs. The final
+ * presentation should reuse the normal word-diff machinery instead of this
+ * small debug renderer.
+ */
+struct word_diff_align_debug_token {
+ int start;
+ int end;
+};
+
+static void word_diff_align_debug_collect_tokens(const struct emitted_diff_symbol *line,
+ struct word_diff_align_debug_token **tokens,
+ int *tokens_nr, int *tokens_alloc)
+{
+ int len = word_diff_align_payload_len(line);
+ int pos = 0, start, end;
+
+ while (word_diff_align_next_token(line->line, len, &pos, &start, &end)) {
+ ALLOC_GROW(*tokens, *tokens_nr + 1, *tokens_alloc);
+ (*tokens)[*tokens_nr].start = start;
+ (*tokens)[*tokens_nr].end = end;
+ (*tokens_nr)++;
+ }
+}
+
+static void word_diff_align_debug_add_span(struct strbuf *out,
+ const char *open,
+ const char *line, int len,
+ const char *close)
+{
+ if (!len)
+ return;
+ strbuf_addstr(out, open);
+ strbuf_add(out, line, len);
+ strbuf_addstr(out, close);
+}
+
+static int word_diff_align_debug_token_eq(const struct emitted_diff_symbol *a,
+ const struct word_diff_align_debug_token *a_tok,
+ const struct emitted_diff_symbol *b,
+ const struct word_diff_align_debug_token *b_tok)
+{
+ int a_len = a_tok->end - a_tok->start;
+ int b_len = b_tok->end - b_tok->start;
+
+ return a_len == b_len &&
+ !memcmp(a->line + a_tok->start, b->line + b_tok->start, a_len);
+}
+
+static void word_diff_align_debug_rewrite_line(struct emitted_diff_symbol *line,
+ struct word_diff_align_debug_token *tokens,
+ int tokens_nr, int *match_to,
+ const struct emitted_diff_symbol *other,
+ struct word_diff_align_debug_token *other_tokens,
+ const char *open, const char *close)
+{
+ struct strbuf out = STRBUF_INIT;
+ char *old_line = (char *)line->line;
+ int payload_len = word_diff_align_payload_len(line);
+ int other_pos = 0;
+ int other_payload_len = word_diff_align_payload_len(other);
+ int pos = 0, i;
+ size_t new_len;
+
+ for (i = 0; i < tokens_nr; i++) {
+ int other_i = match_to[i];
+ int gap_len, other_gap_len;
+
+ if (other_i < 0)
+ continue;
+ gap_len = tokens[i].start - pos;
+ other_gap_len = other_tokens[other_i].start - other_pos;
+ if (gap_len == other_gap_len &&
+ !memcmp(line->line + pos, other->line + other_pos, gap_len))
+ strbuf_add(&out, line->line + pos, gap_len);
+ else
+ word_diff_align_debug_add_span(&out, open,
+ line->line + pos,
+ gap_len, close);
+ strbuf_add(&out, line->line + tokens[i].start,
+ tokens[i].end - tokens[i].start);
+ pos = tokens[i].end;
+ other_pos = other_tokens[other_i].end;
+ }
+ if (payload_len - pos == other_payload_len - other_pos &&
+ !memcmp(line->line + pos, other->line + other_pos,
+ payload_len - pos))
+ strbuf_add(&out, line->line + pos, payload_len - pos);
+ else
+ word_diff_align_debug_add_span(&out, open, line->line + pos,
+ payload_len - pos, close);
+ strbuf_add(&out, line->line + payload_len, line->len - payload_len);
+
+ line->line = strbuf_detach(&out, &new_len);
+ line->len = (int)new_len;
+ free(old_line);
+}
+
static void word_diff_align_debug_append_comment(struct emitted_diff_symbol *line,
const struct strbuf *suffix)
{
@@ -1835,25 +1933,114 @@ static void word_diff_align_debug_mark_pair(struct emitted_diff_symbol *minus_li
struct emitted_diff_symbol *plus_line,
int minus_lineno, int plus_lineno,
int changed, int moved,
- int window_score,
- int line_score,
- int pair_score)
-{
- struct strbuf suffix = STRBUF_INIT;
+ int window_score,
+ int line_score,
+ int pair_score)
+{
+ struct word_diff_align_debug_token *minus_tokens = NULL, *plus_tokens = NULL;
+ int minus_tokens_nr = 0, minus_tokens_alloc = 0;
+ int plus_tokens_nr = 0, plus_tokens_alloc = 0;
+ int *minus_match_to = NULL, *plus_match_to = NULL;
+ int *lcs = NULL;
+ struct emitted_diff_symbol minus_original = *minus_line;
+ struct emitted_diff_symbol plus_original = *plus_line;
+ int i, j, columns;
if (moved) {
minus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
plus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
}
- strbuf_addf(&suffix,
- " # aligned from %d to %d, %s, W=%d L=%d S=%d",
- minus_lineno, plus_lineno,
- changed ? "edited" : "unchanged",
- window_score, line_score, pair_score);
- word_diff_align_debug_append_comment(minus_line, &suffix);
- word_diff_align_debug_append_comment(plus_line, &suffix);
- strbuf_release(&suffix);
+ minus_original.line = xmemdupz(minus_line->line, minus_line->len);
+ plus_original.line = xmemdupz(plus_line->line, plus_line->len);
+ if (!changed)
+ goto comment;
+
+ word_diff_align_debug_collect_tokens(minus_line, &minus_tokens,
+ &minus_tokens_nr,
+ &minus_tokens_alloc);
+ word_diff_align_debug_collect_tokens(plus_line, &plus_tokens,
+ &plus_tokens_nr,
+ &plus_tokens_alloc);
+ if (!minus_tokens_nr || !plus_tokens_nr)
+ goto comment;
+
+ columns = plus_tokens_nr + 1;
+ CALLOC_ARRAY(lcs, (minus_tokens_nr + 1) * columns);
+ ALLOC_ARRAY(minus_match_to, minus_tokens_nr);
+ ALLOC_ARRAY(plus_match_to, plus_tokens_nr);
+ for (i = 0; i < minus_tokens_nr; i++)
+ minus_match_to[i] = -1;
+ for (j = 0; j < plus_tokens_nr; j++)
+ plus_match_to[j] = -1;
+
+ for (i = 1; i <= minus_tokens_nr; i++) {
+ for (j = 1; j <= plus_tokens_nr; j++) {
+ if (word_diff_align_debug_token_eq(minus_line,
+ &minus_tokens[i - 1],
+ plus_line,
+ &plus_tokens[j - 1]))
+ lcs[i * columns + j] =
+ lcs[(i - 1) * columns + j - 1] + 1;
+ else if (lcs[(i - 1) * columns + j] >
+ lcs[i * columns + j - 1])
+ lcs[i * columns + j] = lcs[(i - 1) * columns + j];
+ else
+ lcs[i * columns + j] = lcs[i * columns + j - 1];
+ }
+ }
+
+ i = minus_tokens_nr;
+ j = plus_tokens_nr;
+ while (i > 0 && j > 0) {
+ if (lcs[i * columns + j] == lcs[i * columns + j - 1]) {
+ j--;
+ } else if (lcs[i * columns + j] ==
+ lcs[(i - 1) * columns + j]) {
+ i--;
+ } else if (word_diff_align_debug_token_eq(minus_line,
+ &minus_tokens[i - 1],
+ plus_line,
+ &plus_tokens[j - 1])) {
+ minus_match_to[i - 1] = j - 1;
+ plus_match_to[j - 1] = i - 1;
+ i--;
+ j--;
+ } else {
+ BUG("word-diff-align display LCS backtrack failed");
+ }
+ }
+
+ word_diff_align_debug_rewrite_line(minus_line, minus_tokens,
+ minus_tokens_nr, minus_match_to,
+ &plus_original, plus_tokens,
+ "[-", "-]");
+ word_diff_align_debug_rewrite_line(plus_line, plus_tokens,
+ plus_tokens_nr, plus_match_to,
+ &minus_original, minus_tokens,
+ "{+", "+}");
+
+comment:
+ {
+ struct strbuf suffix = STRBUF_INIT;
+
+ strbuf_addf(&suffix,
+ " # aligned from %d to %d, %s, W=%d L=%d S=%d",
+ minus_lineno, plus_lineno,
+ changed ? "edited" : "unchanged",
+ window_score, line_score, pair_score);
+ word_diff_align_debug_append_comment(minus_line, &suffix);
+ word_diff_align_debug_append_comment(plus_line, &suffix);
+ strbuf_release(&suffix);
+ }
+
+ free((char *)minus_original.line);
+ free((char *)plus_original.line);
+ free(minus_tokens);
+ free(plus_tokens);
+ free(minus_match_to);
+ free(plus_match_to);
+ free(lcs);
}
static void word_diff_align_add_item(struct word_diff_align_item **items,
--
2.39.3 (Apple Git-146)
^ permalink raw reply related [flat|nested] 4+ messages in thread