[RFC PATCH 1/3] diff: add word-diff-align line pairing

public inbox for git@vger.kernel.org 
 help / color / mirror / Atom feed

From: Keita Oda <ainsophyao@gmail•com>
To: git@vger•kernel.org
Cc: Keita ODA <ainsophyao@gmail•com>
Subject: [RFC PATCH 1/3] diff: add word-diff-align line pairing
Date: Wed, 27 May 2026 13:24:00 +0900	[thread overview]
Message-ID: <20260527042402.13607-2-ainsophyao@gmail.com> (raw)
In-Reply-To: <20260527042402.13607-1-ainsophyao@gmail.com>

From: Keita ODA <ainsophyao@gmail•com>

Add an opt-in --word-diff-align mode that post-processes emitted diff
symbols and tries to pair similar deleted and inserted lines inside each hunk.

This is not a replacement for the diff algorithm.  The normal diff is produced
first; the new code only annotates the already-emitted symbols for review.

The candidate search is intentionally approximate.  Each line is tokenized into
word-ish runs and single non-space symbols.  Each token gets a hash, each line
gets a compact 64-bit fingerprint, and each changed line also gets a small
surrounding window fingerprint.  Inserted-side windows are placed into bit
buckets, and deleted-side windows query those buckets to avoid a full all-pairs
scan in typical cases.

The fingerprint is only a retrieval index.  Candidate pairs are scored again
using token hashes:

  W = unique token overlap in the small surrounding windows
  L = center-line token LCS score
  S = W + 4L

For the center-line score, tokens repeated in the surrounding window carry less
weight.  This is a local-IDF-like approximation: in a run of "import" or
"#define" lines, the repeated keyword is weak evidence, while the identifier
specific to the center line remains strong evidence.

For this RFC, selected pairs are exposed with a debug-style "# aligned ..."
comment.  The next patch adds a small renderer to make the selected pairs more
readable.

---
 diff.c | 809 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 diff.h |   1 +
 2 files changed, 802 insertions(+), 8 deletions(-)

diff --git a/diff.c b/diff.c
index 397e38b41..6b8744920 100644
--- a/diff.c
+++ b/diff.c
@@ -46,6 +46,7 @@
 #include "setup.h"
 #include "strmap.h"
 #include "ws.h"
+#include "ewah/ewok.h"
 
 #ifdef NO_FAST_WORKING_DIRECTORY
 #define FAST_WORKING_DIRECTORY 0
@@ -1358,6 +1359,774 @@ static void dim_moved_lines(struct diff_options *o)
 	}
 }
 
+struct word_diff_align_line_info {
+	uint64_t bits;
+	int token_start;
+	int token_nr;
+};
+
+struct word_diff_align_item {
+	int symbol;
+	int change_pos;
+	int line_no;
+};
+
+struct word_diff_align_candidate {
+	int old_pos;
+	int new_pos;
+	int score;
+	int window_score;
+	int window_shared;
+	int line_score;
+	int line_shared;
+};
+
+#define WORD_DIFF_ALIGN_NUMBER_HASH 0x9e3779b9U
+
+#define WORD_DIFF_ALIGN_WINDOW_SIZE 2
+#define WORD_DIFF_ALIGN_FINGERPRINT_BITS 64
+#define WORD_DIFF_ALIGN_BIT_BUCKET_MAX 256
+#define WORD_DIFF_ALIGN_WINDOW_MIN_SCORE 35
+#define WORD_DIFF_ALIGN_WINDOW_MIN_SHARED 4
+#define WORD_DIFF_ALIGN_LINE_WEIGHT 4
+#define WORD_DIFF_ALIGN_SCORE_MIN 175
+
+static void word_diff_align_window_bounds(int nr, int pos,
+					  int *start_p, int *end_p)
+{
+	int radius = WORD_DIFF_ALIGN_WINDOW_SIZE;
+
+	*start_p = pos > radius ? pos - radius : 0;
+	*end_p = pos + radius + 1 < nr ? pos + radius + 1 : nr;
+}
+
+static int word_diff_align_payload_len(const struct emitted_diff_symbol *e)
+{
+	int len = e->len;
+
+	if (len && e->line[len - 1] == '\n')
+		len--;
+	if (len && e->line[len - 1] == '\r')
+		len--;
+	return len;
+}
+
+static int word_diff_align_word_byte(char ch)
+{
+	return isalnum((unsigned char)ch) || ch == '_';
+}
+
+static int word_diff_align_numeric_token(const char *line, int start, int end)
+{
+	int i;
+
+	for (i = start; i < end; i++)
+		if (!isdigit((unsigned char)line[i]))
+			return 0;
+	return start < end;
+}
+
+static uint64_t word_diff_align_token_bits(unsigned int hash)
+{
+	return (1ULL << (hash & 63)) | (1ULL << ((hash >> 6) & 63));
+}
+
+static int word_diff_align_next_token(const char *line, int len, int *pos,
+				      int *start_p, int *end_p)
+{
+	while (*pos < len && isspace((unsigned char)line[*pos]))
+		(*pos)++;
+	if (*pos == len)
+		return 0;
+
+	*start_p = *pos;
+	if (word_diff_align_word_byte(line[*pos]))
+		while (*pos < len && word_diff_align_word_byte(line[*pos]))
+			(*pos)++;
+	else
+		(*pos)++;
+	*end_p = *pos;
+	return 1;
+}
+
+static void word_diff_align_prepare_line(const struct emitted_diff_symbol *e,
+					 struct word_diff_align_line_info *line,
+					 unsigned int **tokens,
+					 int *tokens_nr,
+					 int *tokens_alloc)
+{
+	int len = word_diff_align_payload_len(e);
+	int pos = 0;
+	int start, end;
+
+	line->bits = 0;
+	line->token_start = *tokens_nr;
+	while (word_diff_align_next_token(e->line, len, &pos, &start, &end)) {
+		int numeric = word_diff_align_numeric_token(e->line, start, end);
+		unsigned int hash = numeric ? WORD_DIFF_ALIGN_NUMBER_HASH :
+			memhash(e->line + start, end - start);
+
+		ALLOC_GROW(*tokens, *tokens_nr + 1, *tokens_alloc);
+		(*tokens)[*tokens_nr] = hash;
+		line->bits |= word_diff_align_token_bits(hash);
+		(*tokens_nr)++;
+	}
+	line->token_nr = *tokens_nr - line->token_start;
+}
+
+static uint64_t word_diff_align_window_bits(struct word_diff_align_line_info *lines,
+					    int nr, int pos)
+{
+	uint64_t bits = 0;
+	int start, end;
+	int i;
+
+	word_diff_align_window_bounds(nr, pos, &start, &end);
+	for (i = start; i < end; i++)
+		bits |= lines[i].bits;
+	return bits;
+}
+
+static int word_diff_align_filter_score(uint64_t a, uint64_t b, int *shared_p)
+{
+	int shared = ewah_bit_popcount64(a & b);
+	int total = ewah_bit_popcount64(a | b);
+
+	*shared_p = shared;
+	return total ? shared * 100 / total : 0;
+}
+
+static int word_diff_align_line_has_token(const struct word_diff_align_line_info *line,
+					  const unsigned int *tokens,
+					  unsigned int hash);
+
+static int word_diff_align_window_has_token(const struct word_diff_align_line_info *lines,
+					    int start, int end,
+					    const unsigned int *tokens,
+					    unsigned int hash)
+{
+	int i;
+
+	for (i = start; i < end; i++)
+		if (word_diff_align_line_has_token(&lines[i], tokens, hash))
+			return 1;
+	return 0;
+}
+
+static int word_diff_align_window_seen_token(const struct word_diff_align_line_info *lines,
+					     int start, int line_pos,
+					     int token_pos,
+					     const unsigned int *tokens,
+					     unsigned int hash)
+{
+	int i;
+
+	for (i = start; i <= line_pos; i++) {
+		const struct word_diff_align_line_info *line = &lines[i];
+		int end = i == line_pos ? token_pos : line->token_nr;
+		int j;
+
+		for (j = 0; j < end; j++)
+			if (tokens[line->token_start + j] == hash)
+				return 1;
+	}
+	return 0;
+}
+
+static int word_diff_align_window_token_score(const struct word_diff_align_line_info *minus_lines,
+					      int minus_nr, int minus_pos,
+					      const unsigned int *minus_tokens,
+					      const struct word_diff_align_line_info *plus_lines,
+					      int plus_nr, int plus_pos,
+					      const unsigned int *plus_tokens,
+					      int *shared_p)
+{
+	int minus_start, minus_end, plus_start, plus_end;
+	int minus_unique = 0, plus_unique = 0, shared = 0;
+	int i;
+
+	word_diff_align_window_bounds(minus_nr, minus_pos, &minus_start,
+				      &minus_end);
+	word_diff_align_window_bounds(plus_nr, plus_pos, &plus_start,
+				      &plus_end);
+
+	for (i = minus_start; i < minus_end; i++) {
+		const struct word_diff_align_line_info *line = &minus_lines[i];
+		int j;
+
+		for (j = 0; j < line->token_nr; j++) {
+			unsigned int hash = minus_tokens[line->token_start + j];
+
+			if (word_diff_align_window_seen_token(minus_lines,
+							      minus_start, i, j,
+							      minus_tokens,
+							      hash))
+				continue;
+			minus_unique++;
+			if (word_diff_align_window_has_token(plus_lines,
+							     plus_start, plus_end,
+							     plus_tokens, hash))
+				shared++;
+		}
+	}
+
+	for (i = plus_start; i < plus_end; i++) {
+		const struct word_diff_align_line_info *line = &plus_lines[i];
+		int j;
+
+		for (j = 0; j < line->token_nr; j++) {
+			unsigned int hash = plus_tokens[line->token_start + j];
+
+			if (word_diff_align_window_seen_token(plus_lines,
+							      plus_start, i, j,
+							      plus_tokens,
+							      hash))
+				continue;
+			plus_unique++;
+		}
+	}
+
+	*shared_p = shared;
+	return minus_unique + plus_unique - shared ?
+		shared * 100 / (minus_unique + plus_unique - shared) : 0;
+}
+
+static int word_diff_align_line_has_token(const struct word_diff_align_line_info *line,
+					  const unsigned int *tokens,
+					  unsigned int hash)
+{
+	int i;
+
+	for (i = 0; i < line->token_nr; i++)
+		if (tokens[line->token_start + i] == hash)
+			return 1;
+	return 0;
+}
+
+static int word_diff_align_surrounding_line_freq(const struct word_diff_align_line_info *lines,
+						 int nr, int pos,
+						 const unsigned int *tokens,
+						 unsigned int hash)
+{
+	int radius = WORD_DIFF_ALIGN_WINDOW_SIZE;
+	int start = pos > radius ? pos - radius : 0;
+	int end = pos + radius + 1 < nr ? pos + radius + 1 : nr;
+	int i, freq = 0;
+
+	for (i = start; i < end; i++) {
+		if (i == pos)
+			continue;
+		if (word_diff_align_line_has_token(&lines[i], tokens, hash))
+			freq++;
+	}
+	return freq;
+}
+
+static int word_diff_align_shared_token_value(const struct word_diff_align_line_info *minus_lines,
+					      int minus_nr, int minus_pos,
+					      const unsigned int *minus_tokens,
+					      const struct word_diff_align_line_info *plus_lines,
+					      int plus_nr, int plus_pos,
+					      const unsigned int *plus_tokens,
+					      unsigned int hash)
+{
+	int minus_freq = word_diff_align_surrounding_line_freq(minus_lines,
+							       minus_nr,
+							       minus_pos,
+							       minus_tokens,
+							       hash);
+	int plus_freq = word_diff_align_surrounding_line_freq(plus_lines,
+							      plus_nr,
+							      plus_pos,
+							      plus_tokens,
+							      hash);
+	int freq = minus_freq > plus_freq ? minus_freq : plus_freq;
+	int value = 2 * WORD_DIFF_ALIGN_WINDOW_SIZE - freq;
+
+	return value > 0 ? value : 0;
+}
+
+static int word_diff_align_unmatched_weight(const struct word_diff_align_line_info *line,
+					    const unsigned int *tokens,
+					    const unsigned int *other_tokens,
+					    int other_start, int other_nr)
+{
+	int i, weight = 0;
+
+	for (i = 0; i < line->token_nr; i++) {
+		unsigned int hash = tokens[line->token_start + i];
+		int j, matched = 0;
+
+		for (j = 0; j < other_nr; j++) {
+			if (hash == other_tokens[other_start + j]) {
+				matched = 1;
+				break;
+			}
+		}
+		if (!matched)
+			weight++;
+	}
+	return weight;
+}
+
+static int word_diff_align_line_token_score(const struct word_diff_align_line_info *minus,
+					    const struct word_diff_align_line_info *minus_lines,
+					    int minus_nr, int minus_pos,
+					    const unsigned int *minus_tokens,
+					    const struct word_diff_align_line_info *plus,
+					    const struct word_diff_align_line_info *plus_lines,
+					    int plus_nr, int plus_pos,
+					    const unsigned int *plus_tokens,
+					    int *shared_p)
+{
+	int i, j, shared;
+	int *lcs;
+	int unmatched, total;
+
+	if (!minus->token_nr || !plus->token_nr) {
+		*shared_p = 0;
+		return 0;
+	}
+
+	CALLOC_ARRAY(lcs, plus->token_nr + 1);
+	for (i = 0; i < minus->token_nr; i++) {
+		unsigned int minus_hash = minus_tokens[minus->token_start + i];
+		int prev = 0;
+
+		for (j = 0; j < plus->token_nr; j++) {
+			unsigned int plus_hash = plus_tokens[plus->token_start + j];
+			int column = j + 1;
+			int saved;
+
+			saved = lcs[column];
+			if (minus_hash == plus_hash) {
+				int value = word_diff_align_shared_token_value(minus_lines,
+									       minus_nr,
+									       minus_pos,
+									       minus_tokens,
+									       plus_lines,
+									       plus_nr,
+									       plus_pos,
+									       plus_tokens,
+									       minus_hash);
+				int best = prev + value;
+
+				if (lcs[column] > best)
+					best = lcs[column];
+				if (lcs[column - 1] > best)
+					best = lcs[column - 1];
+				lcs[column] = best;
+			}
+			else if (lcs[column - 1] > lcs[column])
+				lcs[column] = lcs[column - 1];
+			prev = saved;
+		}
+	}
+
+	shared = lcs[plus->token_nr];
+	free(lcs);
+	unmatched = word_diff_align_unmatched_weight(minus, minus_tokens,
+						     plus_tokens,
+						     plus->token_start,
+						     plus->token_nr) +
+		word_diff_align_unmatched_weight(plus, plus_tokens,
+						 minus_tokens,
+						 minus->token_start,
+						 minus->token_nr);
+	total = 2 * shared + unmatched;
+	*shared_p = shared;
+	return total ? shared * 200 / total : 0;
+}
+
+static int word_diff_align_pair_score(int window_score,
+				      const struct word_diff_align_line_info *minus,
+				      const struct word_diff_align_line_info *minus_lines,
+				      int minus_nr, int minus_pos,
+				      const unsigned int *minus_tokens,
+				      const struct word_diff_align_line_info *plus,
+				      const struct word_diff_align_line_info *plus_lines,
+				      int plus_nr, int plus_pos,
+				      const unsigned int *plus_tokens,
+				      int *line_score_p,
+				      int *line_shared_p)
+{
+	int line_score, line_shared;
+
+	line_score = word_diff_align_line_token_score(minus, minus_lines,
+						      minus_nr, minus_pos,
+						      minus_tokens,
+						      plus, plus_lines,
+						      plus_nr, plus_pos,
+						      plus_tokens, &line_shared);
+	*line_score_p = line_score;
+	*line_shared_p = line_shared;
+
+	/*
+	 * The bit fingerprint is only a retrieval approximation.  The window
+	 * score passed here is computed from exact token hashes in the 5-line
+	 * windows, while the line score gives the center line an additional
+	 * identity bonus.
+	 */
+	return window_score + WORD_DIFF_ALIGN_LINE_WEIGHT * line_score;
+}
+
+static int word_diff_align_candidate_cmp(const void *va, const void *vb)
+{
+	const struct word_diff_align_candidate *a = va;
+	const struct word_diff_align_candidate *b = vb;
+
+	if (a->score != b->score)
+		return b->score - a->score;
+	if (a->line_score != b->line_score)
+		return b->line_score - a->line_score;
+	if (a->window_score != b->window_score)
+		return b->window_score - a->window_score;
+	if (a->line_shared != b->line_shared)
+		return b->line_shared - a->line_shared;
+	if (a->window_shared != b->window_shared)
+		return b->window_shared - a->window_shared;
+	if (a->old_pos != b->old_pos)
+		return a->old_pos - b->old_pos;
+	return a->new_pos - b->new_pos;
+}
+
+static void word_diff_align_add_candidate(struct word_diff_align_candidate **candidates,
+					  int *candidates_nr,
+					  int *candidates_alloc,
+					  int old_pos, int new_pos,
+					  int score, int window_score,
+					  int window_shared, int line_score,
+					  int line_shared)
+{
+	struct word_diff_align_candidate *candidate;
+
+	ALLOC_GROW(*candidates, *candidates_nr + 1, *candidates_alloc);
+	candidate = &(*candidates)[(*candidates_nr)++];
+	candidate->old_pos = old_pos;
+	candidate->new_pos = new_pos;
+	candidate->score = score;
+	candidate->window_score = window_score;
+	candidate->window_shared = window_shared;
+	candidate->line_score = line_score;
+	candidate->line_shared = line_shared;
+}
+
+static void word_diff_align_debug_append_comment(struct emitted_diff_symbol *line,
+						 const struct strbuf *suffix)
+{
+	struct strbuf out = STRBUF_INIT;
+	char *old_line = (char *)line->line;
+	size_t new_len;
+
+	if (line->len && line->line[line->len - 1] == '\n') {
+		strbuf_add(&out, line->line, line->len - 1);
+		strbuf_addbuf(&out, suffix);
+		strbuf_addch(&out, '\n');
+	} else {
+		strbuf_add(&out, line->line, line->len);
+		strbuf_addbuf(&out, suffix);
+	}
+	line->line = strbuf_detach(&out, &new_len);
+	line->len = (int)new_len;
+	free(old_line);
+}
+
+static void word_diff_align_debug_mark_pair(struct emitted_diff_symbol *minus_line,
+					    struct emitted_diff_symbol *plus_line,
+					    int minus_lineno, int plus_lineno,
+					    int changed, int moved,
+						    int window_score,
+						    int line_score,
+						    int pair_score)
+{
+	struct strbuf suffix = STRBUF_INIT;
+
+	if (moved) {
+		minus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
+		plus_line->flags |= DIFF_SYMBOL_MOVED_LINE;
+	}
+
+	strbuf_addf(&suffix,
+		    "        # aligned from %d to %d, %s, W=%d L=%d S=%d",
+		    minus_lineno, plus_lineno,
+		    changed ? "edited" : "unchanged",
+		    window_score, line_score, pair_score);
+	word_diff_align_debug_append_comment(minus_line, &suffix);
+	word_diff_align_debug_append_comment(plus_line, &suffix);
+	strbuf_release(&suffix);
+}
+
+static void word_diff_align_add_item(struct word_diff_align_item **items,
+				     int *items_nr, int *items_alloc,
+				     int symbol, int change_pos, int line_no)
+{
+	ALLOC_GROW(*items, *items_nr + 1, *items_alloc);
+	(*items)[*items_nr].symbol = symbol;
+	(*items)[*items_nr].change_pos = change_pos;
+	(*items)[*items_nr].line_no = line_no;
+	(*items_nr)++;
+}
+
+static void word_diff_align_hunk_start(const struct emitted_diff_symbols *e,
+				       int start, int *old_lineno,
+				       int *new_lineno)
+{
+	const char *line;
+	char *endp;
+
+	*old_lineno = 0;
+	*new_lineno = 0;
+	if (start <= 0 || e->buf[start - 1].s != DIFF_SYMBOL_CONTEXT_FRAGINFO)
+		return;
+	line = e->buf[start - 1].line;
+	if (!starts_with(line, "@@ -"))
+		return;
+	line += 4;
+	if (!isdigit((unsigned char)*line))
+		return;
+	*old_lineno = strtol(line, &endp, 10);
+	line = endp;
+	if (*line == ',') {
+		line++;
+		strtol(line, &endp, 10);
+		line = endp;
+	}
+	if (!starts_with(line, " +"))
+		return;
+	line += 2;
+	if (!isdigit((unsigned char)*line))
+		return;
+	*new_lineno = strtol(line, &endp, 10);
+}
+
+static int word_diff_align_local_pair(int old_pos, int new_pos)
+{
+	return old_pos == new_pos;
+}
+
+static void word_diff_align_hunk(struct emitted_diff_symbols *e, int start, int end)
+{
+	struct word_diff_align_item *old_items = NULL, *new_items = NULL;
+	int old_items_nr = 0, old_items_alloc = 0;
+	int new_items_nr = 0, new_items_alloc = 0;
+	int minus_nr = 0, plus_nr = 0;
+	struct word_diff_align_line_info *old_lines = NULL, *new_lines = NULL;
+	unsigned int *old_tokens = NULL, *new_tokens = NULL;
+	int old_tokens_nr = 0, old_tokens_alloc = 0;
+	int new_tokens_nr = 0, new_tokens_alloc = 0;
+	uint64_t *new_contexts = NULL;
+	int *bucket_heads = NULL, *bucket_next = NULL, *bucket_counts = NULL;
+	int *candidate_counts = NULL, *candidate_touched = NULL;
+	char *used_old = NULL, *used_plus = NULL;
+	struct word_diff_align_candidate *candidates = NULL;
+	int candidates_nr = 0, candidates_alloc = 0;
+	int old_lineno, new_lineno;
+	int i, j;
+
+	word_diff_align_hunk_start(e, start, &old_lineno, &new_lineno);
+	for (i = start; i < end; i++) {
+		if (e->buf[i].s == DIFF_SYMBOL_MINUS) {
+			word_diff_align_add_item(&old_items, &old_items_nr,
+						 &old_items_alloc, i, minus_nr++,
+						 old_lineno++);
+		} else if (e->buf[i].s == DIFF_SYMBOL_PLUS) {
+			word_diff_align_add_item(&new_items, &new_items_nr,
+						 &new_items_alloc, i, plus_nr++,
+						 new_lineno++);
+		} else if (e->buf[i].s == DIFF_SYMBOL_CONTEXT) {
+			word_diff_align_add_item(&old_items, &old_items_nr,
+						 &old_items_alloc, i, -1,
+						 old_lineno++);
+			word_diff_align_add_item(&new_items, &new_items_nr,
+						 &new_items_alloc, i, -1,
+						 new_lineno++);
+		}
+	}
+
+	if (!minus_nr || !plus_nr)
+		goto cleanup;
+
+	CALLOC_ARRAY(old_lines, old_items_nr);
+	CALLOC_ARRAY(new_lines, new_items_nr);
+	CALLOC_ARRAY(new_contexts, new_items_nr);
+	CALLOC_ARRAY(bucket_heads, WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+	CALLOC_ARRAY(bucket_counts, WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+	CALLOC_ARRAY(bucket_next, new_items_nr * WORD_DIFF_ALIGN_FINGERPRINT_BITS);
+	CALLOC_ARRAY(candidate_counts, new_items_nr);
+	CALLOC_ARRAY(candidate_touched, new_items_nr);
+	CALLOC_ARRAY(used_old, minus_nr);
+	CALLOC_ARRAY(used_plus, plus_nr);
+
+	for (i = 0; i < old_items_nr; i++)
+		word_diff_align_prepare_line(&e->buf[old_items[i].symbol], &old_lines[i],
+					     &old_tokens, &old_tokens_nr,
+					     &old_tokens_alloc);
+	for (i = 0; i < new_items_nr; i++)
+		word_diff_align_prepare_line(&e->buf[new_items[i].symbol], &new_lines[i],
+					     &new_tokens, &new_tokens_nr,
+					     &new_tokens_alloc);
+	for (i = 0; i < new_items_nr; i++)
+		new_contexts[i] = word_diff_align_window_bits(new_lines, new_items_nr, i);
+
+	for (i = 0; i < new_items_nr; i++) {
+		if (new_items[i].change_pos < 0)
+			continue;
+		for (j = 0; j < WORD_DIFF_ALIGN_FINGERPRINT_BITS; j++) {
+			uint64_t bit = 1ULL << j;
+
+			if (!(new_contexts[i] & bit))
+				continue;
+			bucket_counts[j]++;
+			bucket_next[i * WORD_DIFF_ALIGN_FINGERPRINT_BITS + j] = bucket_heads[j];
+			bucket_heads[j] = i + 1;
+		}
+	}
+
+	for (i = 0; i < old_items_nr; i++) {
+		int touched_nr = 0;
+		uint64_t old_context;
+
+		if (old_items[i].change_pos < 0)
+			continue;
+		old_context = word_diff_align_window_bits(old_lines, old_items_nr, i);
+		for (j = 0; j < WORD_DIFF_ALIGN_FINGERPRINT_BITS; j++) {
+			uint64_t bit = 1ULL << j;
+			int item;
+
+			if (!(old_context & bit))
+				continue;
+			if (bucket_counts[j] > WORD_DIFF_ALIGN_BIT_BUCKET_MAX)
+				continue;
+			for (item = bucket_heads[j]; item; item = bucket_next[(item - 1) * WORD_DIFF_ALIGN_FINGERPRINT_BITS + j]) {
+				int new_pos = item - 1;
+
+				if (!candidate_counts[new_pos])
+					candidate_touched[touched_nr++] = new_pos;
+				candidate_counts[new_pos]++;
+			}
+		}
+
+		for (j = 0; j < touched_nr; j++) {
+			int new_pos = candidate_touched[j];
+			int filter_score, filter_shared;
+			int window_score, window_shared;
+			int line_score, line_shared, pair_score;
+
+			candidate_counts[new_pos] = 0;
+			if (new_items[new_pos].change_pos < 0)
+				continue;
+			filter_score = word_diff_align_filter_score(old_context,
+								    new_contexts[new_pos],
+								    &filter_shared);
+			if (filter_score < WORD_DIFF_ALIGN_WINDOW_MIN_SCORE ||
+			    filter_shared < WORD_DIFF_ALIGN_WINDOW_MIN_SHARED)
+				continue;
+			window_score = word_diff_align_window_token_score(old_lines,
+									 old_items_nr,
+									 i,
+									 old_tokens,
+									 new_lines,
+									 new_items_nr,
+									 new_pos,
+									 new_tokens,
+									 &window_shared);
+			if (window_score < WORD_DIFF_ALIGN_WINDOW_MIN_SCORE)
+				continue;
+			pair_score = word_diff_align_pair_score(window_score,
+								&old_lines[i],
+								old_lines,
+								old_items_nr,
+								i,
+								old_tokens,
+								&new_lines[new_pos],
+								new_lines,
+								new_items_nr,
+								new_pos,
+								new_tokens,
+								&line_score,
+								&line_shared);
+			if (pair_score < WORD_DIFF_ALIGN_SCORE_MIN)
+				continue;
+			word_diff_align_add_candidate(&candidates, &candidates_nr,
+						      &candidates_alloc, i, new_pos,
+						      pair_score, window_score,
+						      window_shared, line_score,
+						      line_shared);
+		}
+	}
+
+	QSORT(candidates, candidates_nr, word_diff_align_candidate_cmp);
+	for (i = 0; i < candidates_nr; i++) {
+		int old_pos = candidates[i].old_pos;
+		int new_pos = candidates[i].new_pos;
+		int minus_pos = old_items[old_pos].change_pos;
+		int plus_pos = new_items[new_pos].change_pos;
+		int old_len, new_len, edited, moved;
+
+		if (used_old[minus_pos] || used_plus[plus_pos])
+			continue;
+		used_old[minus_pos] = 1;
+		used_plus[plus_pos] = 1;
+		old_len = word_diff_align_payload_len(&e->buf[old_items[old_pos].symbol]);
+		new_len = word_diff_align_payload_len(&e->buf[new_items[new_pos].symbol]);
+		edited = old_len != new_len ||
+			memcmp(e->buf[old_items[old_pos].symbol].line,
+			       e->buf[new_items[new_pos].symbol].line,
+			       old_len);
+		moved = !word_diff_align_local_pair(old_pos, new_pos);
+		word_diff_align_debug_mark_pair(&e->buf[old_items[old_pos].symbol],
+						&e->buf[new_items[new_pos].symbol],
+						old_items[old_pos].line_no,
+						new_items[new_pos].line_no,
+						edited, moved,
+						candidates[i].window_score,
+						candidates[i].line_score,
+						candidates[i].score);
+	}
+
+cleanup:
+	free(old_items);
+	free(new_items);
+	free(old_lines);
+	free(new_lines);
+	free(old_tokens);
+	free(new_tokens);
+	free(new_contexts);
+	free(bucket_heads);
+	free(bucket_next);
+	free(bucket_counts);
+	free(candidate_counts);
+	free(candidate_touched);
+	free(used_old);
+	free(used_plus);
+	free(candidates);
+}
+
+static void mark_word_diff_align(struct diff_options *o)
+{
+	int i, hunk_start = -1;
+	struct emitted_diff_symbols *e = o->emitted_symbols;
+
+	for (i = 0; i < e->nr; i++) {
+		if (e->buf[i].s == DIFF_SYMBOL_CONTEXT_FRAGINFO) {
+			if (hunk_start >= 0)
+				word_diff_align_hunk(e, hunk_start, i);
+			hunk_start = i + 1;
+		} else if (hunk_start >= 0 &&
+			   e->buf[i].s != DIFF_SYMBOL_CONTEXT &&
+			   e->buf[i].s != DIFF_SYMBOL_CONTEXT_INCOMPLETE &&
+			   e->buf[i].s != DIFF_SYMBOL_CONTEXT_MARKER &&
+			   e->buf[i].s != DIFF_SYMBOL_PLUS &&
+			   e->buf[i].s != DIFF_SYMBOL_MINUS) {
+			word_diff_align_hunk(e, hunk_start, i);
+			hunk_start = -1;
+		}
+	}
+
+	if (hunk_start >= 0)
+		word_diff_align_hunk(e, hunk_start, e->nr);
+}
+
 static void emit_line_ws_markup(struct diff_options *o,
 				const char *set_sign, const char *set,
 				const char *reset,
@@ -5245,6 +6014,10 @@ void diff_setup_done(struct diff_options *options)
 		die(_("options '%s' and '%s' cannot be used together, use '%s' with '%s' and '%s'"),
 			"--pickaxe-all", "--find-object", "--pickaxe-all", "-G", "-S");
 
+	if (options->word_diff_align && options->word_diff)
+		die(_("options '%s' and '%s' cannot be used together"),
+		    "--word-diff-align", "--word-diff");
+
 	/*
 	 * Most of the time we can say "there are changes"
 	 * only by checking if there are changed paths, but
@@ -5971,6 +6744,18 @@ static int diff_opt_word_diff_regex(const struct option *opt,
 	return 0;
 }
 
+static int diff_opt_word_diff_align(const struct option *opt,
+				    const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+
+	BUG_ON_OPT_ARG(arg);
+	options->word_diff_align = !unset;
+	if (!unset)
+		enable_patch_output(&options->output_format);
+	return 0;
+}
+
 static int diff_opt_rotate_to(const struct option *opt, const char *arg, int unset)
 {
 	struct diff_options *options = opt->value;
@@ -6205,6 +6990,9 @@ struct option *add_diff_options(const struct option *opts,
 		OPT_CALLBACK_F(0, "word-diff-regex", options, N_("<regex>"),
 			       N_("use <regex> to decide what a word is"),
 			       PARSE_OPT_NONEG, diff_opt_word_diff_regex),
+		OPT_CALLBACK_F(0, "word-diff-align", options, NULL,
+			       N_("annotate similar moved lines"),
+			       PARSE_OPT_NOARG, diff_opt_word_diff_align),
 		OPT_CALLBACK_F(0, "color-words", options, N_("<regex>"),
 			       N_("equivalent to --word-diff=color --word-diff-regex=<regex>"),
 			       PARSE_OPT_NONEG | PARSE_OPT_OPTARG, diff_opt_color_words),
@@ -7076,7 +7864,7 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
 	if (WSEH_NEW & WS_RULE_MASK)
 		BUG("WS rules bit mask overlaps with diff symbol flags");
 
-	if (o->color_moved && want_color(o->use_color))
+	if ((o->color_moved && want_color(o->use_color)) || o->word_diff_align)
 		o->emitted_symbols = &esm;
 
 	if (o->additional_path_headers)
@@ -7092,14 +7880,19 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
 		struct mem_pool entry_pool;
 		struct moved_entry_list *entry_list;
 
-		mem_pool_init(&entry_pool, 1024 * 1024);
-		entry_list = add_lines_to_move_detection(o, &entry_pool);
-		mark_color_as_moved(o, entry_list);
-		if (o->color_moved == COLOR_MOVED_ZEBRA_DIM)
-			dim_moved_lines(o);
+		if (o->word_diff_align)
+			mark_word_diff_align(o);
+
+		if (o->color_moved && want_color(o->use_color)) {
+			mem_pool_init(&entry_pool, 1024 * 1024);
+			entry_list = add_lines_to_move_detection(o, &entry_pool);
+			mark_color_as_moved(o, entry_list);
+			if (o->color_moved == COLOR_MOVED_ZEBRA_DIM)
+				dim_moved_lines(o);
 
-		mem_pool_discard(&entry_pool, 0);
-		free(entry_list);
+			mem_pool_discard(&entry_pool, 0);
+			free(entry_list);
+		}
 
 		for (i = 0; i < esm.nr; i++)
 			emit_diff_symbol_from_struct(o, &esm.buf[i]);
diff --git a/diff.h b/diff.h
index 7eb84aadf..b9a20b7f2 100644
--- a/diff.h
+++ b/diff.h
@@ -351,6 +351,7 @@ struct diff_options {
 	int stat_count;
 	const char *word_regex;
 	enum diff_words_type word_diff;
+	int word_diff_align;
 	enum diff_submodule_format submodule_format;
 
 	struct oidset *objfind;
-- 
2.39.3 (Apple Git-146)

next prev parent reply	other threads:[~2026-05-27  4:24 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-27  4:23 [RFC PATCH 0/3] diff: pair edited lines inside moved blocks Keita Oda
2026-05-27  4:24 ` Keita Oda [this message]
2026-05-27  4:24 ` [RFC PATCH 2/3] diff: render word-diff-align pairs for RFC review Keita Oda
2026-05-27  4:24 ` [RFC PATCH 3/3] t4034: cover moved-and-edited word diff alignment Keita Oda

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:397e38b4 dfblob:6b874492 dfblob:7eb84aad dfblob:b9a20b7f )
 OR (
bs:"[RFC PATCH 1/3] diff: add word-diff-align line pairing" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260527042402.13607-2-ainsophyao@gmail.com \
    --to=ainsophyao@gmail$(echo .)com \
    --cc=git@vger$(echo .)kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox