shithub: furgit

Download patch

ref: b01bc1a344c47ded15342f7872832daa1bf5cfce
parent: d8b384519a05136bb6f311c18cd05c2e4c494416
author: Runxi Yu <me@runxiyu.org>
date: Sat Feb 21 08:25:21 EST 2026

diff/lines: Line-based diffs via Myers

--- /dev/null
+++ b/diff/lines/diff.go
@@ -1,0 +1,223 @@
+// Package lines provides routines to perform line-based diffs.
+package lines
+
+import "bytes"
+
+// Diff computes a line-based diff from oldB to newB with Myers' algorithm.
+// Lines are bytes up to and including '\n' (final line may lack '\n').
+//
+// Adjacent edits of the same kind are merged into one Chunk. Chunk.Data
+// aliases oldB or newB with its capacity capped, so appending to it never
+// writes into the inputs. Both inputs empty yields nil; the error is always nil.
+func Diff(oldB, newB []byte) ([]Chunk, error) {
+	type lineRef struct {
+		base  []byte
+		start int
+		end   int
+	}
+
+	split := func(b []byte) []lineRef {
+		if len(b) == 0 {
+			return nil
+		}
+		var res []lineRef
+		start := 0
+		for i := range b {
+			if b[i] == '\n' {
+				res = append(res, lineRef{base: b, start: start, end: i + 1})
+				start = i + 1
+			}
+		}
+		if start < len(b) {
+			res = append(res, lineRef{base: b, start: start, end: len(b)})
+		}
+		return res
+	}
+
+	oldLines := split(oldB)
+	newLines := split(newB)
+
+	n := len(oldLines)
+	m := len(newLines)
+	if n == 0 && m == 0 {
+		return nil, nil
+	}
+
+	// Intern line contents as small integers so the inner loops compare ints.
+	idOf := make(map[string]int)
+	intern := func(lines []lineRef) []int {
+		ids := make([]int, len(lines))
+		for i, ln := range lines {
+			content := ln.base[ln.start:ln.end]
+			id, ok := idOf[string(content)] // lookup form avoids a copy
+			if !ok {
+				id = len(idOf)
+				idOf[string(content)] = id
+			}
+			ids[i] = id
+		}
+		return ids
+	}
+	oldIDs := intern(oldLines)
+	newIDs := intern(newLines)
+
+	// maxD bounds the edit distance; diagonal k lives at index offset+k.
+	maxD := n + m
+	offset := maxD
+	trace := make([][]int, 0, maxD+1)
+
+	Vprev := make([]int, 2*maxD+1)
+	for i := range Vprev {
+		Vprev[i] = -1
+	}
+
+	// D = 0: follow the common prefix along diagonal 0.
+	x0 := 0
+	y0 := 0
+	for x0 < n && y0 < m && oldIDs[x0] == newIDs[y0] {
+		x0++
+		y0++
+	}
+	Vprev[offset+0] = x0
+	trace = append(trace, append([]int(nil), Vprev...))
+
+	found := x0 >= n && y0 >= m
+
+	// Forward pass: V[offset+k] is the furthest x reached on diagonal k with
+	// D edits; every V is kept in trace for the backtrack below.
+	for D := 1; D <= maxD && !found; D++ {
+		V := make([]int, 2*maxD+1)
+		for i := range V {
+			V[i] = -1
+		}
+
+		for k := -D; k <= D; k += 2 {
+			var x int
+			if k == -D || (k != D && Vprev[offset+(k-1)] < Vprev[offset+(k+1)]) {
+				x = Vprev[offset+(k+1)] // step down: insert a new line
+			} else {
+				x = Vprev[offset+(k-1)] + 1 // step right: delete an old line
+			}
+			y := x - k
+
+			for x < n && y < m && oldIDs[x] == newIDs[y] {
+				x++
+				y++
+			}
+			V[offset+k] = x
+
+			if x >= n && y >= m {
+				trace = append(trace, V)
+				found = true
+				break
+			}
+		}
+
+		if !found {
+			trace = append(trace, V)
+			Vprev = V
+		}
+	}
+
+	type edit struct {
+		kind    ChunkKind
+		lineref lineRef
+	}
+	revEdits := make([]edit, 0, n+m)
+
+	// Backtrack from (n, m) to (0, 0), collecting per-line edits in reverse.
+	x := n
+	y := m
+	for D := len(trace) - 1; D >= 0; D-- {
+		k := x - y
+
+		// Zero values are the origin, which is where the D == 0 snake starts.
+		var prevK, prevX, prevY int
+		if D > 0 {
+			prevV := trace[D-1]
+			if k == -D || (k != D && prevV[offset+(k-1)] < prevV[offset+(k+1)]) {
+				prevK = k + 1
+			} else {
+				prevK = k - 1
+			}
+			prevX = prevV[offset+prevK]
+			prevY = prevX - prevK
+		}
+
+		for x > prevX && y > prevY {
+			x--
+			y--
+			revEdits = append(revEdits, edit{kind: ChunkKindUnchanged, lineref: oldLines[x]})
+		}
+
+		if D == 0 {
+			break
+		}
+
+		if x == prevX {
+			y--
+			revEdits = append(revEdits, edit{kind: ChunkKindAdded, lineref: newLines[y]})
+		} else {
+			x--
+			revEdits = append(revEdits, edit{kind: ChunkKindDeleted, lineref: oldLines[x]})
+		}
+	}
+
+	for i, j := 0, len(revEdits)-1; i < j; i, j = i+1, j-1 {
+		revEdits[i], revEdits[j] = revEdits[j], revEdits[i]
+	}
+
+	// Merge runs of same-kind edits. metas[i] records the span of out[i] inside
+	// its source buffer while out[i].Data is still a plain sub-slice of it.
+	var out []Chunk
+	var metas []lineRef
+
+	for _, e := range revEdits {
+		curBase := e.lineref.base
+		curStart := e.lineref.start
+		curEnd := e.lineref.end
+
+		if len(out) == 0 || out[len(out)-1].Kind != e.kind {
+			// Cap capacity at the line end so callers cannot append into the input.
+			out = append(out, Chunk{Kind: e.kind, Data: curBase[curStart:curEnd:curEnd]})
+			metas = append(metas, e.lineref)
+			continue
+		}
+
+		lastIdx := len(out) - 1
+		lastMeta := metas[lastIdx]
+
+		if lastMeta.end == curStart && bytes.Equal(lastMeta.base, curBase) {
+			// Contiguous in the same buffer: widen the sub-slice, no copy.
+			metas[lastIdx].end = curEnd
+			out[lastIdx].Data = curBase[lastMeta.start:curEnd:curEnd]
+			continue
+		}
+
+		// Not contiguous: the capped capacity forces append to copy, leaving
+		// the input buffers untouched.
+		out[lastIdx].Data = append(out[lastIdx].Data, curBase[curStart:curEnd]...)
+		metas[lastIdx] = lineRef{}
+	}
+
+	return out, nil
+}
+
+// Chunk represents a contiguous run of lines that are all unchanged,
+// all deleted, or all added.
+type Chunk struct {
+	Kind ChunkKind // how this run differs between the old and new input
+	Data []byte    // raw bytes of the run's lines
+}
+
+// ChunkKind identifies how a Chunk relates the old input to the new one.
+type ChunkKind int
+
+const (
+	// ChunkKindUnchanged marks lines kept as-is; it is the zero value.
+	ChunkKindUnchanged ChunkKind = iota
+	// ChunkKindDeleted marks lines removed from the old input.
+	ChunkKindDeleted
+	// ChunkKindAdded marks lines inserted by the new input.
+	ChunkKindAdded
+)
--- /dev/null
+++ b/diff/lines/diff_test.go
@@ -1,0 +1,326 @@
+package lines
+
+import (
+	"bytes"
+	"strconv"
+	"strings"
+	"testing"
+)
+
+func TestDiff(t *testing.T) { // table-driven; every case runs as a parallel subtest
+	t.Parallel()
+
+	tests := []struct {
+		name     string  // subtest name passed to t.Run
+		oldInput string  // "before" text handed to Diff
+		newInput string  // "after" text handed to Diff
+		expected []Chunk // chunks Diff must return, in order
+	}{
+		{
+			name:     "empty inputs produce no chunks",
+			oldInput: "",
+			newInput: "",
+			expected: []Chunk{}, // Diff returns nil here; only the length is compared
+		},
+		{
+			name:     "only additions",
+			oldInput: "",
+			newInput: "alpha\nbeta\n",
+			expected: []Chunk{
+				{Kind: ChunkKindAdded, Data: []byte("alpha\nbeta\n")},
+			},
+		},
+		{
+			name:     "only deletions",
+			oldInput: "alpha\nbeta\n",
+			newInput: "",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("alpha\nbeta\n")},
+			},
+		},
+		{
+			name:     "unchanged content is grouped",
+			oldInput: "same\nlines\n",
+			newInput: "same\nlines\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("same\nlines\n")},
+			},
+		},
+		{
+			name:     "insertion in the middle",
+			oldInput: "a\nb\nc\n",
+			newInput: "a\nb\nX\nc\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("a\nb\n")},
+				{Kind: ChunkKindAdded, Data: []byte("X\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("c\n")},
+			},
+		},
+		{
+			name:     "replacement without trailing newline",
+			oldInput: "first\nsecond",
+			newInput: "first\nsecond\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("first\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("second")},
+				{Kind: ChunkKindAdded, Data: []byte("second\n")},
+			},
+		},
+		{
+			name:     "line replacement",
+			oldInput: "a\nb\nc\n",
+			newInput: "a\nB\nc\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("a\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("b\n")},
+				{Kind: ChunkKindAdded, Data: []byte("B\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("c\n")},
+			},
+		},
+		{
+			name:     "swap adjacent lines",
+			oldInput: "A\nB\n",
+			newInput: "B\nA\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("A\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("B\n")},
+				{Kind: ChunkKindAdded, Data: []byte("A\n")},
+			},
+		},
+		{
+			name:     "indentation change is a full line replacement",
+			oldInput: "func main() {\n\treturn\n}\n",
+			newInput: "func main() {\n    return\n}\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("func main() {\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("\treturn\n")},
+				{Kind: ChunkKindAdded, Data: []byte("    return\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("}\n")},
+			},
+		},
+		{
+			name:     "commenting out lines",
+			oldInput: "code\n",
+			newInput: "// code\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("code\n")},
+				{Kind: ChunkKindAdded, Data: []byte("// code\n")},
+			},
+		},
+		{
+			name:     "reducing repeating lines",
+			oldInput: "log\nlog\nlog\n",
+			newInput: "log\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("log\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("log\nlog\n")},
+			},
+		},
+		{
+			name:     "expanding repeating lines",
+			oldInput: "tick\n",
+			newInput: "tick\ntick\ntick\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("tick\n")},
+				{Kind: ChunkKindAdded, Data: []byte("tick\ntick\n")},
+			},
+		},
+		{
+			name:     "interleaved modifications",
+			oldInput: "keep\nchange\nkeep\nchange\n",
+			newInput: "keep\nfixed\nkeep\nfixed\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("keep\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("change\n")},
+				{Kind: ChunkKindAdded, Data: []byte("fixed\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("keep\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("change\n")},
+				{Kind: ChunkKindAdded, Data: []byte("fixed\n")},
+			},
+		},
+		{
+			name:     "large common header and footer",
+			oldInput: "header\nheader\nheader\nOLD\nfooter\nfooter\n",
+			newInput: "header\nheader\nheader\nNEW\nfooter\nfooter\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("header\nheader\nheader\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("OLD\n")},
+				{Kind: ChunkKindAdded, Data: []byte("NEW\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("footer\nfooter\n")},
+			},
+		},
+		{
+			name:     "completely different content",
+			oldInput: "apple\nbanana\n",
+			newInput: "cherry\ndate\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("apple\nbanana\n")},
+				{Kind: ChunkKindAdded, Data: []byte("cherry\ndate\n")},
+			},
+		},
+		{
+			name:     "unicode and emoji changes",
+			oldInput: "Hello 🌍\nYay\n",
+			newInput: "Hello 🌎\nYay\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("Hello 🌍\n")},
+				{Kind: ChunkKindAdded, Data: []byte("Hello 🌎\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("Yay\n")},
+			},
+		},
+		{
+			name:     "binary data with embedded newlines",
+			oldInput: "\x00\x01\n\x02\x03\n",
+			newInput: "\x00\x01\n\x02\xFF\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("\x00\x01\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("\x02\x03\n")},
+				{Kind: ChunkKindAdded, Data: []byte("\x02\xFF\n")},
+			},
+		},
+		{
+			name:     "adding trailing newline to last line",
+			oldInput: "Line 1\nLine 2",
+			newInput: "Line 1\nLine 2\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("Line 1\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("Line 2")},
+				{Kind: ChunkKindAdded, Data: []byte("Line 2\n")},
+			},
+		},
+		{
+			name:     "removing trailing newline",
+			oldInput: "A\nB\n",
+			newInput: "A\nB",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("A\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("B\n")},
+				{Kind: ChunkKindAdded, Data: []byte("B")},
+			},
+		},
+		{
+			name:     "inserting blank lines",
+			oldInput: "A\nB\n",
+			newInput: "A\n\n\nB\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("A\n")},
+				{Kind: ChunkKindAdded, Data: []byte("\n\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("B\n")},
+			},
+		},
+		{
+			name:     "collapsing blank lines",
+			oldInput: "A\n\n\n\nB\n",
+			newInput: "A\nB\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("A\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("\n\n\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("B\n")},
+			},
+		},
+		{
+			name:     "case sensitivity check",
+			oldInput: "FOO\nbar\n",
+			newInput: "foo\nbar\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("FOO\n")},
+				{Kind: ChunkKindAdded, Data: []byte("foo\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("bar\n")},
+			},
+		},
+		{
+			name:     "partial line match is full mismatch",
+			oldInput: "The quick brown fox\n",
+			newInput: "The quick brown fox jumps\n",
+			expected: []Chunk{
+				{Kind: ChunkKindDeleted, Data: []byte("The quick brown fox\n")},
+				{Kind: ChunkKindAdded, Data: []byte("The quick brown fox jumps\n")},
+			},
+		},
+		{
+			name:     "inserting middle content",
+			oldInput: "Top\nBottom\n",
+			newInput: "Top\nMiddle\nBottom\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("Top\n")},
+				{Kind: ChunkKindAdded, Data: []byte("Middle\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("Bottom\n")},
+			},
+		},
+		{
+			name:     "block move simulated",
+			oldInput: "BlockA\nBlockB\nBlockC\n",
+			newInput: "BlockA\nBlockC\nBlockB\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("BlockA\n")},
+				{Kind: ChunkKindDeleted, Data: []byte("BlockB\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("BlockC\n")},
+				{Kind: ChunkKindAdded, Data: []byte("BlockB\n")},
+			},
+		},
+		{
+			name:     "alternating additions",
+			oldInput: "A\nB\nC\n",
+			newInput: "A\n1\nB\n2\nC\n",
+			expected: []Chunk{
+				{Kind: ChunkKindUnchanged, Data: []byte("A\n")},
+				{Kind: ChunkKindAdded, Data: []byte("1\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("B\n")},
+				{Kind: ChunkKindAdded, Data: []byte("2\n")},
+				{Kind: ChunkKindUnchanged, Data: []byte("C\n")},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+
+			chunks, err := Diff([]byte(tt.oldInput), []byte(tt.newInput))
+			if err != nil {
+				t.Fatalf("Diff returned error: %v", err)
+			}
+
+			if len(chunks) != len(tt.expected) { // guard: the loop below indexes chunks[i]
+				t.Fatalf("expected %d chunks, got %d: %s", len(tt.expected), len(chunks), formatChunks(chunks))
+			}
+
+			for i := range tt.expected {
+				if chunks[i].Kind != tt.expected[i].Kind {
+					t.Fatalf("chunk %d kind mismatch: got %v, want %v; chunks: %s", i, chunks[i].Kind, tt.expected[i].Kind, formatChunks(chunks))
+				}
+				if !bytes.Equal(chunks[i].Data, tt.expected[i].Data) {
+					t.Fatalf("chunk %d data mismatch: got %q, want %q; chunks: %s", i, string(chunks[i].Data), string(tt.expected[i].Data), formatChunks(chunks))
+				}
+			}
+		})
+	}
+}
+
+// formatChunks renders chunks for test failure messages.
+//
+// The result is a bracketed, comma-separated list in which every chunk
+// appears as its one-letter kind tag (see chunkKindName), a colon, and the
+// chunk data quoted with strconv.Quote so that newlines and non-printable
+// bytes stay visible, e.g. [U:"a\n", A:"X\n"].
+func formatChunks(chunks []Chunk) string {
+	rendered := make([]string, 0, len(chunks))
+	for _, c := range chunks {
+		entry := chunkKindName(c.Kind) + ":" + strconv.Quote(string(c.Data))
+		rendered = append(rendered, entry)
+	}
+	return "[" + strings.Join(rendered, ", ") + "]"
+}
+
+// chunkKindName maps a ChunkKind to a one-letter tag ("?" if unrecognized).
+func chunkKindName(kind ChunkKind) string {
+	names := map[ChunkKind]string{
+		ChunkKindUnchanged: "U",
+		ChunkKindDeleted:   "D",
+		ChunkKindAdded:     "A",
+	}
+	if name, ok := names[kind]; ok {
+		return name
+	}
+	return "?"
+}
--