shithub: furgit

Download patch

ref: b5ccb5e2f456a7f34d39990e60662665f6ea2af7
parent: 707342f0814da2fbd7c2b71ba1e973732900229f
author: Runxi Yu <me@runxiyu.org>
date: Fri Jan 30 10:08:46 EST 2026

bloom: Add bloom filter implementation for commit-graph

--- /dev/null
+++ b/internal/bloom/bloom.go
@@ -1,0 +1,236 @@
+// Package bloom provides a bloom filter implementation used for changed-path
+// filters in Git commit graphs.
+package bloom
+
+import "encoding/binary"
+
+const (
+	// DataHeaderSize is the size of the BDAT header in commit-graph files.
+	DataHeaderSize = 3 * 4
+	// DefaultMaxChange matches Git's default max-changed-paths behavior.
+	DefaultMaxChange = 512
+)
+
+// Settings describe the changed-paths Bloom filter parameters stored in
+// commit-graph BDAT chunks.
+//
+// Obviously, they must match the repository's commit-graph settings to
+// interpret filters correctly.
+type Settings struct {
+	HashVersion    uint32
+	NumHashes      uint32
+	BitsPerEntry   uint32
+	MaxChangePaths uint32
+}
+
+// Filter represents a changed-paths Bloom filter associated with a commit.
+//
+// The filter encodes which paths changed between a commit and its first
+// parent. Paths are expected to be in Git's slash-separated form and
+// are queried using a path and its prefixes (e.g. "a/b/c", "a/b", "a").
+type Filter struct {
+	Data    []byte
+	Version uint32
+}
+
+// ParseSettings reads Bloom filter settings from a BDAT chunk header.
+func ParseSettings(bdat []byte) (*Settings, error) {
+	if len(bdat) < DataHeaderSize {
+		return nil, ErrInvalid
+	}
+	settings := &Settings{
+		HashVersion:    binary.BigEndian.Uint32(bdat[0:4]),
+		NumHashes:      binary.BigEndian.Uint32(bdat[4:8]),
+		BitsPerEntry:   binary.BigEndian.Uint32(bdat[8:12]),
+		MaxChangePaths: DefaultMaxChange,
+	}
+	return settings, nil
+}
+
+// MightContain reports whether the Bloom filter may contain the given path.
+//
+// Evaluated against the full path and each of its directory prefixes. A true
+// result indicates a possible match; false means the path definitely did not
+// change.
+func (f *Filter) MightContain(path []byte, settings *Settings) bool {
+	if f == nil || settings == nil {
+		return false
+	}
+	if len(f.Data) == 0 {
+		return false
+	}
+	keys := keyvec(path, settings)
+	for i := range keys {
+		if filterContainsKey(f, &keys[i], settings) {
+			return true
+		}
+	}
+	return false
+}
+
+type key struct {
+	hashes []uint32
+}
+
+func keyvec(path []byte, settings *Settings) []key {
+	if len(path) == 0 {
+		return nil
+	}
+	count := 1
+	for _, b := range path {
+		if b == '/' {
+			count++
+		}
+	}
+	keys := make([]key, 0, count)
+	keys = append(keys, keyFill(path, settings))
+	for i := len(path) - 1; i >= 0; i-- {
+		if path[i] == '/' {
+			keys = append(keys, keyFill(path[:i], settings))
+		}
+	}
+	return keys
+}
+
+func keyFill(path []byte, settings *Settings) key {
+	const seed0 = 0x293ae76f
+	const seed1 = 0x7e646e2c
+	var h0, h1 uint32
+	if settings.HashVersion == 2 {
+		h0 = murmur3SeededV2(seed0, path)
+		h1 = murmur3SeededV2(seed1, path)
+	} else {
+		h0 = murmur3SeededV1(seed0, path)
+		h1 = murmur3SeededV1(seed1, path)
+	}
+	hashes := make([]uint32, settings.NumHashes)
+	for i := uint32(0); i < settings.NumHashes; i++ {
+		hashes[i] = h0 + i*h1
+	}
+	return key{hashes: hashes}
+}
+
+func filterContainsKey(filter *Filter, key *key, settings *Settings) bool {
+	if filter == nil || key == nil || settings == nil {
+		return false
+	}
+	if len(filter.Data) == 0 {
+		return false
+	}
+	mod := uint64(len(filter.Data)) * 8
+	for _, h := range key.hashes {
+		idx := uint64(h) % mod
+		bytePos := idx / 8
+		bit := byte(1 << (idx & 7))
+		if filter.Data[bytePos]&bit == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func murmur3SeededV2(seed uint32, data []byte) uint32 {
+	const (
+		c1 = 0xcc9e2d51
+		c2 = 0x1b873593
+		r1 = 15
+		r2 = 13
+		m  = 5
+		n  = 0xe6546b64
+	)
+	h := seed
+	nblocks := len(data) / 4
+	for i := 0; i < nblocks; i++ {
+		k := uint32(data[4*i]) |
+			(uint32(data[4*i+1]) << 8) |
+			(uint32(data[4*i+2]) << 16) |
+			(uint32(data[4*i+3]) << 24)
+		k *= c1
+		k = (k << r1) | (k >> (32 - r1))
+		k *= c2
+
+		h ^= k
+		h = (h << r2) | (h >> (32 - r2))
+		h = h*m + n
+	}
+
+	var k1 uint32
+	tail := data[nblocks*4:]
+	switch len(tail) & 3 {
+	case 3:
+		k1 ^= uint32(tail[2]) << 16
+		fallthrough
+	case 2:
+		k1 ^= uint32(tail[1]) << 8
+		fallthrough
+	case 1:
+		k1 ^= uint32(tail[0])
+		k1 *= c1
+		k1 = (k1 << r1) | (k1 >> (32 - r1))
+		k1 *= c2
+		h ^= k1
+	}
+
+	h ^= uint32(len(data))
+	h ^= h >> 16
+	h *= 0x85ebca6b
+	h ^= h >> 13
+	h *= 0xc2b2ae35
+	h ^= h >> 16
+	return h
+}
+
+func murmur3SeededV1(seed uint32, data []byte) uint32 {
+	const (
+		c1 = 0xcc9e2d51
+		c2 = 0x1b873593
+		r1 = 15
+		r2 = 13
+		m  = 5
+		n  = 0xe6546b64
+	)
+	h := seed
+	nblocks := len(data) / 4
+	for i := 0; i < nblocks; i++ {
+		b0 := int8(data[4*i])
+		b1 := int8(data[4*i+1])
+		b2 := int8(data[4*i+2])
+		b3 := int8(data[4*i+3])
+		k := uint32(b0) |
+			(uint32(b1) << 8) |
+			(uint32(b2) << 16) |
+			(uint32(b3) << 24)
+		k *= c1
+		k = (k << r1) | (k >> (32 - r1))
+		k *= c2
+
+		h ^= k
+		h = (h << r2) | (h >> (32 - r2))
+		h = h*m + n
+	}
+
+	var k1 uint32
+	tail := data[nblocks*4:]
+	switch len(tail) & 3 {
+	case 3:
+		k1 ^= uint32(int8(tail[2])) << 16
+		fallthrough
+	case 2:
+		k1 ^= uint32(int8(tail[1])) << 8
+		fallthrough
+	case 1:
+		k1 ^= uint32(int8(tail[0]))
+		k1 *= c1
+		k1 = (k1 << r1) | (k1 >> (32 - r1))
+		k1 *= c2
+		h ^= k1
+	}
+
+	h ^= uint32(len(data))
+	h ^= h >> 16
+	h *= 0x85ebca6b
+	h ^= h >> 13
+	h *= 0xc2b2ae35
+	h ^= h >> 16
+	return h
+}
--- /dev/null
+++ b/internal/bloom/errors.go
@@ -1,0 +1,5 @@
+package bloom
+
+import "errors"
+
+var ErrInvalid = errors.New("bloom: invalid data")
--