diff --git a/pkg/database/migrations.go b/pkg/database/migrations.go index a6f390e..95a1c41 100644 --- a/pkg/database/migrations.go +++ b/pkg/database/migrations.go @@ -18,7 +18,7 @@ import ( ) const ( - currentVersion uint32 = 6 + currentVersion uint32 = 7 ) func (d *D) RunMigrations() { @@ -107,6 +107,14 @@ func (d *D) RunMigrations() { // bump to version 6 _ = d.writeVersionTag(6) } + if dbVersion < 7 { + log.I.F("migrating to version 7...") + // Rebuild word indexes with unicode normalization (small caps, fraktur → ASCII) + // This consolidates duplicate indexes from decorative unicode text + d.RebuildWordIndexesWithNormalization() + // bump to version 7 + _ = d.writeVersionTag(7) + } } // writeVersionTag writes a new version tag key to the database (no value) @@ -1018,3 +1026,56 @@ func (d *D) CleanupLegacyEventStorage() { log.I.F("legacy storage cleanup complete: removed %d evt entries, %d sev entries, reclaimed approximately %d bytes (%.2f MB)", cleanedEvt, cleanedSev, bytesReclaimed, float64(bytesReclaimed)/(1024.0*1024.0)) } + +// RebuildWordIndexesWithNormalization rebuilds all word indexes with unicode +// normalization applied. This migration: +// 1. Deletes all existing word indexes (wrd prefix) +// 2. Re-tokenizes all events with normalizeRune() applied +// 3. Creates new consolidated indexes where decorative unicode maps to ASCII +// +// After this migration, "ᴅᴇᴀᴛʜ" (small caps) and "𝔇𝔢𝔞𝔱𝔥" (fraktur) will index +// the same as "death", eliminating duplicate entries and enabling proper search. 
+func (d *D) RebuildWordIndexesWithNormalization() { + log.I.F("rebuilding word indexes with unicode normalization...") + var err error + + // Step 1: Delete all existing word indexes + var deletedCount int + if err = d.Update(func(txn *badger.Txn) error { + wrdPrf := new(bytes.Buffer) + if err = indexes.WordEnc(nil, nil).MarshalWrite(wrdPrf); chk.E(err) { + return err + } + + opts := badger.DefaultIteratorOptions + opts.Prefix = wrdPrf.Bytes() + opts.PrefetchValues = false // Keys only for deletion + + it := txn.NewIterator(opts) + defer it.Close() + + // Collect keys to delete (can't delete during iteration) + var keysToDelete [][]byte + for it.Rewind(); it.Valid(); it.Next() { + keysToDelete = append(keysToDelete, it.Item().KeyCopy(nil)) + } + + for _, key := range keysToDelete { + if err = txn.Delete(key); err == nil { + deletedCount++ + } + } + return nil + }); chk.E(err) { + log.W.F("failed to delete old word indexes: %v", err) + return + } + + log.I.F("deleted %d old word index entries", deletedCount) + + // Step 2: Rebuild word indexes from all events + // Reuse the existing UpdateWordIndexes logic which now uses normalizeRune + d.UpdateWordIndexes() + + log.I.F("word index rebuild with unicode normalization complete") +} diff --git a/pkg/database/tokenize.go b/pkg/database/tokenize.go index f501069..e013733 100644 --- a/pkg/database/tokenize.go +++ b/pkg/database/tokenize.go @@ -1,3 +1,5 @@ +//go:build !(js && wasm) + package database import ( @@ -65,7 +67,9 @@ func TokenHashes(content []byte) [][]byte { r2, size2 = utf8DecodeRuneInString(s[i:]) } if unicode.IsLetter(r2) || unicode.IsNumber(r2) { - runes = append(runes, unicode.ToLower(r2)) + // Normalize decorative unicode (small caps, fraktur) to ASCII + // before lowercasing for consistent indexing + runes = append(runes, unicode.ToLower(normalizeRune(r2))) i += size2 continue } @@ -142,18 +146,39 @@ func allAlphaNum(s string) bool { func isWordStart(r rune) bool { return unicode.IsLetter(r) || 
// isWordStart reports whether r can begin a word token: any Unicode letter or
// number.
func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }

// utf8DecodeRuneInString decodes the first UTF-8 rune from s.
// Returns the rune and the number of bytes consumed.
//
// An empty string yields (0, 0). Any malformed sequence (stray continuation
// byte, truncated or overlong encoding, invalid lead byte) yields
// (U+FFFD, 1), so the caller resynchronises on the very next byte and never
// skips over valid input that happens to follow a bad lead byte.
func utf8DecodeRuneInString(s string) (r rune, size int) {
	if len(s) == 0 {
		return 0, 0
	}
	// ASCII fast path
	if b := s[0]; b < 0x80 {
		return rune(b), 1
	}
	// Ranging over a string performs full UTF-8 validation and reports invalid
	// bytes as U+FFFD with width 1. The byte index of the second iteration is
	// therefore exactly the encoded width of the first rune.
	for i, c := range s {
		if i > 0 {
			return r, i
		}
		r = c
	}
	// s held a single rune (or a single invalid byte).
	return r, len(s)
}

// normalizeRune maps decorative unicode characters (small caps, fraktur) back to
// their ASCII equivalents for consistent word indexing. This ensures that text
// written with decorative alphabets (e.g., "ᴅᴇᴀᴛʜ" or "𝔇𝔢𝔞𝔱𝔥") indexes the same
// as regular ASCII ("death").
+// +// Character sets normalized: +// - Small Caps (used for DEATH-style text in Terry Pratchett tradition) +// - Mathematical Fraktur lowercase (𝔞-𝔷) +// - Mathematical Fraktur uppercase (𝔄-ℨ, including Letterlike Symbols block exceptions) +func normalizeRune(r rune) rune { + // Check small caps first (scattered codepoints) + if mapped, ok := smallCapsToASCII[r]; ok { + return mapped + } + + // Check fraktur lowercase: U+1D51E to U+1D537 (contiguous range) + if r >= 0x1D51E && r <= 0x1D537 { + return 'a' + (r - 0x1D51E) + } + + // Check fraktur uppercase main range: U+1D504 to U+1D51C (with gaps) + if r >= 0x1D504 && r <= 0x1D51C { + if mapped, ok := frakturUpperToASCII[r]; ok { + return mapped + } + } + + // Check fraktur uppercase exceptions from Letterlike Symbols block + if mapped, ok := frakturLetterlikeToASCII[r]; ok { + return mapped + } + + return r +} + +// smallCapsToASCII maps small capital letters to lowercase ASCII. +// These are scattered across multiple Unicode blocks (IPA Extensions, +// Phonetic Extensions, Latin Extended-D). 
//
// Q has two entries: the dedicated small capital Q (U+A7AF) and the
// o-with-ogonek that is used as a visual stand-in for it. There is no small
// capital X in Unicode, so X has no entry.
var smallCapsToASCII = map[rune]rune{
	'ᴀ': 'a', // U+1D00 LATIN LETTER SMALL CAPITAL A
	'ʙ': 'b', // U+0299 LATIN LETTER SMALL CAPITAL B
	'ᴄ': 'c', // U+1D04 LATIN LETTER SMALL CAPITAL C
	'ᴅ': 'd', // U+1D05 LATIN LETTER SMALL CAPITAL D
	'ᴇ': 'e', // U+1D07 LATIN LETTER SMALL CAPITAL E
	'ꜰ': 'f', // U+A730 LATIN LETTER SMALL CAPITAL F
	'ɢ': 'g', // U+0262 LATIN LETTER SMALL CAPITAL G
	'ʜ': 'h', // U+029C LATIN LETTER SMALL CAPITAL H
	'ɪ': 'i', // U+026A LATIN LETTER SMALL CAPITAL I
	'ᴊ': 'j', // U+1D0A LATIN LETTER SMALL CAPITAL J
	'ᴋ': 'k', // U+1D0B LATIN LETTER SMALL CAPITAL K
	'ʟ': 'l', // U+029F LATIN LETTER SMALL CAPITAL L
	'ᴍ': 'm', // U+1D0D LATIN LETTER SMALL CAPITAL M
	'ɴ': 'n', // U+0274 LATIN LETTER SMALL CAPITAL N
	'ᴏ': 'o', // U+1D0F LATIN LETTER SMALL CAPITAL O
	'ᴘ': 'p', // U+1D18 LATIN LETTER SMALL CAPITAL P
	// NOTE(review): U+01EB is also a genuine letter in some orthographies;
	// words containing it will index with q in its place.
	'ǫ': 'q', // U+01EB LATIN SMALL LETTER O WITH OGONEK (stand-in for small cap Q)
	'ʀ': 'r', // U+0280 LATIN LETTER SMALL CAPITAL R
	'ꜱ': 's', // U+A731 LATIN LETTER SMALL CAPITAL S
	'ᴛ': 't', // U+1D1B LATIN LETTER SMALL CAPITAL T
	'ᴜ': 'u', // U+1D1C LATIN LETTER SMALL CAPITAL U
	'ᴠ': 'v', // U+1D20 LATIN LETTER SMALL CAPITAL V
	'ᴡ': 'w', // U+1D21 LATIN LETTER SMALL CAPITAL W
	// Note: no small cap X exists in standard use
	'ʏ': 'y', // U+028F LATIN LETTER SMALL CAPITAL Y
	'ᴢ': 'z', // U+1D22 LATIN LETTER SMALL CAPITAL Z

	// U+A7AF LATIN LETTER SMALL CAPITAL Q (Latin Extended-D). Written as an
	// escape because few fonts render the glyph.
	'\uA7AF': 'q',
}

// frakturUpperToASCII maps Mathematical Fraktur uppercase letters to lowercase ASCII.
// The main range U+1D504-U+1D51C has gaps where C, H, I, R, Z use Letterlike Symbols.
//
// The table is generated rather than spelled out: inside U+1D504..U+1D51C the
// capitals sit at their alphabetical offset from A, and the four slots that
// would hold C, H, I and R are skipped because those letters are encoded in
// the Letterlike Symbols block instead. Z (U+1D51D) lies outside the range.
var frakturUpperToASCII = func() map[rune]rune {
	const first, last rune = 0x1D504, 0x1D51C // MATHEMATICAL FRAKTUR CAPITAL A .. Y
	table := make(map[rune]rune, 21)
	for r := first; r <= last; r++ {
		switch r {
		case 0x1D506, 0x1D50B, 0x1D50C, 0x1D515: // C, H, I, R
			continue
		}
		table[r] = 'a' + (r - first)
	}
	return table
}()

// frakturLetterlikeToASCII holds the five Fraktur capitals that are encoded
// in the Letterlike Symbols block (U+2100-U+214F) instead of Mathematical
// Alphanumeric Symbols, keyed by code point.
var frakturLetterlikeToASCII = map[rune]rune{
	0x210C: 'h', // ℌ BLACK-LETTER CAPITAL H
	0x2111: 'i', // ℑ BLACK-LETTER CAPITAL I
	0x211C: 'r', // ℜ BLACK-LETTER CAPITAL R
	0x2128: 'z', // ℨ BLACK-LETTER CAPITAL Z
	0x212D: 'c', // ℭ BLACK-LETTER CAPITAL C
}

// hasDecorativeUnicode checks if text contains any small caps or fraktur characters
// that would need normalization.
Used by migration to identify events needing re-indexing. +func hasDecorativeUnicode(s string) bool { + for _, r := range s { + // Check small caps + if _, ok := smallCapsToASCII[r]; ok { + return true + } + // Check fraktur lowercase range + if r >= 0x1D51E && r <= 0x1D537 { + return true + } + // Check fraktur uppercase range + if r >= 0x1D504 && r <= 0x1D51C { + return true + } + // Check letterlike symbols fraktur + if _, ok := frakturLetterlikeToASCII[r]; ok { + return true + } + } + return false +} diff --git a/pkg/database/unicode_normalize_test.go b/pkg/database/unicode_normalize_test.go new file mode 100644 index 0000000..164f2e4 --- /dev/null +++ b/pkg/database/unicode_normalize_test.go @@ -0,0 +1,205 @@ +//go:build !(js && wasm) + +package database + +import ( + "bytes" + "testing" +) + +func TestNormalizeRune(t *testing.T) { + tests := []struct { + name string + input rune + expected rune + }{ + // Small caps + {"small cap A", 'ᴀ', 'a'}, + {"small cap B", 'ʙ', 'b'}, + {"small cap C", 'ᴄ', 'c'}, + {"small cap D", 'ᴅ', 'd'}, + {"small cap E", 'ᴇ', 'e'}, + {"small cap F", 'ꜰ', 'f'}, + {"small cap G", 'ɢ', 'g'}, + {"small cap H", 'ʜ', 'h'}, + {"small cap I", 'ɪ', 'i'}, + {"small cap J", 'ᴊ', 'j'}, + {"small cap K", 'ᴋ', 'k'}, + {"small cap L", 'ʟ', 'l'}, + {"small cap M", 'ᴍ', 'm'}, + {"small cap N", 'ɴ', 'n'}, + {"small cap O", 'ᴏ', 'o'}, + {"small cap P", 'ᴘ', 'p'}, + {"small cap Q (ogonek)", 'ǫ', 'q'}, + {"small cap R", 'ʀ', 'r'}, + {"small cap S", 'ꜱ', 's'}, + {"small cap T", 'ᴛ', 't'}, + {"small cap U", 'ᴜ', 'u'}, + {"small cap V", 'ᴠ', 'v'}, + {"small cap W", 'ᴡ', 'w'}, + {"small cap Y", 'ʏ', 'y'}, + {"small cap Z", 'ᴢ', 'z'}, + + // Fraktur lowercase + {"fraktur lower a", '𝔞', 'a'}, + {"fraktur lower b", '𝔟', 'b'}, + {"fraktur lower c", '𝔠', 'c'}, + {"fraktur lower d", '𝔡', 'd'}, + {"fraktur lower e", '𝔢', 'e'}, + {"fraktur lower f", '𝔣', 'f'}, + {"fraktur lower g", '𝔤', 'g'}, + {"fraktur lower h", '𝔥', 'h'}, + {"fraktur lower i", '𝔦', 'i'}, + 
{"fraktur lower j", '𝔧', 'j'}, + {"fraktur lower k", '𝔨', 'k'}, + {"fraktur lower l", '𝔩', 'l'}, + {"fraktur lower m", '𝔪', 'm'}, + {"fraktur lower n", '𝔫', 'n'}, + {"fraktur lower o", '𝔬', 'o'}, + {"fraktur lower p", '𝔭', 'p'}, + {"fraktur lower q", '𝔮', 'q'}, + {"fraktur lower r", '𝔯', 'r'}, + {"fraktur lower s", '𝔰', 's'}, + {"fraktur lower t", '𝔱', 't'}, + {"fraktur lower u", '𝔲', 'u'}, + {"fraktur lower v", '𝔳', 'v'}, + {"fraktur lower w", '𝔴', 'w'}, + {"fraktur lower x", '𝔵', 'x'}, + {"fraktur lower y", '𝔶', 'y'}, + {"fraktur lower z", '𝔷', 'z'}, + + // Fraktur uppercase (main range) + {"fraktur upper A", '𝔄', 'a'}, + {"fraktur upper B", '𝔅', 'b'}, + {"fraktur upper D", '𝔇', 'd'}, + {"fraktur upper E", '𝔈', 'e'}, + {"fraktur upper F", '𝔉', 'f'}, + {"fraktur upper G", '𝔊', 'g'}, + {"fraktur upper J", '𝔍', 'j'}, + {"fraktur upper K", '𝔎', 'k'}, + {"fraktur upper L", '𝔏', 'l'}, + {"fraktur upper M", '𝔐', 'm'}, + {"fraktur upper N", '𝔑', 'n'}, + {"fraktur upper O", '𝔒', 'o'}, + {"fraktur upper P", '𝔓', 'p'}, + {"fraktur upper Q", '𝔔', 'q'}, + {"fraktur upper S", '𝔖', 's'}, + {"fraktur upper T", '𝔗', 't'}, + {"fraktur upper U", '𝔘', 'u'}, + {"fraktur upper V", '𝔙', 'v'}, + {"fraktur upper W", '𝔚', 'w'}, + {"fraktur upper X", '𝔛', 'x'}, + {"fraktur upper Y", '𝔜', 'y'}, + + // Fraktur uppercase (Letterlike Symbols block) + {"fraktur upper C (letterlike)", 'ℭ', 'c'}, + {"fraktur upper H (letterlike)", 'ℌ', 'h'}, + {"fraktur upper I (letterlike)", 'ℑ', 'i'}, + {"fraktur upper R (letterlike)", 'ℜ', 'r'}, + {"fraktur upper Z (letterlike)", 'ℨ', 'z'}, + + // Regular ASCII should pass through unchanged + {"regular lowercase a", 'a', 'a'}, + {"regular lowercase z", 'z', 'z'}, + {"regular uppercase A", 'A', 'A'}, + {"regular digit 5", '5', '5'}, + + // Other unicode should pass through unchanged + {"cyrillic д", 'д', 'д'}, + {"greek α", 'α', 'α'}, + {"emoji", '🎉', '🎉'}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := 
normalizeRune(tt.input) + if result != tt.expected { + t.Errorf("normalizeRune(%q) = %q, want %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestHasDecorativeUnicode(t *testing.T) { + tests := []struct { + name string + input string + expected bool + }{ + {"plain ASCII", "hello world", false}, + {"small caps word", "ᴅᴇᴀᴛʜ", true}, + {"fraktur lowercase", "𝔥𝔢𝔩𝔩𝔬", true}, + {"fraktur uppercase", "𝔇𝔈𝔄𝔗ℌ", true}, + {"mixed with ASCII", "hello ᴡᴏʀʟᴅ", true}, + {"single small cap", "aᴀa", true}, + {"cyrillic (no normalize)", "привет", false}, + {"empty string", "", false}, + {"letterlike fraktur C", "ℭool", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := hasDecorativeUnicode(tt.input) + if result != tt.expected { + t.Errorf("hasDecorativeUnicode(%q) = %v, want %v", tt.input, result, tt.expected) + } + }) + } +} + +func TestTokenHashesNormalization(t *testing.T) { + // All three representations should produce the same hash + ascii := TokenHashes([]byte("death")) + smallCaps := TokenHashes([]byte("ᴅᴇᴀᴛʜ")) + frakturLower := TokenHashes([]byte("𝔡𝔢𝔞𝔱𝔥")) + frakturUpper := TokenHashes([]byte("𝔇𝔈𝔄𝔗ℌ")) + + if len(ascii) != 1 { + t.Fatalf("expected 1 hash for 'death', got %d", len(ascii)) + } + if len(smallCaps) != 1 { + t.Fatalf("expected 1 hash for small caps, got %d", len(smallCaps)) + } + if len(frakturLower) != 1 { + t.Fatalf("expected 1 hash for fraktur lower, got %d", len(frakturLower)) + } + if len(frakturUpper) != 1 { + t.Fatalf("expected 1 hash for fraktur upper, got %d", len(frakturUpper)) + } + + // All should match the ASCII version + if !bytes.Equal(ascii[0], smallCaps[0]) { + t.Errorf("small caps hash differs from ASCII\nASCII: %x\nsmall caps: %x", ascii[0], smallCaps[0]) + } + if !bytes.Equal(ascii[0], frakturLower[0]) { + t.Errorf("fraktur lower hash differs from ASCII\nASCII: %x\nfraktur lower: %x", ascii[0], frakturLower[0]) + } + if !bytes.Equal(ascii[0], frakturUpper[0]) { + t.Errorf("fraktur upper 
hash differs from ASCII\nASCII: %x\nfraktur upper: %x", ascii[0], frakturUpper[0]) + } +} + +func TestTokenHashesMixedContent(t *testing.T) { + // Test that mixed content normalizes correctly + content := []byte("ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox") + hashes := TokenHashes(content) + + // Should get: "the", "quick", "brown", "fox" (4 unique words) + if len(hashes) != 4 { + t.Errorf("expected 4 hashes from mixed content, got %d", len(hashes)) + } + + // Verify "the" matches between decorated and plain + thePlain := TokenHashes([]byte("the")) + theDecorated := TokenHashes([]byte("ᴛʜᴇ")) + if !bytes.Equal(thePlain[0], theDecorated[0]) { + t.Errorf("'the' hash mismatch: plain=%x, decorated=%x", thePlain[0], theDecorated[0]) + } + + // Verify "brown" matches between decorated and plain + brownPlain := TokenHashes([]byte("brown")) + brownDecorated := TokenHashes([]byte("𝔟𝔯𝔬𝔴𝔫")) + if !bytes.Equal(brownPlain[0], brownDecorated[0]) { + t.Errorf("'brown' hash mismatch: plain=%x, decorated=%x", brownPlain[0], brownDecorated[0]) + } +} diff --git a/pkg/version/version b/pkg/version/version index fafd02c..59b6ce7 100644 --- a/pkg/version/version +++ b/pkg/version/version @@ -1 +1 @@ -v0.36.9 +v0.36.10