Add unicode normalization for word indexing (v0.36.10)

- Add unicode_normalize.go with mappings for small caps and fraktur
- Map 77 decorative unicode characters to ASCII equivalents (sketched in Go after this list):
  - Small caps (25 chars): ᴅᴇᴀᴛʜ → death
  - Fraktur lowercase (26 chars): 𝔡𝔢𝔞𝔱𝔥 → death
  - Fraktur uppercase (26 chars): 𝔇𝔈𝔄𝔗ℌ → death
- Fix broken utf8DecodeRuneInString() that failed on multi-byte UTF-8
- Add migration v7 to rebuild word indexes with normalization
- Add comprehensive unit tests for all character mappings
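
For orientation, the normalization layer amounts to a rune-to-rune lookup. The sketch below is illustrative, not the shipped unicode_normalize.go: it covers only the five letters from the examples above (the real tables map all 77 characters), and only the name normalizeRune comes from the commit message.

```go
package database

// decorativeToASCII maps decorative unicode letters (small caps, fraktur)
// to ASCII. Illustrative subset only — the real tables cover 77 characters.
var decorativeToASCII = map[rune]rune{
	// Small caps
	'ᴅ': 'd', 'ᴇ': 'e', 'ᴀ': 'a', 'ᴛ': 't', 'ʜ': 'h',
	// Fraktur lowercase (Mathematical Alphanumeric Symbols block)
	'𝔡': 'd', '𝔢': 'e', '𝔞': 'a', '𝔱': 't', '𝔥': 'h',
	// Fraktur uppercase; note ℌ lives in the Letterlike Symbols block,
	// not the mathematical block, so it must be mapped explicitly
	'𝔇': 'd', '𝔈': 'e', '𝔄': 'a', '𝔗': 't', 'ℌ': 'h',
}

// normalizeRune folds a decorative rune to its ASCII equivalent;
// unmapped runes pass through unchanged.
func normalizeRune(r rune) rune {
	if mapped, ok := decorativeToASCII[r]; ok {
		return mapped
	}
	return r
}
```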

Files modified:
- pkg/database/unicode_normalize.go: New - character mapping tables
- pkg/database/unicode_normalize_test.go: New - unit tests
- pkg/database/tokenize.go: Integrate normalizeRune(), fix UTF-8 decoder (see the tokenizer sketch after this list)
- pkg/database/migrations.go: Add version 7 migration
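
The UTF-8 decoder fix matters because small caps encode as 3-byte and fraktur mostly as 4-byte UTF-8 sequences, so a decoder that mishandles multi-byte runes never sees them as single code points. Here is a hedged sketch of a tokenizer loop combining correct decoding (via the standard library's utf8.DecodeRuneInString, rather than the project's own fixed utf8DecodeRuneInString) with normalization; tokenizeNormalized is a made-up name reusing the normalizeRune sketch above.

```go
package database

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenizeNormalized splits s into lowercase words with decorative
// unicode folded to ASCII. Sketch only; the shipped tokenize.go differs.
func tokenizeNormalized(s string) (words []string) {
	var b strings.Builder
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:]) // handles multi-byte runes
		i += size
		r = normalizeRune(r) // fold small caps / fraktur to ASCII
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			b.WriteRune(unicode.ToLower(r))
			continue
		}
		// Any non-word rune terminates the current token.
		if b.Len() > 0 {
			words = append(words, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		words = append(words, b.String())
	}
	return
}
```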

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 18:53:30 +01:00
parent 11d1b6bfd1
commit 0addc61549
5 changed files with 439 additions and 13 deletions

pkg/database/migrations.go

@@ -18,7 +18,7 @@ import (
 )
 const (
-	currentVersion uint32 = 6
+	currentVersion uint32 = 7
 )
 func (d *D) RunMigrations() {
@@ -107,6 +107,14 @@ func (d *D) RunMigrations() {
 		// bump to version 6
 		_ = d.writeVersionTag(6)
 	}
+	if dbVersion < 7 {
+		log.I.F("migrating to version 7...")
+		// Rebuild word indexes with unicode normalization (small caps, fraktur → ASCII)
+		// This consolidates duplicate indexes from decorative unicode text
+		d.RebuildWordIndexesWithNormalization()
+		// bump to version 7
+		_ = d.writeVersionTag(7)
+	}
 }
 // writeVersionTag writes a new version tag key to the database (no value)
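
The shape of the mechanism above is worth spelling out: each migration is gated on the persisted schema version and finishes by bumping the tag, so an interrupted migration simply re-runs on the next start. Below is a minimal standalone sketch of that pattern over Badger; the key name "ver", the value encoding, and all helper names are assumptions (the project instead writes bare version-tag keys with no value).

```go
// Sketch of a version-gated migration runner over Badger. Key name and
// helper signatures are illustrative assumptions, not the project's API.
package main

import (
	"encoding/binary"

	badger "github.com/dgraph-io/badger/v4"
)

func readVersion(db *badger.DB) (v uint32) {
	_ = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte("ver"))
		if err != nil {
			return err // missing key => version 0
		}
		return item.Value(func(val []byte) error {
			v = binary.BigEndian.Uint32(val)
			return nil
		})
	})
	return
}

func writeVersion(db *badger.DB, v uint32) error {
	return db.Update(func(txn *badger.Txn) error {
		buf := make([]byte, 4)
		binary.BigEndian.PutUint32(buf, v)
		return txn.Set([]byte("ver"), buf)
	})
}

// runMigrations applies each pending migration in order; migrations[i]
// brings the database to version i+1.
func runMigrations(db *badger.DB, migrations []func(*badger.DB) error) {
	for i, m := range migrations {
		target := uint32(i + 1)
		if readVersion(db) >= target {
			continue // already applied
		}
		if err := m(db); err != nil {
			return // leave the tag unchanged so the migration retries
		}
		_ = writeVersion(db, target)
	}
}
```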
@@ -1018,3 +1026,56 @@ func (d *D) CleanupLegacyEventStorage() {
 	log.I.F("legacy storage cleanup complete: removed %d evt entries, %d sev entries, reclaimed approximately %d bytes (%.2f MB)",
 		cleanedEvt, cleanedSev, bytesReclaimed, float64(bytesReclaimed)/(1024.0*1024.0))
 }
+// RebuildWordIndexesWithNormalization rebuilds all word indexes with unicode
+// normalization applied. This migration:
+// 1. Deletes all existing word indexes (wrd prefix)
+// 2. Re-tokenizes all events with normalizeRune() applied
+// 3. Creates new consolidated indexes where decorative unicode maps to ASCII
+//
+// After this migration, "ᴅᴇᴀᴛʜ" (small caps) and "𝔇𝔢𝔞𝔱𝔥" (fraktur) will index
+// the same as "death", eliminating duplicate entries and enabling proper search.
+func (d *D) RebuildWordIndexesWithNormalization() {
+	log.I.F("rebuilding word indexes with unicode normalization...")
+	var err error
+	// Step 1: Delete all existing word indexes
+	var deletedCount int
+	if err = d.Update(func(txn *badger.Txn) error {
+		wrdPrf := new(bytes.Buffer)
+		if err = indexes.WordEnc(nil, nil).MarshalWrite(wrdPrf); chk.E(err) {
+			return err
+		}
+		opts := badger.DefaultIteratorOptions
+		opts.Prefix = wrdPrf.Bytes()
+		opts.PrefetchValues = false // Keys only for deletion
+		it := txn.NewIterator(opts)
+		defer it.Close()
+		// Collect keys to delete (can't delete during iteration)
+		var keysToDelete [][]byte
+		for it.Rewind(); it.Valid(); it.Next() {
+			keysToDelete = append(keysToDelete, it.Item().KeyCopy(nil))
+		}
+		for _, key := range keysToDelete {
+			// Surface delete failures instead of silently skipping keys;
+			// very large indexes may additionally need batching to stay
+			// under Badger's transaction size limit (ErrTxnTooBig).
+			if err = txn.Delete(key); chk.E(err) {
+				return err
+			}
+			deletedCount++
+		}
+		return nil
+	}); chk.E(err) {
+		log.W.F("failed to delete old word indexes: %v", err)
+		return
+	}
+	log.I.F("deleted %d old word index entries", deletedCount)
+	// Step 2: Rebuild word indexes from all events
+	// Reuse the existing UpdateWordIndexes logic which now uses normalizeRune
+	d.UpdateWordIndexes()
+	log.I.F("word index rebuild with unicode normalization complete")
+}