Add unicode normalization for word indexing (v0.36.10)

- Add unicode_normalize.go with mappings for small caps and fraktur
- Map 77 decorative unicode characters to ASCII equivalents (sketched in Go after this list):
  - Small caps (25 chars): ᴅᴇᴀᴛʜ → death
  - Fraktur lowercase (26 chars): 𝔡𝔢𝔞𝔱𝔥 → death
  - Fraktur uppercase (26 chars): 𝔇𝔈𝔄𝔗ℌ → death
- Fix broken utf8DecodeRuneInString() that failed on multi-byte UTF-8
- Add migration v7 to rebuild word indexes with normalization
- Add comprehensive unit tests for all character mappings
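
For orientation, the normalization layer amounts to a rune-to-rune lookup. The sketch below is illustrative, not the shipped unicode_normalize.go: it covers only the five letters from the examples above (the real tables map all 77 characters), and only the name normalizeRune comes from the commit message.

```go
package database

// decorativeToASCII maps decorative unicode letters (small caps, fraktur)
// to ASCII. Illustrative subset only — the real tables cover 77 characters.
var decorativeToASCII = map[rune]rune{
	// Small caps
	'ᴅ': 'd', 'ᴇ': 'e', 'ᴀ': 'a', 'ᴛ': 't', 'ʜ': 'h',
	// Fraktur lowercase (Mathematical Alphanumeric Symbols block)
	'𝔡': 'd', '𝔢': 'e', '𝔞': 'a', '𝔱': 't', '𝔥': 'h',
	// Fraktur uppercase; note ℌ lives in the Letterlike Symbols block,
	// not the mathematical block, so it must be mapped explicitly
	'𝔇': 'd', '𝔈': 'e', '𝔄': 'a', '𝔗': 't', 'ℌ': 'h',
}

// normalizeRune folds a decorative rune to its ASCII equivalent;
// unmapped runes pass through unchanged.
func normalizeRune(r rune) rune {
	if mapped, ok := decorativeToASCII[r]; ok {
		return mapped
	}
	return r
}
```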

Files modified:
- pkg/database/unicode_normalize.go: New - character mapping tables
- pkg/database/unicode_normalize_test.go: New - unit tests
- pkg/database/tokenize.go: Integrate normalizeRune(), fix UTF-8 decoder (see the tokenizer sketch after this list)
- pkg/database/migrations.go: Add version 7 migration
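
The UTF-8 decoder fix matters because small caps encode as 3-byte and fraktur mostly as 4-byte UTF-8 sequences, so a decoder that mishandles multi-byte runes never sees them as single code points. Here is a hedged sketch of a tokenizer loop combining correct decoding (via the standard library's utf8.DecodeRuneInString, rather than the project's own fixed utf8DecodeRuneInString) with normalization; tokenizeNormalized is a made-up name reusing the normalizeRune sketch above.

```go
package database

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenizeNormalized splits s into lowercase words with decorative
// unicode folded to ASCII. Sketch only; the shipped tokenize.go differs.
func tokenizeNormalized(s string) (words []string) {
	var b strings.Builder
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:]) // handles multi-byte runes
		i += size
		r = normalizeRune(r) // fold small caps / fraktur to ASCII
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			b.WriteRune(unicode.ToLower(r))
			continue
		}
		// Any non-word rune terminates the current token.
		if b.Len() > 0 {
			words = append(words, b.String())
			b.Reset()
		}
	}
	if b.Len() > 0 {
		words = append(words, b.String())
	}
	return
}
```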

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 18:53:30 +01:00
parent 11d1b6bfd1
commit 0addc61549
5 changed files with 439 additions and 13 deletions

pkg/database/migrations.go

@@ -18,7 +18,7 @@ import (
 )
 const (
-	currentVersion uint32 = 6
+	currentVersion uint32 = 7
 )
 func (d *D) RunMigrations() {
@@ -107,6 +107,14 @@ func (d *D) RunMigrations() {
 		// bump to version 6
 		_ = d.writeVersionTag(6)
 	}
+	if dbVersion < 7 {
+		log.I.F("migrating to version 7...")
+		// Rebuild word indexes with unicode normalization (small caps, fraktur → ASCII)
+		// This consolidates duplicate indexes from decorative unicode text
+		d.RebuildWordIndexesWithNormalization()
+		// bump to version 7
+		_ = d.writeVersionTag(7)
+	}
 }
 // writeVersionTag writes a new version tag key to the database (no value)
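
The shape of the mechanism above is worth spelling out: each migration is gated on the persisted schema version and finishes by bumping the tag, so an interrupted migration simply re-runs on the next start. Below is a minimal standalone sketch of that pattern over Badger; the key name "ver", the value encoding, and all helper names are assumptions (the project instead writes bare version-tag keys with no value).

```go
// Sketch of a version-gated migration runner over Badger. Key name and
// helper signatures are illustrative assumptions, not the project's API.
package main

import (
	"encoding/binary"

	badger "github.com/dgraph-io/badger/v4"
)

func readVersion(db *badger.DB) (v uint32) {
	_ = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte("ver"))
		if err != nil {
			return err // missing key => version 0
		}
		return item.Value(func(val []byte) error {
			v = binary.BigEndian.Uint32(val)
			return nil
		})
	})
	return
}

func writeVersion(db *badger.DB, v uint32) error {
	return db.Update(func(txn *badger.Txn) error {
		buf := make([]byte, 4)
		binary.BigEndian.PutUint32(buf, v)
		return txn.Set([]byte("ver"), buf)
	})
}

// runMigrations applies each pending migration in order; migrations[i]
// brings the database to version i+1.
func runMigrations(db *badger.DB, migrations []func(*badger.DB) error) {
	for i, m := range migrations {
		target := uint32(i + 1)
		if readVersion(db) >= target {
			continue // already applied
		}
		if err := m(db); err != nil {
			return // leave the tag unchanged so the migration retries
		}
		_ = writeVersion(db, target)
	}
}
```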
@@ -1018,3 +1026,56 @@ func (d *D) CleanupLegacyEventStorage() {
 	log.I.F("legacy storage cleanup complete: removed %d evt entries, %d sev entries, reclaimed approximately %d bytes (%.2f MB)",
 		cleanedEvt, cleanedSev, bytesReclaimed, float64(bytesReclaimed)/(1024.0*1024.0))
 }
+// RebuildWordIndexesWithNormalization rebuilds all word indexes with unicode
+// normalization applied. This migration:
+// 1. Deletes all existing word indexes (wrd prefix)
+// 2. Re-tokenizes all events with normalizeRune() applied
+// 3. Creates new consolidated indexes where decorative unicode maps to ASCII
+//
+// After this migration, "ᴅᴇᴀᴛʜ" (small caps) and "𝔇𝔢𝔞𝔱𝔥" (fraktur) will index
+// the same as "death", eliminating duplicate entries and enabling proper search.
+func (d *D) RebuildWordIndexesWithNormalization() {
+	log.I.F("rebuilding word indexes with unicode normalization...")
+	var err error
+	// Step 1: Delete all existing word indexes
+	var deletedCount int
+	if err = d.Update(func(txn *badger.Txn) error {
+		wrdPrf := new(bytes.Buffer)
+		if err = indexes.WordEnc(nil, nil).MarshalWrite(wrdPrf); chk.E(err) {
+			return err
+		}
+		opts := badger.DefaultIteratorOptions
+		opts.Prefix = wrdPrf.Bytes()
+		opts.PrefetchValues = false // Keys only for deletion
+		it := txn.NewIterator(opts)
+		defer it.Close()
+		// Collect keys to delete (can't delete during iteration)
+		var keysToDelete [][]byte
+		for it.Rewind(); it.Valid(); it.Next() {
+			keysToDelete = append(keysToDelete, it.Item().KeyCopy(nil))
+		}
+		for _, key := range keysToDelete {
+			// Surface delete failures instead of silently skipping keys;
+			// very large indexes may additionally need batching to stay
+			// under Badger's transaction size limit (ErrTxnTooBig).
+			if err = txn.Delete(key); chk.E(err) {
+				return err
+			}
+			deletedCount++
+		}
+		return nil
+	}); chk.E(err) {
+		log.W.F("failed to delete old word indexes: %v", err)
+		return
+	}
+	log.I.F("deleted %d old word index entries", deletedCount)
+	// Step 2: Rebuild word indexes from all events
+	// Reuse the existing UpdateWordIndexes logic which now uses normalizeRune
+	d.UpdateWordIndexes()
+	log.I.F("word index rebuild with unicode normalization complete")
+}