Files
next.orly.dev/pkg/database/unicode_normalize_test.go
mleku 0addc61549
Some checks failed
Go / build-and-release (push) Has been cancelled
Add unicode normalization for word indexing (v0.36.10)
- Add unicode_normalize.go with mappings for small caps and fraktur
- Map 77 decorative unicode characters to ASCII equivalents:
  - Small caps (25 chars): ᴅᴇᴀᴛʜ → death
  - Fraktur lowercase (26 chars): 𝔡𝔢𝔞𝔱𝔥 → death
  - Fraktur uppercase (26 chars): 𝔇𝔈𝔄𝔗ℌ → death
- Fix broken utf8DecodeRuneInString() that failed on multi-byte UTF-8
- Add migration v7 to rebuild word indexes with normalization
- Add comprehensive unit tests for all character mappings

Files modified:
- pkg/database/unicode_normalize.go: New - character mapping tables
- pkg/database/unicode_normalize_test.go: New - unit tests
- pkg/database/tokenize.go: Integrate normalizeRune(), fix UTF-8 decoder
- pkg/database/migrations.go: Add version 7 migration

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 18:53:30 +01:00

206 lines
6.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build !(js && wasm)
package database
import (
"bytes"
"testing"
)
func TestNormalizeRune(t *testing.T) {
tests := []struct {
name string
input rune
expected rune
}{
// Small caps
{"small cap A", 'ᴀ', 'a'},
{"small cap B", 'ʙ', 'b'},
{"small cap C", '', 'c'},
{"small cap D", 'ᴅ', 'd'},
{"small cap E", 'ᴇ', 'e'},
{"small cap F", 'ꜰ', 'f'},
{"small cap G", 'ɢ', 'g'},
{"small cap H", 'ʜ', 'h'},
{"small cap I", 'ɪ', 'i'},
{"small cap J", 'ᴊ', 'j'},
{"small cap K", 'ᴋ', 'k'},
{"small cap L", 'ʟ', 'l'},
{"small cap M", 'ᴍ', 'm'},
{"small cap N", 'ɴ', 'n'},
{"small cap O", '', 'o'},
{"small cap P", 'ᴘ', 'p'},
{"small cap Q (ogonek)", 'ǫ', 'q'},
{"small cap R", 'ʀ', 'r'},
{"small cap S", '', 's'},
{"small cap T", 'ᴛ', 't'},
{"small cap U", '', 'u'},
{"small cap V", '', 'v'},
{"small cap W", '', 'w'},
{"small cap Y", 'ʏ', 'y'},
{"small cap Z", '', 'z'},
// Fraktur lowercase
{"fraktur lower a", '𝔞', 'a'},
{"fraktur lower b", '𝔟', 'b'},
{"fraktur lower c", '𝔠', 'c'},
{"fraktur lower d", '𝔡', 'd'},
{"fraktur lower e", '𝔢', 'e'},
{"fraktur lower f", '𝔣', 'f'},
{"fraktur lower g", '𝔤', 'g'},
{"fraktur lower h", '𝔥', 'h'},
{"fraktur lower i", '𝔦', 'i'},
{"fraktur lower j", '𝔧', 'j'},
{"fraktur lower k", '𝔨', 'k'},
{"fraktur lower l", '𝔩', 'l'},
{"fraktur lower m", '𝔪', 'm'},
{"fraktur lower n", '𝔫', 'n'},
{"fraktur lower o", '𝔬', 'o'},
{"fraktur lower p", '𝔭', 'p'},
{"fraktur lower q", '𝔮', 'q'},
{"fraktur lower r", '𝔯', 'r'},
{"fraktur lower s", '𝔰', 's'},
{"fraktur lower t", '𝔱', 't'},
{"fraktur lower u", '𝔲', 'u'},
{"fraktur lower v", '𝔳', 'v'},
{"fraktur lower w", '𝔴', 'w'},
{"fraktur lower x", '𝔵', 'x'},
{"fraktur lower y", '𝔶', 'y'},
{"fraktur lower z", '𝔷', 'z'},
// Fraktur uppercase (main range)
{"fraktur upper A", '𝔄', 'a'},
{"fraktur upper B", '𝔅', 'b'},
{"fraktur upper D", '𝔇', 'd'},
{"fraktur upper E", '𝔈', 'e'},
{"fraktur upper F", '𝔉', 'f'},
{"fraktur upper G", '𝔊', 'g'},
{"fraktur upper J", '𝔍', 'j'},
{"fraktur upper K", '𝔎', 'k'},
{"fraktur upper L", '𝔏', 'l'},
{"fraktur upper M", '𝔐', 'm'},
{"fraktur upper N", '𝔑', 'n'},
{"fraktur upper O", '𝔒', 'o'},
{"fraktur upper P", '𝔓', 'p'},
{"fraktur upper Q", '𝔔', 'q'},
{"fraktur upper S", '𝔖', 's'},
{"fraktur upper T", '𝔗', 't'},
{"fraktur upper U", '𝔘', 'u'},
{"fraktur upper V", '𝔙', 'v'},
{"fraktur upper W", '𝔚', 'w'},
{"fraktur upper X", '𝔛', 'x'},
{"fraktur upper Y", '𝔜', 'y'},
// Fraktur uppercase (Letterlike Symbols block)
{"fraktur upper C (letterlike)", '', 'c'},
{"fraktur upper H (letterlike)", '', 'h'},
{"fraktur upper I (letterlike)", '', 'i'},
{"fraktur upper R (letterlike)", '', 'r'},
{"fraktur upper Z (letterlike)", '', 'z'},
// Regular ASCII should pass through unchanged
{"regular lowercase a", 'a', 'a'},
{"regular lowercase z", 'z', 'z'},
{"regular uppercase A", 'A', 'A'},
{"regular digit 5", '5', '5'},
// Other unicode should pass through unchanged
{"cyrillic д", 'д', 'д'},
{"greek α", 'α', 'α'},
{"emoji", '🎉', '🎉'},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := normalizeRune(tt.input)
if result != tt.expected {
t.Errorf("normalizeRune(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}
// TestHasDecorativeUnicode verifies the fast-path detector: it must report
// true only for strings containing at least one mapped decorative codepoint
// (small caps or fraktur), and false for plain ASCII and for non-mapped
// Unicode such as Cyrillic.
//
// NOTE: the letterlike-fraktur input was stripped by the web viewer's
// ambiguous-Unicode filter ("ool"); restored to "ℭool" per the case name.
func TestHasDecorativeUnicode(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected bool
	}{
		{"plain ASCII", "hello world", false},
		{"small caps word", "ᴅᴇᴀᴛʜ", true},
		{"fraktur lowercase", "𝔥𝔢𝔩𝔩𝔬", true},
		{"fraktur uppercase", "𝔇𝔈𝔄𝔗", true},
		{"mixed with ASCII", "hello ᴡᴏʀʟᴅ", true},
		{"single small cap", "aᴀa", true},
		{"cyrillic (no normalize)", "привет", false},
		{"empty string", "", false},
		{"letterlike fraktur C", "ℭool", true},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := hasDecorativeUnicode(tt.input)
			if result != tt.expected {
				t.Errorf("hasDecorativeUnicode(%q) = %v, want %v", tt.input, result, tt.expected)
			}
		})
	}
}
// TestTokenHashesNormalization verifies that the three decorative spellings
// of "death" (small caps, fraktur lowercase, fraktur uppercase) each produce
// exactly one token hash, identical to the hash of the plain ASCII word.
//
// NOTE: the fraktur-uppercase input had its ℌ stripped by the web viewer's
// ambiguous-Unicode filter (leaving four runes, "deat"); restored to
// "𝔇𝔈𝔄𝔗ℌ" so the normalized word is actually "death" — without the ℌ the
// uppercase hash could never match the ASCII one.
func TestTokenHashesNormalization(t *testing.T) {
	// All three representations should produce the same hash
	ascii := TokenHashes([]byte("death"))
	smallCaps := TokenHashes([]byte("ᴅᴇᴀᴛʜ"))
	frakturLower := TokenHashes([]byte("𝔡𝔢𝔞𝔱𝔥"))
	frakturUpper := TokenHashes([]byte("𝔇𝔈𝔄𝔗ℌ"))
	if len(ascii) != 1 {
		t.Fatalf("expected 1 hash for 'death', got %d", len(ascii))
	}
	if len(smallCaps) != 1 {
		t.Fatalf("expected 1 hash for small caps, got %d", len(smallCaps))
	}
	if len(frakturLower) != 1 {
		t.Fatalf("expected 1 hash for fraktur lower, got %d", len(frakturLower))
	}
	if len(frakturUpper) != 1 {
		t.Fatalf("expected 1 hash for fraktur upper, got %d", len(frakturUpper))
	}
	// All should match the ASCII version
	if !bytes.Equal(ascii[0], smallCaps[0]) {
		t.Errorf("small caps hash differs from ASCII\nASCII: %x\nsmall caps: %x", ascii[0], smallCaps[0])
	}
	if !bytes.Equal(ascii[0], frakturLower[0]) {
		t.Errorf("fraktur lower hash differs from ASCII\nASCII: %x\nfraktur lower: %x", ascii[0], frakturLower[0])
	}
	if !bytes.Equal(ascii[0], frakturUpper[0]) {
		t.Errorf("fraktur upper hash differs from ASCII\nASCII: %x\nfraktur upper: %x", ascii[0], frakturUpper[0])
	}
}
// TestTokenHashesMixedContent checks tokenization of content that mixes
// decorated unicode words with plain ASCII: the decorated words must
// normalize so the whole sentence yields four unique word hashes, and each
// decorated word must hash identically to its plain-ASCII spelling.
func TestTokenHashesMixedContent(t *testing.T) {
	// "ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox" normalizes to "the quick brown fox":
	// four unique words, so four hashes.
	mixed := TokenHashes([]byte("ᴛʜᴇ quick 𝔟𝔯𝔬𝔴𝔫 fox"))
	if len(mixed) != 4 {
		t.Errorf("expected 4 hashes from mixed content, got %d", len(mixed))
	}
	// Verify "the" matches between decorated and plain
	if plain, deco := TokenHashes([]byte("the")), TokenHashes([]byte("ᴛʜᴇ")); !bytes.Equal(plain[0], deco[0]) {
		t.Errorf("'the' hash mismatch: plain=%x, decorated=%x", plain[0], deco[0])
	}
	// Verify "brown" matches between decorated and plain
	if plain, deco := TokenHashes([]byte("brown")), TokenHashes([]byte("𝔟𝔯𝔬𝔴𝔫")); !bytes.Equal(plain[0], deco[0]) {
		t.Errorf("'brown' hash mismatch: plain=%x, decorated=%x", plain[0], deco[0])
	}
}