Add full-text search indexing for word tokens and update tokenization logic

- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters.
- Extended full-text indexing to include tags and content.
This commit is contained in:
2025-10-01 15:03:41 +01:00
parent 7e6adf9fba
commit 86ac7b7897
7 changed files with 253 additions and 5 deletions

View File

@@ -153,5 +153,35 @@ func GetIndexesForEvent(ev *event.E, serial uint64) (
if err = appendIndexBytes(&idxs, kindPubkeyIndex); chk.E(err) {
return
}
// Word token indexes (from content)
if len(ev.Content) > 0 {
for _, h := range TokenHashes(ev.Content) {
w := new(Word)
w.FromWord(h) // 8-byte truncated hash
wIdx := indexes.WordEnc(w, ser)
if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
return
}
}
}
// Extend full-text search to include all fields of all tags
if ev.Tags != nil && ev.Tags.Len() > 0 {
for _, t := range *ev.Tags {
for _, field := range t.T { // include key and all values
if len(field) == 0 {
continue
}
for _, h := range TokenHashes(field) {
w := new(Word)
w.FromWord(h)
wIdx := indexes.WordEnc(w, ser)
if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
return
}
}
}
}
}
return
}