Add full-text search indexing for word tokens and update tokenization logic

- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters.
- Extended full-text indexing to include tags and content.
This commit is contained in:
2025-10-01 15:03:41 +01:00
parent 7e6adf9fba
commit 86ac7b7897
7 changed files with 253 additions and 5 deletions

View File

@@ -9,10 +9,12 @@ import (
"github.com/dgraph-io/badger/v4"
"lol.mleku.dev/chk"
"lol.mleku.dev/log"
"next.orly.dev/pkg/database/indexes"
"next.orly.dev/pkg/database/indexes/types"
"next.orly.dev/pkg/encoders/event"
"next.orly.dev/pkg/encoders/filter"
"next.orly.dev/pkg/encoders/hex"
"next.orly.dev/pkg/encoders/kind"
"next.orly.dev/pkg/encoders/tag"
)
@@ -230,10 +232,10 @@ func (d *D) SaveEvent(c context.Context, ev *event.E) (kc, vc int, err error) {
return
},
)
// log.T.F(
// "total data written: %d bytes keys %d bytes values for event ID %s", kc,
// vc, hex.Enc(ev.ID),
// )
log.T.F(
"total data written: %d bytes keys %d bytes values for event ID %s", kc,
vc, hex.Enc(ev.ID),
)
// log.T.C(
// func() string {
// return fmt.Sprintf("event:\n%s\n", ev.Serialize())