Add full-text search indexing for word tokens and update tokenization logic
- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters (see the sketch below).
- Extended full-text indexing to include tags and content.
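For context, the tokenizer described above lower-cases input, splits on word boundaries, and drops tokens that are unlikely to be useful search terms. A minimal sketch of that approach in Go follows; the helper names and exact filter rules are illustrative assumptions, not the commit's actual implementation:

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// tokenize is an illustrative sketch of a Unicode-aware, case-insensitive
// tokenizer: it lower-cases the input, skips URLs and mentions, splits the
// rest on non-letter/non-digit runes, and drops long hex strings such as
// event ids and pubkeys. The commit's real extraction logic may differ.
func tokenize(content string) (words []string) {
	for _, field := range strings.Fields(content) {
		lf := strings.ToLower(field)
		// URL and mention filters: skip whole fields that are links or mentions.
		if strings.HasPrefix(lf, "http://") || strings.HasPrefix(lf, "https://") ||
			strings.HasPrefix(lf, "@") || strings.HasPrefix(lf, "nostr:") {
			continue
		}
		// Split on anything that is not a letter or a digit (Unicode-aware).
		for _, w := range strings.FieldsFunc(lf, func(r rune) bool {
			return !unicode.IsLetter(r) && !unicode.IsNumber(r)
		}) {
			// Hex filter: drop long hex strings (ids, pubkeys, signatures).
			if len(w) >= 16 && isHex(w) {
				continue
			}
			words = append(words, w)
		}
	}
	return
}

func isHex(s string) bool {
	for _, r := range s {
		if !unicode.Is(unicode.ASCII_Hex_Digit, r) {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(tokenize("Déjà vu! see https://example.com f7a9c2d4e6b8a0c1d3e5f7a9c2d4e6b8"))
}
```

Since the commit applies the same extraction to both event content/tags and filter search terms, a tokenizer along these lines would produce identical word keys on the write and query paths.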
@@ -69,6 +69,7 @@ const (
 	TagPubkeyPrefix = I("tpc") // tag, pubkey, created at
 	TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at
 
+	WordPrefix = I("wrd") // word hash, serial
 	ExpirationPrefix = I("exp") // timestamp of expiration
 	VersionPrefix = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only).
 )
@@ -106,6 +107,8 @@ func Prefix(prf int) (i I) {
 		return ExpirationPrefix
 	case Version:
 		return VersionPrefix
+	case Word:
+		return WordPrefix
 	}
 	return
 }
@@ -147,6 +150,8 @@ func Identify(r io.Reader) (i int, err error) {
 
 	case ExpirationPrefix:
 		i = Expiration
+	case WordPrefix:
+		i = Word
 	}
 	return
 }
@@ -233,6 +238,21 @@ func FullIdPubkeyDec(
 	return New(NewPrefix(), ser, fid, p, ca)
 }
 
+// Word index for tokenized search terms
+//
+// 3 prefix|8 word-hash|5 serial
+var Word = next()
+
+func WordVars() (w *types.Word, ser *types.Uint40) {
+	return new(types.Word), new(types.Uint40)
+}
+func WordEnc(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(Word), w, ser)
+}
+func WordDec(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(), w, ser)
+}
+
 // CreatedAt is an index that allows search for the timestamp on the event.
 //
 // 3 prefix|8 timestamp|5 serial
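The comment in the new code spells out the key layout: a 3-byte prefix ("wrd"), an 8-byte hash of the word, and a 5-byte event serial, so every event containing a given word lands in one contiguous key range. The sketch below shows what assembling such a key could look like; the hash and serial encodings here are placeholders, not the repository's actual types.Word/types.Uint40 implementations:

```go
package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
	"strings"
)

// wordKey sketches the 3|8|5 layout of the "wrd" index key:
// 3-byte prefix, 8-byte word hash, 5-byte event serial.
// The real hash function and serial encoding live in the
// repository's types package and may differ from this illustration.
func wordKey(word string, serial uint64) []byte {
	h := sha256.Sum256([]byte(strings.ToLower(word)))
	key := make([]byte, 0, 3+8+5)
	key = append(key, []byte("wrd")...) // 3-byte prefix
	key = append(key, h[:8]...)         // 8-byte truncated word hash
	var ser [8]byte
	binary.BigEndian.PutUint64(ser[:], serial)
	key = append(key, ser[3:]...) // low 5 bytes of the serial, big-endian
	return key
}

func main() {
	fmt.Printf("%x\n", wordKey("nostr", 42))
}
```

Under this layout a word lookup is a prefix scan over "wrd" plus the word hash, and the trailing serial on each matching key points back to the stored event, which is presumably how WordEnc and WordDec tie the word index to the serial-keyed event records.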