Add full-text search indexing for word tokens and update tokenization logic

- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters.
- Extended full-text indexing to include both event content and all tag fields (see the sketch below).
2025-10-01 15:03:41 +01:00
parent 7e6adf9fba
commit 86ac7b7897
7 changed files with 253 additions and 5 deletions
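As a minimal sketch of the intended query flow (illustrative only, not the relay's actual query path): a NIP-50 style search string is tokenized with the TokenHashes helper added in this commit, and each 8-byte word hash becomes a key range under the new "wrd" prefix that matches any 5-byte event serial. The import path and the literal "wrd" prefix bytes are assumptions read off this diff.

package main

import (
	"fmt"

	"next.orly.dev/pkg/database" // assumed import path for the TokenHashes helper
)

func main() {
	// Tokenize the query exactly as event content is tokenized at index time,
	// so the same truncated sha256 hashes are produced on both sides.
	for _, h := range database.TokenHashes([]byte("Nostr relay search")) {
		start := append([]byte("wrd"), h...) // assumed layout: 3-byte prefix | 8-byte word hash
		end := append(append([]byte{}, start...),
			0xff, 0xff, 0xff, 0xff, 0xff) // pad the 5-byte serial to its maximum
		fmt.Printf("scan keys in [%x, %x]\n", start, end)
	}
}

Words shorter than two runes, URLs, nostr: URIs, #[n] mentions, and 64-character hex strings never reach the index, so a query consisting only of such tokens produces no ranges.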

View File

@@ -96,4 +96,4 @@ log statements to help locate the cause of bugs
always use Go v1.25.1 for everything involving Go
-always use the nips repository that is available at /nips in the root of the repository for documentation about nostr protocol
+always use the nips repository also for information, found at ../github.com/nostr-protocol/nips attached to the project

View File

@@ -46,6 +46,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
relayinfo.ExpirationTimestamp,
relayinfo.ProtectedEvents,
relayinfo.RelayListMetadata,
relayinfo.SearchCapability,
)
if s.Config.ACLMode != "none" {
supportedNIPs = relayinfo.GetList(
@@ -62,6 +63,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
relayinfo.ExpirationTimestamp,
relayinfo.ProtectedEvents,
relayinfo.RelayListMetadata,
relayinfo.SearchCapability,
)
}
sort.Sort(supportedNIPs)

View File

@@ -153,5 +153,35 @@ func GetIndexesForEvent(ev *event.E, serial uint64) (
if err = appendIndexBytes(&idxs, kindPubkeyIndex); chk.E(err) {
return
}
// Word token indexes (from content)
if len(ev.Content) > 0 {
for _, h := range TokenHashes(ev.Content) {
w := new(Word)
w.FromWord(h) // 8-byte truncated hash
wIdx := indexes.WordEnc(w, ser)
if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
return
}
}
}
// Extend full-text search to include all fields of all tags
if ev.Tags != nil && ev.Tags.Len() > 0 {
for _, t := range *ev.Tags {
for _, field := range t.T { // include key and all values
if len(field) == 0 {
continue
}
for _, h := range TokenHashes(field) {
w := new(Word)
w.FromWord(h)
wIdx := indexes.WordEnc(w, ser)
if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
return
}
}
}
}
}
return
}

View File

@@ -113,6 +113,27 @@ func GetIndexesFromFilter(f *filter.F) (idxs []Range, err error) {
return
}
// Word search: if Search field is present, generate word index ranges
if len(f.Search) > 0 {
for _, h := range TokenHashes(f.Search) {
w := new(types2.Word)
w.FromWord(h)
buf := new(bytes.Buffer)
idx := indexes.WordEnc(w, nil)
if err = idx.MarshalWrite(buf); chk.E(err) {
return
}
b := buf.Bytes()
end := make([]byte, len(b))
copy(end, b)
for i := 0; i < 5; i++ { // match any serial
end = append(end, 0xff)
}
idxs = append(idxs, Range{b, end})
}
return
}
caStart := new(types2.Uint64)
caEnd := new(types2.Uint64)

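To make the range construction above concrete, here is a hypothetical worked example for a single word (the real keys are produced by indexes.WordEnc and the relay's own sha256 wrapper, so treat the exact bytes as illustrative): the start bound is the 3-byte "wrd" prefix followed by the truncated hash, and appending five 0xff bytes, the maximum possible 5-byte serial, yields an end bound that covers every event containing that word.

package main

import (
	"crypto/sha256"
	"fmt"
)

func main() {
	h := sha256.Sum256([]byte("relay")) // assumed to match the digest produced by the relay's sha wrapper
	start := append([]byte("wrd"), h[:8]...)                                // prefix | word hash
	end := append(append([]byte{}, start...), 0xff, 0xff, 0xff, 0xff, 0xff) // prefix | word hash | max serial
	fmt.Printf("start %x\nend   %x\n", start, end)
}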
View File

@@ -69,6 +69,7 @@ const (
TagPubkeyPrefix = I("tpc") // tag, pubkey, created at TagPubkeyPrefix = I("tpc") // tag, pubkey, created at
TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at
WordPrefix = I("wrd") // word hash, serial
ExpirationPrefix = I("exp") // timestamp of expiration ExpirationPrefix = I("exp") // timestamp of expiration
VersionPrefix = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only). VersionPrefix = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only).
) )
@@ -106,6 +107,8 @@ func Prefix(prf int) (i I) {
return ExpirationPrefix
case Version:
return VersionPrefix
case Word:
return WordPrefix
}
return
}
@@ -147,6 +150,8 @@ func Identify(r io.Reader) (i int, err error) {
case ExpirationPrefix:
i = Expiration
case WordPrefix:
i = Word
}
return
}
@@ -233,6 +238,21 @@ func FullIdPubkeyDec(
return New(NewPrefix(), ser, fid, p, ca)
}
// Word index for tokenized search terms
//
// 3 prefix|8 word-hash|5 serial
var Word = next()
func WordVars() (w *types.Word, ser *types.Uint40) {
return new(types.Word), new(types.Uint40)
}
func WordEnc(w *types.Word, ser *types.Uint40) (enc *T) {
return New(NewPrefix(Word), w, ser)
}
func WordDec(w *types.Word, ser *types.Uint40) (enc *T) {
return New(NewPrefix(), w, ser)
}
// CreatedAt is an index that allows search for the timestamp on the event.
//
// 3 prefix|8 timestamp|5 serial

View File

@@ -9,10 +9,12 @@ import (
"github.com/dgraph-io/badger/v4" "github.com/dgraph-io/badger/v4"
"lol.mleku.dev/chk" "lol.mleku.dev/chk"
"lol.mleku.dev/log"
"next.orly.dev/pkg/database/indexes" "next.orly.dev/pkg/database/indexes"
"next.orly.dev/pkg/database/indexes/types" "next.orly.dev/pkg/database/indexes/types"
"next.orly.dev/pkg/encoders/event" "next.orly.dev/pkg/encoders/event"
"next.orly.dev/pkg/encoders/filter" "next.orly.dev/pkg/encoders/filter"
"next.orly.dev/pkg/encoders/hex"
"next.orly.dev/pkg/encoders/kind" "next.orly.dev/pkg/encoders/kind"
"next.orly.dev/pkg/encoders/tag" "next.orly.dev/pkg/encoders/tag"
) )
@@ -230,10 +232,10 @@ func (d *D) SaveEvent(c context.Context, ev *event.E) (kc, vc int, err error) {
return
},
)
-// log.T.F(
-// "total data written: %d bytes keys %d bytes values for event ID %s", kc,
-// vc, hex.Enc(ev.ID),
-// )
+log.T.F(
+"total data written: %d bytes keys %d bytes values for event ID %s", kc,
+vc, hex.Enc(ev.ID),
+)
// log.T.C(
// func() string {
// return fmt.Sprintf("event:\n%s\n", ev.Serialize())

pkg/database/tokenize.go (new file, 173 lines)
View File

@@ -0,0 +1,173 @@
package database
import (
"strings"
"unicode"
"unicode/utf8"

sha "next.orly.dev/pkg/crypto/sha256"
)
// TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
// Rules:
// - Unicode-aware: words are sequences of letters or numbers.
// - Lowercased using unicode case mapping.
// - Ignore URLs (starting with http://, https://, www., or containing "://").
// - Ignore nostr: URIs and #[n] mentions.
// - Ignore words shorter than 2 runes.
// - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
func TokenHashes(content []byte) [][]byte {
s := string(content)
var out [][]byte
seen := make(map[string]struct{})
i := 0
for i < len(s) {
r, size := rune(s[i]), 1
if r >= 0x80 {
r, size = utf8DecodeRuneInString(s[i:])
}
// Skip whitespace
if unicode.IsSpace(r) {
i += size
continue
}
// Skip URLs and schemes
if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") || hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
i = skipUntilSpace(s, i)
continue
}
// If token contains "://" ahead, treat as URL and skip to space
if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
// Only if it's at start of token
before := s[i : i+j]
if len(before) == 0 || allAlphaNum(before) {
i = skipUntilSpace(s, i)
continue
}
}
// Skip #[n] mentions
if r == '#' && i+size < len(s) && s[i+size] == '[' {
end := strings.IndexByte(s[i:], ']')
if end >= 0 {
i += end + 1
continue
}
}
// Collect a word
start := i
var runes []rune
for i < len(s) {
r2, size2 := rune(s[i]), 1
if r2 >= 0x80 {
r2, size2 = utf8DecodeRuneInString(s[i:])
}
if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
runes = append(runes, unicode.ToLower(r2))
i += size2
continue
}
break
}
if i == start {
// Current rune is not a letter or digit (punctuation, symbol, etc.);
// skip it so the outer loop always makes progress.
i += size
continue
}
if len(runes) >= 2 {
w := string(runes)
// Exclude 64-char hex strings
if isHex64(w) {
continue
}
if _, ok := seen[w]; !ok {
seen[w] = struct{}{}
h := sha.Sum256([]byte(w))
out = append(out, h[:8])
}
}
}
return out
}
func hasPrefixFold(s, prefix string) bool {
if len(s) < len(prefix) {
return false
}
for i := 0; i < len(prefix); i++ {
c := s[i]
p := prefix[i]
if c == p {
continue
}
// ASCII case-insensitive
if 'A' <= c && c <= 'Z' {
c = c - 'A' + 'a'
}
if 'A' <= p && p <= 'Z' {
p = p - 'A' + 'a'
}
if c != p {
return false
}
}
return true
}
func skipUntilSpace(s string, i int) int {
for i < len(s) {
r, size := rune(s[i]), 1
if r >= 0x80 {
r, size = utf8DecodeRuneInString(s[i:])
}
if unicode.IsSpace(r) {
return i
}
i += size
}
return i
}
func allAlphaNum(s string) bool {
for _, r := range s {
if !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
return false
}
}
return true
}
func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }
// utf8DecodeRuneInString decodes the first rune in s. It delegates to the
// standard library so that multi-byte sequences report their correct size
// and invalid bytes are handled safely.
func utf8DecodeRuneInString(s string) (r rune, size int) {
return utf8.DecodeRuneInString(s)
}
// isHex64 returns true if s is exactly 64 hex characters (0-9, a-f)
func isHex64(s string) bool {
if len(s) != 64 {
return false
}
for i := 0; i < 64; i++ {
c := s[i]
if c >= '0' && c <= '9' {
continue
}
if c >= 'a' && c <= 'f' {
continue
}
if c >= 'A' && c <= 'F' {
continue
}
return false
}
return true
}
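A small, hypothetical usage example for the tokenizer above (not part of this commit, written as if it lived in a test file of the same package): it shows which tokens survive the filtering rules. The URL, the #[0] mention, the single-rune word and the 64-character hex string are dropped; the remaining words are lowercased, hashed with sha256 and truncated to 8 bytes.

package database

import "fmt"

func ExampleTokenHashes() {
	content := []byte("Check THIS out: https://example.com/post #[0] Ünïcode works, x " +
		"deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
	// Surviving tokens: "check", "this", "out", "ünïcode", "works".
	for _, h := range TokenHashes(content) {
		fmt.Printf("%x\n", h) // one 8-byte truncated hash per surviving word
	}
}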