Add full-text search indexing for word tokens and update tokenization logic
- Introduced word index (`WordPrefix`) for tokenized search terms.
- Added word token extraction in event and filter processing.
- Implemented Unicode-aware, case-insensitive tokenizer with URL, mention, and hex filters.
- Extended full-text indexing to include tags and content.
@@ -96,4 +96,4 @@ log statements to help locate the cause of bugs
 always use Go v1.25.1 for everything involving Go
-always use the nips repository that is available at /nips in the root of the repository for documentation about nostr protocol
+always use the nips repository also for information, found at ../github.com/nostr-protocol/nips attached to the project
@@ -46,6 +46,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 		relayinfo.ExpirationTimestamp,
 		relayinfo.ProtectedEvents,
 		relayinfo.RelayListMetadata,
+		relayinfo.SearchCapability,
 	)
 	if s.Config.ACLMode != "none" {
 		supportedNIPs = relayinfo.GetList(
@@ -62,6 +63,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 			relayinfo.ExpirationTimestamp,
 			relayinfo.ProtectedEvents,
 			relayinfo.RelayListMetadata,
+			relayinfo.SearchCapability,
 		)
 	}
 	sort.Sort(supportedNIPs)
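These two hunks advertise search support in the relay's NIP-11 information document for both ACL modes. As a hedged illustration only (the struct and values below are not the repo's types), the listed capabilities map to NIPs 40 (expiration timestamp), 50 (search), 65 (relay list metadata), and 70 (protected events), so the advertised document would carry 50 in `supported_nips`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// RelayInfo mirrors the NIP-11 fields relevant here (hypothetical local type).
type RelayInfo struct {
	Name          string `json:"name"`
	SupportedNIPs []int  `json:"supported_nips"`
}

func main() {
	info := RelayInfo{
		Name: "orly",
		// 40: expiration, 50: search, 65: relay list metadata, 70: protected events
		SupportedNIPs: []int{1, 11, 40, 50, 65, 70},
	}
	b, _ := json.Marshal(info)
	fmt.Println(string(b))
}
```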
@@ -153,5 +153,35 @@ func GetIndexesForEvent(ev *event.E, serial uint64) (
 	if err = appendIndexBytes(&idxs, kindPubkeyIndex); chk.E(err) {
 		return
 	}
+
+	// Word token indexes (from content)
+	if len(ev.Content) > 0 {
+		for _, h := range TokenHashes(ev.Content) {
+			w := new(Word)
+			w.FromWord(h) // 8-byte truncated hash
+			wIdx := indexes.WordEnc(w, ser)
+			if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
+				return
+			}
+		}
+	}
+	// Extend full-text search to include all fields of all tags
+	if ev.Tags != nil && ev.Tags.Len() > 0 {
+		for _, t := range *ev.Tags {
+			for _, field := range t.T { // include key and all values
+				if len(field) == 0 {
+					continue
+				}
+				for _, h := range TokenHashes(field) {
+					w := new(Word)
+					w.FromWord(h)
+					wIdx := indexes.WordEnc(w, ser)
+					if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
+						return
+					}
+				}
+			}
+		}
+	}
 	return
 }
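To make the shape of these index entries concrete, here is a minimal stdlib-only sketch of the key layout the `wrd` index uses (3-byte prefix, 8-byte truncated sha256 of the lowercased word, 5-byte serial). That the serial is the low five big-endian bytes of a 64-bit counter is an assumption about how `types.Uint40` serializes:

```go
package main

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"
)

// wordKey sketches a "wrd" index key: 3 prefix | 8 word-hash | 5 serial.
func wordKey(word string, serial uint64) []byte {
	h := sha256.Sum256([]byte(word)) // word is already lowercased by the tokenizer
	key := make([]byte, 0, 16)
	key = append(key, "wrd"...) // WordPrefix
	key = append(key, h[:8]...) // 8-byte truncated hash
	var ser [8]byte
	binary.BigEndian.PutUint64(ser[:], serial)
	key = append(key, ser[3:]...) // low 5 bytes, assumed Uint40 encoding
	return key
}

func main() {
	fmt.Printf("%x\n", wordKey("nostr", 42)) // 16 bytes: 777264 + hash + serial
}
```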
@@ -113,6 +113,27 @@ func GetIndexesFromFilter(f *filter.F) (idxs []Range, err error) {
 		return
 	}
 
+	// Word search: if Search field is present, generate word index ranges
+	if len(f.Search) > 0 {
+		for _, h := range TokenHashes(f.Search) {
+			w := new(types2.Word)
+			w.FromWord(h)
+			buf := new(bytes.Buffer)
+			idx := indexes.WordEnc(w, nil)
+			if err = idx.MarshalWrite(buf); chk.E(err) {
+				return
+			}
+			b := buf.Bytes()
+			end := make([]byte, len(b))
+			copy(end, b)
+			for i := 0; i < 5; i++ { // match any serial
+				end = append(end, 0xff)
+			}
+			idxs = append(idxs, Range{b, end})
+		}
+		return
+	}
+
 	caStart := new(types2.Uint64)
 	caEnd := new(types2.Uint64)
 
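The ranges built here pair the serial-less key with the same key padded by five 0xff bytes, so a lexicographic scan matches every serial stored under that word hash. A stdlib-only sketch of why that works, using the same key layout assumed above:

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
)

func main() {
	h := sha256.Sum256([]byte("nostr"))
	begin := append([]byte("wrd"), h[:8]...) // prefix | hash, no serial
	end := append(append([]byte{}, begin...),
		0xff, 0xff, 0xff, 0xff, 0xff) // padded to cover any 5-byte serial

	// A stored key carries some concrete 5-byte serial after the hash.
	stored := append(append([]byte{}, begin...), 0x00, 0x00, 0x00, 0x01, 0x7a)

	ok := bytes.Compare(stored, begin) >= 0 && bytes.Compare(stored, end) <= 0
	fmt.Println(ok) // true: [begin, end] covers all serials for this word
}
```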
@@ -69,6 +69,7 @@ const (
 	TagPubkeyPrefix     = I("tpc") // tag, pubkey, created at
 	TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at
 
+	WordPrefix       = I("wrd") // word hash, serial
 	ExpirationPrefix = I("exp") // timestamp of expiration
 	VersionPrefix    = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only).
 )
@@ -106,6 +107,8 @@ func Prefix(prf int) (i I) {
 		return ExpirationPrefix
 	case Version:
 		return VersionPrefix
+	case Word:
+		return WordPrefix
 	}
 	return
 }
@@ -147,6 +150,8 @@ func Identify(r io.Reader) (i int, err error) {
 
 	case ExpirationPrefix:
 		i = Expiration
+	case WordPrefix:
+		i = Word
 	}
 	return
 }
@@ -233,6 +238,21 @@ func FullIdPubkeyDec(
 	return New(NewPrefix(), ser, fid, p, ca)
 }
 
+// Word index for tokenized search terms
+//
+// 3 prefix|8 word-hash|5 serial
+var Word = next()
+
+func WordVars() (w *types.Word, ser *types.Uint40) {
+	return new(types.Word), new(types.Uint40)
+}
+func WordEnc(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(Word), w, ser)
+}
+func WordDec(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(), w, ser)
+}
+
 // CreatedAt is an index that allows search for the timestamp on the event.
 //
 // 3 prefix|8 timestamp|5 serial
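Going the other way, a hedged sketch of pulling the word hash and serial back out of a 16-byte `wrd` key (again assuming the big-endian Uint40 layout; the real code would use `WordDec` with `types.Word` and `types.Uint40`):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// splitWordKey reverses the layout: 3 prefix | 8 word-hash | 5 serial.
func splitWordKey(key []byte) (hash []byte, serial uint64, ok bool) {
	if len(key) != 16 || string(key[:3]) != "wrd" {
		return nil, 0, false
	}
	hash = key[3:11]
	var buf [8]byte
	copy(buf[3:], key[11:16]) // widen the Uint40 back to a Uint64
	return hash, binary.BigEndian.Uint64(buf[:]), true
}

func main() {
	key := append([]byte("wrd"), make([]byte, 13)...)
	key[15] = 42
	_, ser, ok := splitWordKey(key)
	fmt.Println(ok, ser) // true 42
}
```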
@@ -9,10 +9,12 @@ import (
 
 	"github.com/dgraph-io/badger/v4"
 	"lol.mleku.dev/chk"
+	"lol.mleku.dev/log"
 	"next.orly.dev/pkg/database/indexes"
 	"next.orly.dev/pkg/database/indexes/types"
 	"next.orly.dev/pkg/encoders/event"
 	"next.orly.dev/pkg/encoders/filter"
+	"next.orly.dev/pkg/encoders/hex"
 	"next.orly.dev/pkg/encoders/kind"
 	"next.orly.dev/pkg/encoders/tag"
 )
@@ -230,10 +232,10 @@ func (d *D) SaveEvent(c context.Context, ev *event.E) (kc, vc int, err error) {
 			return
 		},
 	)
-	// log.T.F(
-	// 	"total data written: %d bytes keys %d bytes values for event ID %s", kc,
-	// 	vc, hex.Enc(ev.ID),
-	// )
+	log.T.F(
+		"total data written: %d bytes keys %d bytes values for event ID %s", kc,
+		vc, hex.Enc(ev.ID),
+	)
 	// log.T.C(
 	// 	func() string {
 	// 		return fmt.Sprintf("event:\n%s\n", ev.Serialize())
pkg/database/tokenize.go (new file, 173 lines)
@@ -0,0 +1,173 @@
package database

import (
	"strings"
	"unicode"
	"unicode/utf8"

	sha "next.orly.dev/pkg/crypto/sha256"
)

// TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
// Rules:
// - Unicode-aware: words are sequences of letters or numbers.
// - Lowercased using unicode case mapping.
// - Ignore URLs (starting with http://, https://, www., or containing "://").
// - Ignore nostr: URIs and #[n] mentions.
// - Ignore words shorter than 2 runes.
// - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
func TokenHashes(content []byte) [][]byte {
	s := string(content)
	var out [][]byte
	seen := make(map[string]struct{})

	i := 0
	for i < len(s) {
		r, size := rune(s[i]), 1
		if r >= 0x80 {
			r, size = utf8DecodeRuneInString(s[i:])
		}

		// Skip whitespace
		if unicode.IsSpace(r) {
			i += size
			continue
		}

		// Skip URLs and schemes
		if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") ||
			hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
			i = skipUntilSpace(s, i)
			continue
		}
		// If the text contains "://" ahead, treat it as a URL and skip to the
		// next space, but only when "://" belongs to the current token.
		if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
			before := s[i : i+j]
			if len(before) == 0 || allAlphaNum(before) {
				i = skipUntilSpace(s, i)
				continue
			}
		}
		// Skip #[n] mentions
		if r == '#' && i+size < len(s) && s[i+size] == '[' {
			end := strings.IndexByte(s[i:], ']')
			if end >= 0 {
				i += end + 1
				continue
			}
		}

		// Collect a word
		start := i
		var runes []rune
		for i < len(s) {
			r2, size2 := rune(s[i]), 1
			if r2 >= 0x80 {
				r2, size2 = utf8DecodeRuneInString(s[i:])
			}
			if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
				runes = append(runes, unicode.ToLower(r2))
				i += size2
				continue
			}
			break
		}
		if i == start {
			// Current rune is not a word character (e.g. punctuation); advance
			// past it so the outer loop cannot spin forever.
			i += size
			continue
		}
		if len(runes) >= 2 {
			w := string(runes)
			// Exclude 64-char hex strings
			if isHex64(w) {
				continue
			}
			if _, ok := seen[w]; !ok {
				seen[w] = struct{}{}
				h := sha.Sum256([]byte(w))
				out = append(out, h[:8])
			}
		}
	}
	return out
}

func hasPrefixFold(s, prefix string) bool {
	if len(s) < len(prefix) {
		return false
	}
	for i := 0; i < len(prefix); i++ {
		c := s[i]
		p := prefix[i]
		if c == p {
			continue
		}
		// ASCII case-insensitive
		if 'A' <= c && c <= 'Z' {
			c = c - 'A' + 'a'
		}
		if 'A' <= p && p <= 'Z' {
			p = p - 'A' + 'a'
		}
		if c != p {
			return false
		}
	}
	return true
}
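For the ASCII-only prefixes used at the call sites ("http://", "https://", "nostr:", "www."), this hand-rolled loop behaves the same as a length check followed by strings.EqualFold(s[:len(prefix)], prefix); it just sidesteps the Unicode case folding those prefixes never need.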

func skipUntilSpace(s string, i int) int {
	for i < len(s) {
		r, size := rune(s[i]), 1
		if r >= 0x80 {
			r, size = utf8DecodeRuneInString(s[i:])
		}
		if unicode.IsSpace(r) {
			return i
		}
		i += size
	}
	return i
}

func allAlphaNum(s string) bool {
	for _, r := range s {
		if !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
			return false
		}
	}
	return true
}

func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }

// utf8DecodeRuneInString decodes the first rune in s. It delegates to the
// standard library's unicode/utf8, which handles multi-byte sequences and
// invalid encodings correctly.
func utf8DecodeRuneInString(s string) (r rune, size int) {
	return utf8.DecodeRuneInString(s)
}

// isHex64 returns true if s is exactly 64 hex characters (0-9, a-f, A-F)
func isHex64(s string) bool {
	if len(s) != 64 {
		return false
	}
	for i := 0; i < 64; i++ {
		c := s[i]
		if c >= '0' && c <= '9' {
			continue
		}
		if c >= 'a' && c <= 'f' {
			continue
		}
		if c >= 'A' && c <= 'F' {
			continue
		}
		return false
	}
	return true
}
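A test-style sketch of the tokenizer's contract, assuming the repo's sha256 wrapper matches crypto/sha256: the URL is skipped, words are lowercased and deduplicated in order of first occurrence, and the single-rune "a" is dropped.

```go
package database

import (
	"bytes"
	"crypto/sha256"
	"testing"
)

func TestTokenHashes(t *testing.T) {
	got := TokenHashes([]byte("Check https://example.com and héllo, World! a"))
	var want [][]byte
	for _, w := range []string{"check", "and", "héllo", "world"} {
		h := sha256.Sum256([]byte(w))
		want = append(want, h[:8])
	}
	if len(got) != len(want) {
		t.Fatalf("got %d hashes, want %d", len(got), len(want))
	}
	for i := range want {
		if !bytes.Equal(got[i], want[i]) {
			t.Errorf("hash %d: got %x, want %x", i, got[i], want[i])
		}
	}
}
```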