diff --git a/.aiassistant/rules/rules.md b/.aiassistant/rules/rules.md
index f688626..5dcded8 100644
--- a/.aiassistant/rules/rules.md
+++ b/.aiassistant/rules/rules.md
@@ -96,4 +96,4 @@ log statements to help locate the cause of bugs
 
 always use Go v1.25.1 for everything involving Go
 
-always use the nips repository that is available at /nips in the root of the repository for documentation about nostr protocol
\ No newline at end of file
+always also use the nips repository for information about the nostr protocol; it is attached to the project at ../github.com/nostr-protocol/nips
\ No newline at end of file
diff --git a/app/handle-relayinfo.go b/app/handle-relayinfo.go
index acb975c..a46f921 100644
--- a/app/handle-relayinfo.go
+++ b/app/handle-relayinfo.go
@@ -46,6 +46,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 		relayinfo.ExpirationTimestamp,
 		relayinfo.ProtectedEvents,
 		relayinfo.RelayListMetadata,
+		relayinfo.SearchCapability,
 	)
 	if s.Config.ACLMode != "none" {
 		supportedNIPs = relayinfo.GetList(
@@ -62,6 +63,7 @@ func (s *Server) HandleRelayInfo(w http.ResponseWriter, r *http.Request) {
 			relayinfo.ExpirationTimestamp,
 			relayinfo.ProtectedEvents,
 			relayinfo.RelayListMetadata,
+			relayinfo.SearchCapability,
 		)
 	}
 	sort.Sort(supportedNIPs)
diff --git a/pkg/database/get-indexes-for-event.go b/pkg/database/get-indexes-for-event.go
index 262b7e9..4887521 100644
--- a/pkg/database/get-indexes-for-event.go
+++ b/pkg/database/get-indexes-for-event.go
@@ -153,5 +153,35 @@ func GetIndexesForEvent(ev *event.E, serial uint64) (
 	if err = appendIndexBytes(&idxs, kindPubkeyIndex); chk.E(err) {
 		return
 	}
+
+	// Word token indexes (from content)
+	if len(ev.Content) > 0 {
+		for _, h := range TokenHashes(ev.Content) {
+			w := new(Word)
+			w.FromWord(h) // 8-byte truncated hash
+			wIdx := indexes.WordEnc(w, ser)
+			if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
+				return
+			}
+		}
+	}
+	// Extend full-text search to include all fields of all tags
+	if ev.Tags != nil && ev.Tags.Len() > 0 {
+		for _, t := range *ev.Tags {
+			for _, field := range t.T { // include key and all values
+				if len(field) == 0 {
+					continue
+				}
+				for _, h := range TokenHashes(field) {
+					w := new(Word)
+					w.FromWord(h)
+					wIdx := indexes.WordEnc(w, ser)
+					if err = appendIndexBytes(&idxs, wIdx); chk.E(err) {
+						return
+					}
+				}
+			}
+		}
+	}
 	return
 }
diff --git a/pkg/database/get-indexes-from-filter.go b/pkg/database/get-indexes-from-filter.go
index 01f6004..a615a49 100644
--- a/pkg/database/get-indexes-from-filter.go
+++ b/pkg/database/get-indexes-from-filter.go
@@ -113,6 +113,27 @@ func GetIndexesFromFilter(f *filter.F) (idxs []Range, err error) {
 		return
 	}
 
+	// Word search: if Search field is present, generate word index ranges
+	if len(f.Search) > 0 {
+		for _, h := range TokenHashes(f.Search) {
+			w := new(types2.Word)
+			w.FromWord(h)
+			buf := new(bytes.Buffer)
+			idx := indexes.WordEnc(w, nil)
+			if err = idx.MarshalWrite(buf); chk.E(err) {
+				return
+			}
+			b := buf.Bytes()
+			end := make([]byte, len(b))
+			copy(end, b)
+			for i := 0; i < 5; i++ { // match any serial
+				end = append(end, 0xff)
+			}
+			idxs = append(idxs, Range{b, end})
+		}
+		return
+	}
+
 	caStart := new(types2.Uint64)
 	caEnd := new(types2.Uint64)
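Note on the range construction above: each search token becomes one contiguous key range over the word index, with five 0xff bytes acting as an upper bound over the 5-byte serial. The bounds can be sketched with only the standard library; this is a minimal sketch, assuming the encoded prefix is literally the ASCII bytes "wrd" from the keys.go hunk below and that the project's sha256 wrapper is plain SHA-256 (both are assumptions of the sketch, not guarantees of the patch):

```go
package main

import (
	"crypto/sha256" // assumed equivalent to next.orly.dev/pkg/crypto/sha256
	"fmt"
)

// wordRange builds the inclusive key range that covers every serial stored
// under one search token: 3-byte "wrd" prefix (assumed literal), 8-byte
// truncated word hash, then five 0xff bytes bounding the 40-bit serial.
func wordRange(token string) (start, end []byte) {
	h := sha256.Sum256([]byte(token))
	start = append([]byte("wrd"), h[:8]...)
	end = append(append([]byte{}, start...), 0xff, 0xff, 0xff, 0xff, 0xff)
	return
}

func main() {
	start, end := wordRange("hello")
	fmt.Printf("start %x\nend   %x\n", start, end)
}
```

Every stored key under a token is the start bytes followed by a 5-byte big-endian serial, so all of them sort within [start, end] and one iterator pass per token suffices.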
diff --git a/pkg/database/indexes/keys.go b/pkg/database/indexes/keys.go
index 84cc13f..0442570 100644
--- a/pkg/database/indexes/keys.go
+++ b/pkg/database/indexes/keys.go
@@ -69,6 +69,7 @@ const (
 	TagPubkeyPrefix     = I("tpc") // tag, pubkey, created at
 	TagKindPubkeyPrefix = I("tkp") // tag, kind, pubkey, created at
+	WordPrefix          = I("wrd") // word hash, serial
 	ExpirationPrefix    = I("exp") // timestamp of expiration
 	VersionPrefix       = I("ver") // database version number, for triggering reindexes when new keys are added (policy is add-only).
 )
@@ -106,6 +107,8 @@ func Prefix(prf int) (i I) {
 		return ExpirationPrefix
 	case Version:
 		return VersionPrefix
+	case Word:
+		return WordPrefix
 	}
 	return
 }
@@ -147,6 +150,8 @@ func Identify(r io.Reader) (i int, err error) {
 	case ExpirationPrefix:
 		i = Expiration
+	case WordPrefix:
+		i = Word
 	}
 	return
 }
@@ -233,6 +238,21 @@ func FullIdPubkeyDec(
 	return New(NewPrefix(), ser, fid, p, ca)
 }
 
+// Word index for tokenized search terms
+//
+// 3 prefix|8 word-hash|5 serial
+var Word = next()
+
+func WordVars() (w *types.Word, ser *types.Uint40) {
+	return new(types.Word), new(types.Uint40)
+}
+func WordEnc(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(Word), w, ser)
+}
+func WordDec(w *types.Word, ser *types.Uint40) (enc *T) {
+	return New(NewPrefix(), w, ser)
+}
+
 // CreatedAt is an index that allows search for the timestamp on the event.
 //
 // 3 prefix|8 timestamp|5 serial
diff --git a/pkg/database/save-event.go b/pkg/database/save-event.go
index 96a10e0..064b8b1 100644
--- a/pkg/database/save-event.go
+++ b/pkg/database/save-event.go
@@ -9,10 +9,12 @@ import (
 
 	"github.com/dgraph-io/badger/v4"
 	"lol.mleku.dev/chk"
+	"lol.mleku.dev/log"
 	"next.orly.dev/pkg/database/indexes"
 	"next.orly.dev/pkg/database/indexes/types"
 	"next.orly.dev/pkg/encoders/event"
 	"next.orly.dev/pkg/encoders/filter"
+	"next.orly.dev/pkg/encoders/hex"
 	"next.orly.dev/pkg/encoders/kind"
 	"next.orly.dev/pkg/encoders/tag"
 )
@@ -230,10 +232,10 @@ func (d *D) SaveEvent(c context.Context, ev *event.E) (kc, vc int, err error) {
 			return
 		},
 	)
-	// log.T.F(
-	// 	"total data written: %d bytes keys %d bytes values for event ID %s", kc,
-	// 	vc, hex.Enc(ev.ID),
-	// )
+	log.T.F(
+		"total data written: %d bytes keys %d bytes values for event ID %s", kc,
+		vc, hex.Enc(ev.ID),
+	)
 	// log.T.C(
 	// 	func() string {
 	// 		return fmt.Sprintf("event:\n%s\n", ev.Serialize())
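The tokenizer that feeds both the write path and the query path is added next. Its observable behaviour on a mixed input can be sketched as a Go example in the same package; this is illustrative only and assumes nothing beyond the TokenHashes function defined in tokenize.go below:

```go
package database

import (
	"encoding/hex"
	"fmt"
)

// ExampleTokenHashes sketches the tokenizer rules on one mixed input.
func ExampleTokenHashes() {
	in := []byte("Hello, HELLO world https://example.com/x #[0] a")
	for _, h := range TokenHashes(in) {
		fmt.Println(hex.EncodeToString(h))
	}
	// Exactly two hashes print: sha256("hello")[:8] and sha256("world")[:8].
	// The URL and the #[0] mention are skipped, the repeated "hello"
	// deduplicates after lowercasing, and "a" is under the 2-rune minimum.
}
```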
diff --git a/pkg/database/tokenize.go b/pkg/database/tokenize.go
new file mode 100644
index 0000000..6aceb3c
--- /dev/null
+++ b/pkg/database/tokenize.go
@@ -0,0 +1,169 @@
+package database
+
+import (
+	"strings"
+	"unicode"
+	"unicode/utf8"
+
+	sha "next.orly.dev/pkg/crypto/sha256"
+)
+
+// TokenHashes extracts unique word hashes (8-byte truncated sha256) from content.
+// Rules:
+// - Unicode-aware: words are sequences of letters or numbers.
+// - Lowercased using unicode case mapping.
+// - Ignore URLs (starting with http://, https://, www., or containing "://").
+// - Ignore nostr: URIs and #[n] mentions.
+// - Ignore words shorter than 2 runes.
+// - Exclude 64-character hexadecimal strings (likely IDs/pubkeys).
+func TokenHashes(content []byte) [][]byte {
+	s := string(content)
+	var out [][]byte
+	seen := make(map[string]struct{})
+
+	i := 0
+	for i < len(s) {
+		r, size := rune(s[i]), 1
+		if r >= 0x80 {
+			r, size = utf8DecodeRuneInString(s[i:])
+		}
+
+		// Skip whitespace
+		if unicode.IsSpace(r) {
+			i += size
+			continue
+		}
+
+		// Skip URLs and schemes
+		if hasPrefixFold(s[i:], "http://") || hasPrefixFold(s[i:], "https://") ||
+			hasPrefixFold(s[i:], "nostr:") || hasPrefixFold(s[i:], "www.") {
+			i = skipUntilSpace(s, i)
+			continue
+		}
+		// A "://" further along marks a URL, but only when everything between
+		// here and the separator is alphanumeric, i.e. the separator belongs
+		// to the token that starts at the current position.
+		if j := strings.Index(s[i:], "://"); j == 0 || (j > 0 && isWordStart(r)) {
+			before := s[i : i+j]
+			if len(before) == 0 || allAlphaNum(before) {
+				i = skipUntilSpace(s, i)
+				continue
+			}
+		}
+		// Skip #[n] mentions
+		if r == '#' && i+size < len(s) && s[i+size] == '[' {
+			end := strings.IndexByte(s[i:], ']')
+			if end >= 0 {
+				i += end + 1
+				continue
+			}
+		}
+
+		// Collect a word
+		start := i
+		var runes []rune
+		for i < len(s) {
+			r2, size2 := rune(s[i]), 1
+			if r2 >= 0x80 {
+				r2, size2 = utf8DecodeRuneInString(s[i:])
+			}
+			if unicode.IsLetter(r2) || unicode.IsNumber(r2) {
+				runes = append(runes, unicode.ToLower(r2))
+				i += size2
+				continue
+			}
+			break
+		}
+		if i == start {
+			// The current rune is neither a letter nor a number (for example
+			// punctuation); step over it so the scan cannot loop forever.
+			i += size
+			continue
+		}
+		if len(runes) >= 2 {
+			w := string(runes)
+			// Exclude 64-char hex strings
+			if isHex64(w) {
+				continue
+			}
+			if _, ok := seen[w]; !ok {
+				seen[w] = struct{}{}
+				h := sha.Sum256([]byte(w))
+				out = append(out, h[:8])
+			}
+		}
+	}
+	return out
+}
+
+// hasPrefixFold reports whether s begins with prefix under ASCII
+// case-insensitive comparison.
+func hasPrefixFold(s, prefix string) bool {
+	if len(s) < len(prefix) {
+		return false
+	}
+	for i := 0; i < len(prefix); i++ {
+		c, p := s[i], prefix[i]
+		if 'A' <= c && c <= 'Z' {
+			c = c - 'A' + 'a'
+		}
+		if 'A' <= p && p <= 'Z' {
+			p = p - 'A' + 'a'
+		}
+		if c != p {
+			return false
+		}
+	}
+	return true
+}
+
+// skipUntilSpace advances from i to the next whitespace rune (or the end of s).
+func skipUntilSpace(s string, i int) int {
+	for i < len(s) {
+		r, size := rune(s[i]), 1
+		if r >= 0x80 {
+			r, size = utf8DecodeRuneInString(s[i:])
+		}
+		if unicode.IsSpace(r) {
+			return i
+		}
+		i += size
+	}
+	return i
+}
+
+func allAlphaNum(s string) bool {
+	for _, r := range s {
+		if !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
+			return false
+		}
+	}
+	return true
+}
+
+func isWordStart(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) }
+
+// utf8DecodeRuneInString decodes the first rune in s. unicode/utf8 is part of
+// the standard library, so delegating to it adds no extra dependency.
+func utf8DecodeRuneInString(s string) (r rune, size int) {
+	return utf8.DecodeRuneInString(s)
+}
+
+// isHex64 returns true if s is exactly 64 hexadecimal characters. TokenHashes
+// only passes lowercased words, but upper-case digits are accepted for safety.
+func isHex64(s string) bool {
+	if len(s) != 64 {
+		return false
+	}
+	for i := 0; i < 64; i++ {
+		c := s[i]
+		switch {
+		case c >= '0' && c <= '9':
+		case c >= 'a' && c <= 'f':
+		case c >= 'A' && c <= 'F':
+		default:
+			return false
+		}
+	}
+	return true
+}
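A quick regression guard for these tokenizer rules can be written as a unit test. The following is a minimal sketch, not part of the patch: the hash8 helper is hypothetical, and the sketch assumes the project's sha wrapper exposes Sum256 with the crypto/sha256 shape, exactly as TokenHashes itself already uses it:

```go
package database

import (
	"bytes"
	"strings"
	"testing"

	sha "next.orly.dev/pkg/crypto/sha256"
)

// hash8 mirrors what TokenHashes stores: the first 8 bytes of a word's
// SHA-256. Hypothetical helper for this sketch only.
func hash8(word string) []byte {
	h := sha.Sum256([]byte(word))
	return h[:8]
}

func TestTokenHashesRules(t *testing.T) {
	hex64 := strings.Repeat("ab", 32) // 64 hex chars: excluded as a likely ID
	in := []byte("Alpha alpha BETA " + hex64 + " nostr:nevent1qqs ok")
	got := TokenHashes(in)
	// First-seen order: "alpha" (deduplicated), "beta", then "ok"; the hex
	// string and the nostr: URI contribute nothing.
	want := [][]byte{hash8("alpha"), hash8("beta"), hash8("ok")}
	if len(got) != len(want) {
		t.Fatalf("got %d hashes, want %d", len(got), len(want))
	}
	for i := range want {
		if !bytes.Equal(got[i], want[i]) {
			t.Errorf("hash %d mismatch", i)
		}
	}
}
```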