theoretically implemented fulltext search

2025-05-17 06:33:49 -01:06
parent b39d47b2d5
commit 09fc0a1029
9 changed files with 365 additions and 34 deletions

View File

@@ -1,21 +1,22 @@
package addresstag
import (
"bytes"
"strconv"
"strings"
"realy.lol/chk"
"realy.lol/hex"
)
// DecodeAddressTag unpacks the contents of an `a` tag.
func DecodeAddressTag(tagValue string) (k uint16, pkb []byte, d string) {
split := strings.Split(tagValue, ":")
func DecodeAddressTag(tagValue []byte) (k uint16, pkb []byte, d []byte) {
split := bytes.Split(tagValue, []byte(":"))
if len(split) == 3 {
var err error
var key uint64
if pkb, _ = hex.Dec(split[1]); len(pkb) == 32 {
if key, err = strconv.ParseUint(split[0], 10, 16); !chk.E(err) {
if pkb, _ = hex.DecAppend(pkb, split[1]); len(pkb) == 32 {
// todo: use ints package for this
if key, err = strconv.ParseUint(string(split[0]), 10, 16); !chk.E(err) {
return uint16(key), pkb, split[2]
}
}
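To illustrate the new []byte-based signature, here is a minimal usage sketch; the kind, pubkey and identifier are dummy values, and it assumes the realy.lol module is available on the import path:

package main

import (
	"fmt"
	"strings"

	"realy.lol/addresstag"
)

func main() {
	// An `a` tag value has the form "<kind>:<64-char hex pubkey>:<d identifier>".
	// The pubkey below is a dummy 64-character hex string, not a real key.
	aTag := []byte("30023:" + strings.Repeat("ab", 32) + ":my-article")
	k, pkb, d := addresstag.DecodeAddressTag(aTag)
	fmt.Println(k, len(pkb), string(d)) // expected: 30023 32 my-article
}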

View File

@@ -20,6 +20,8 @@ import (
// Create_a_Tag generates tag indexes from a tag key, tag value, created_at
// timestamp and the event serial.
//
// This only covers what are essentially p tags and a tags.
func Create_a_Tag(tagKey, tagValue string, CA *createdat.T,
ser *serial.T) (prf index.P, elems []keys.Element, err error) {

View File

@@ -1 +1,270 @@
package ratel
import (
"bytes"
"sort"
"time"
"github.com/dgraph-io/badger/v4"
"realy.lol/chk"
"realy.lol/context"
"realy.lol/filter"
"realy.lol/log"
"realy.lol/ratel/keys/arb"
"realy.lol/ratel/keys/serial"
"realy.lol/ratel/prefixes"
"realy.lol/store"
"realy.lol/tag"
)
type FulltextSequence struct {
inSequence int
distance int
sequence []int
items []*prefixes.FulltextIndexKey
}
func (r *T) QueryFulltextEvents(c context.T, f *filter.T) (evs []store.IdTsPk, err error) {
start := time.Now()
// just use QueryForIds if there is no fulltext search content.
if len(f.Search) == 0 {
return r.QueryForIds(c, f)
}
split := bytes.Split(f.Search, []byte{' '})
var lang []byte
var terms [][]byte
for i := range split {
if bytes.HasPrefix(split[i], []byte("lang:")) {
lang = split[i][5:]
} else {
terms = append(terms, split[i])
}
}
var fTags []*tag.T
if f.Tags != nil {
fTags = f.Tags.ToSliceOfTags()
}
fAut := f.Authors.ToSliceOfBytes()
fKinds := f.Kinds.K
var matches []*prefixes.FulltextIndexKey
if err = r.View(func(txn *badger.Txn) (err error) {
for _, v := range terms {
// scan only this word's keys by iterating forward over the word-specific prefix
prf := prefixes.FulltextIndex.Key(arb.New(v))
it := txn.NewIterator(badger.IteratorOptions{Prefix: prf})
for it.Rewind(); it.ValidForPrefix(prf); it.Next() {
item := it.Item()
k := item.KeyCopy(nil)
var idx *prefixes.FulltextIndexKey
if idx, err = prefixes.NewFulltextIndexKey(k); chk.E(err) {
// reset err so a malformed key doesn't fail the whole query on return
err = nil
continue
}
if f.Since != nil {
ts := idx.Timestamp()
if ts.I64() < f.Since.I64() {
// event is earlier than since
continue
}
}
if f.Until != nil {
ts := idx.Timestamp()
if ts.I64() > f.Until.I64() {
// event is later than until
continue
}
}
if len(fKinds) != 0 {
var found bool
ki := idx.Kind()
for _, kin := range fKinds {
if ki.Equal(kin) {
found = true
break
}
}
// kinds are present in filter and don't match
if !found {
continue
}
}
if len(fAut) > 0 {
var found bool
pk := idx.Pubkey()
for _, p := range fAut {
if bytes.Equal(p, pk) {
found = true
break
}
}
// pubkey is in filter and doesn't match
if !found {
continue
}
}
// get serial
ser := idx.Serial()
// check language tags
if len(lang) > 0 {
var found bool
func() {
itl := txn.NewIterator(badger.IteratorOptions{
Prefix: prefixes.LangIndex.Key(arb.New(lang)),
})
defer itl.Close()
for itl.Rewind(); itl.Valid(); itl.Next() {
s := serial.FromKey(itl.Item().KeyCopy(nil))
if s.Uint64() == ser.Uint64() {
found = true
return
}
}
}()
// the event does not have the requested language tag
if !found {
continue
}
}
// now we can check the tag filters; tags can't be packed into the fulltext index,
// so they require a second table iteration
if len(fTags) > 0 {
var found bool
for _, ft := range fTags {
if len(ft.Key()) == 2 && ft.Key()[0] == '#' {
var tp []byte
// the tag letter is the byte after the '#'
if tp, err = GetTagKeyPrefix(ft.Key()[1], ft.Value()); chk.E(err) {
err = nil
continue
}
if len(tp) == 0 {
// the tag did not generate an index
continue
}
func() {
itt := txn.NewIterator(badger.IteratorOptions{
Prefix: tp,
})
defer itt.Close()
for itt.Rewind(); itt.Valid(); itt.Next() {
s := serial.FromKey(itt.Item().KeyCopy(nil))
if s.Uint64() == ser.Uint64() {
found = true
return
}
}
}()
// the event does not have any of the required tags
if !found {
continue
}
}
}
if !found {
continue
}
}
// if we got to here, we have a match
matches = append(matches, idx)
}
it.Close()
}
return
}); chk.E(err) {
return
}
if len(matches) == 0 {
// no matches were found
return
}
// next we need to group and sort the results
groups := make(map[uint64]FulltextSequence)
for _, v := range matches {
if g, ok := groups[v.Serial().Uint64()]; !ok {
groups[v.Serial().Uint64()] = FulltextSequence{items: []*prefixes.FulltextIndexKey{v}}
} else {
// FulltextSequence is a value type, so write the updated group back into the map
g.items = append(g.items, v)
groups[v.Serial().Uint64()] = g
}
}
// now we need to convert the map to a slice so we can sort it
var groupS []FulltextSequence
for _, g := range groups {
groupS = append(groupS, g)
}
// first, sort the groups by the number of elements in descending order
sort.Slice(groupS, func(i, j int) (e bool) {
return len(groupS[i].items) > len(groupS[j].items)
})
// get the distance of the groups
for i := range groupS {
// range copies the struct, so index the slice to make the update stick
g := &groupS[i]
g.distance = int(g.items[len(g.items)-1].Sequence().Val - g.items[0].Sequence().Val)
}
// get the sequence as relates to the search terms: for each matched word, record the
// index of the query term it corresponds to, in the order the words were matched
for i := range groupS {
g := &groupS[i]
for _, item := range g.items {
for k := range terms {
if bytes.Equal(item.Word(), terms[k]) {
g.sequence = append(g.sequence, k)
}
}
}
}
// count the number of elements of the sequence that are in ascending order
for i := range groupS {
g := &groupS[i]
for j := 1; j < len(g.sequence); j++ {
if g.sequence[j-1] < g.sequence[j] {
g.inSequence++
}
}
}
// find the boundaries of each length segment of the group
var groupedCounts []int
var lastCount int
lastCount = len(groupS[0].items)
for i, g := range groupS {
if len(g.items) < lastCount {
groupedCounts = append(groupedCounts, i)
lastCount = len(g.items)
}
}
// break the groupS into segments of the same length
var segments [][]FulltextSequence
lastCount = 0
for i := range groupedCounts {
segments = append(segments, groupS[lastCount:groupedCounts[i]])
lastCount = groupedCounts[i]
}
// append the final segment, from the last boundary to the end
segments = append(segments, groupS[lastCount:])
// sort the segments by distance and number in sequence
for _, s := range segments {
sort.Slice(s, func(i, j int) bool {
if s[i].distance != s[j].distance {
return s[i].distance < s[j].distance
}
return s[i].inSequence > s[j].inSequence
})
}
// flatten the segments back into a list
var list []FulltextSequence
for _, seg := range segments {
list = append(list, seg...)
}
// convert into store.IdTsPk
for _, bit := range list {
for _, el := range bit.items {
evs = append(evs, store.IdTsPk{
Ts: el.Timestamp().I64(),
Id: el.EventId().Bytes(),
Pub: el.Pubkey(),
})
}
}
// clamp to the requested or maximum limit without slicing past the end of the results
if f.Limit != nil && len(evs) > int(*f.Limit) {
evs = evs[:*f.Limit]
} else if len(evs) > int(r.MaxLimit) {
evs = evs[:r.MaxLimit]
}
log.I.F("performed search for '%s' in %v", f.Search, time.Since(start))
return
}
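The ranking logic above is the subtle part of this change: matches are grouped per event serial, groups that match more words rank higher, and ties are broken by a tighter span between the first and last matched word and by more terms appearing in query order. Below is a self-contained sketch of that idea only; the hit and group types are illustrative and not part of the repository:

package main

import (
	"fmt"
	"sort"
)

// hit is an illustrative stand-in for one fulltext index match.
type hit struct {
	doc  int // which document (event) the word was found in
	pos  int // the word's position within that document
	term int // index of the query term that matched
}

// group collects all hits for one document, mirroring FulltextSequence.
type group struct {
	doc        int
	hits       []hit
	distance   int // span between the first and last matched position
	inSequence int // adjacent hits whose query-term indexes are in ascending order
}

// rank orders documents by most matched words, then smallest span, then most
// in-order terms. Illustrative only; not the repository's implementation.
func rank(hits []hit) (groups []group) {
	byDoc := map[int]*group{}
	for _, h := range hits {
		g, ok := byDoc[h.doc]
		if !ok {
			g = &group{doc: h.doc}
			byDoc[h.doc] = g
		}
		g.hits = append(g.hits, h)
	}
	for _, g := range byDoc {
		sort.Slice(g.hits, func(i, j int) bool { return g.hits[i].pos < g.hits[j].pos })
		g.distance = g.hits[len(g.hits)-1].pos - g.hits[0].pos
		for i := 1; i < len(g.hits); i++ {
			if g.hits[i-1].term < g.hits[i].term {
				g.inSequence++
			}
		}
		groups = append(groups, *g)
	}
	sort.Slice(groups, func(i, j int) bool {
		if len(groups[i].hits) != len(groups[j].hits) {
			return len(groups[i].hits) > len(groups[j].hits)
		}
		if groups[i].distance != groups[j].distance {
			return groups[i].distance < groups[j].distance
		}
		return groups[i].inSequence > groups[j].inSequence
	})
	return
}

func main() {
	for _, g := range rank([]hit{
		{doc: 1, pos: 4, term: 0}, {doc: 1, pos: 5, term: 1}, // both terms, adjacent, in order
		{doc: 2, pos: 40, term: 1}, {doc: 2, pos: 90, term: 0}, // both terms, far apart, reversed
		{doc: 3, pos: 7, term: 0}, // only one term
	}) {
		fmt.Println(g.doc, len(g.hits), g.distance, g.inSequence)
	}
}

Expected output: doc 1 first (both terms, adjacent, in query order), then doc 2, then doc 3.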

View File

@@ -6,7 +6,7 @@ import (
"realy.lol/chk"
"realy.lol/event"
"realy.lol/eventid"
"realy.lol/log"
"realy.lol/hex"
"realy.lol/ratel/keys"
"realy.lol/ratel/keys/createdat"
"realy.lol/ratel/keys/fullid"
@@ -59,15 +59,19 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
}
// ~ by tag value + date
for i, t := range ev.Tags.ToSliceOfTags() {
// there is no value field
tsb := t.ToSliceOfBytes()
if t.Len() < 2 {
continue
}
tk, tv := tsb[0], tsb[1]
if t.Len() < 2 ||
// the tag is not a-zA-Z probably (this would permit arbitrary other
// single byte chars)
len(t.ToSliceOfBytes()[0]) != 1 ||
// the tag is not a-zA-Z probably (this would permit arbitrary other single byte
// chars)
len(tk) != 1 ||
// the second field is zero length
len(t.ToSliceOfBytes()[1]) == 0 ||
len(tv) == 0 ||
// the second field is more than 100 characters long
len(t.ToSliceOfBytes()[1]) > 100 {
len(tv) > 100 {
// if any of the above is true then the tag is not indexable
continue
}
@@ -82,17 +86,27 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
// duplicate
continue
}
// get key prefix (with full length) and offset where to write the last
// parts
// create index keys for e tags (event references); the optional third value is ignored
// because it can't be searched for anyway (it's only there for clients to render threads)
if bytes.Equal(tk, []byte("e")) {
if len(tv) != 64 {
continue
}
var ei []byte
if ei, err = hex.DecAppend(ei, tv); chk.E(err) {
continue
}
keyz = append(keyz, prefixes.TagEventId.Key(id.New(eventid.NewWith(ei)), ser))
continue
}
// get key prefix (with full length) and offset where to write the last parts.
prf, elems := index.P(0), []keys.Element(nil)
if prf, elems, err = Create_a_Tag(string(t.ToSliceOfBytes()[0]),
string(t.ToSliceOfBytes()[1]), CA,
ser); chk.E(err) {
log.I.F("%v", t.ToStringSlice())
if prf, elems, err = Create_a_Tag(string(tsb[0]),
string(tv), CA, ser); chk.E(err) {
// log.I.F("%v", t.ToStringSlice())
return
}
k := prf.Key(elems...)
// log.T.ToSliceOfBytes("tag '%s': %s key %0x", t.ToSliceOfBytes()[0], t.ToSliceOfBytes()[1:], k)
keyz = append(keyz, k)
}
{ // ~ by date only

View File

@@ -3,9 +3,11 @@ package ratel
import (
eventstore "realy.lol/addresstag"
"realy.lol/chk"
"realy.lol/eventid"
"realy.lol/hex"
"realy.lol/ratel/keys"
"realy.lol/ratel/keys/arb"
"realy.lol/ratel/keys/id"
"realy.lol/ratel/keys/kinder"
"realy.lol/ratel/keys/pubkey"
"realy.lol/ratel/prefixes"
@@ -14,7 +16,7 @@ import (
// GetTagKeyPrefix returns tag index prefixes based on the initial field of a
// tag.
//
// There is 3 types of index tag keys:
// There are 4 types of index tag keys:
//
// - TagAddr: [ 8 ][ 2b Kind ][ 8b Pubkey ][ address/URL ][ 8b Serial ]
//
@@ -22,9 +24,17 @@ import (
//
// - Tag: [ 6 ][ address/URL ][ 8b Serial ]
//
// - TagEventId [ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
//
// This function produces the initial bytes without the index.
func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
func GetTagKeyPrefix(prf byte, tagValue []byte) (key []byte, err error) {
if prf == 'e' {
var eid []byte
if eid, err = hex.DecAppend(eid, tagValue); chk.E(err) {
return
}
key = prefixes.TagEventId.Key(id.New(eventid.NewWith(eid)))
} else if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
// store value in the new special "a" tag index
var pk *pubkey.T
if pk, err = pubkey.NewFromBytes(pkb); chk.E(err) {
@@ -35,7 +45,7 @@ func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
els = append(els, arb.New(d))
}
key = prefixes.TagAddr.Key(els...)
} else if pkb, _ := hex.Dec(tagValue); len(pkb) == 32 {
} else if pkb, _ := hex.DecAppend(nil, tagValue); len(pkb) == 32 {
// store value as bytes
var pkk *pubkey.T
if pkk, err = pubkey.NewFromBytes(pkb); chk.E(err) {

View File

@@ -33,7 +33,7 @@ type FulltextIndexKey struct {
pubkey []byte
timestamp *timestamp.T
kind *kind.T
sequence uint32
sequence *integer.T
serial *serial.T
}
@@ -51,42 +51,62 @@ func (f *FulltextIndexKey) Segment(start, end int) []byte {
}
func (f *FulltextIndexKey) Word() (v []byte) {
if f.word != nil {
return f.word
}
v = f.key[index.Len:f.endOfWord]
f.word = v
return
}
func (f *FulltextIndexKey) EventId() (v *eventid.T) {
if f.eventid != nil {
return f.eventid
}
v = eventid.NewWith(f.Segment(StartOfEventId, StartOfPubkey))
f.eventid = v
return
}
func (f *FulltextIndexKey) Pubkey() (v []byte) {
if f.pubkey != nil {
return f.pubkey
}
v = f.Segment(StartOfPubkey, StartOfTimestamp)
f.pubkey = v
return
}
func (f *FulltextIndexKey) Timestamp() (v *timestamp.T) {
if f.timestamp != nil {
return f.timestamp
}
v = timestamp.FromBytes(f.Segment(StartOfTimestamp, StartOfKind))
f.timestamp = v
return
}
func (f *FulltextIndexKey) Kind() (v *kind.T) {
if f.kind != nil {
return f.kind
}
v = kind.NewFromBytes(f.Segment(StartOfKind, StartOfSequence))
f.kind = v
return
}
func (f *FulltextIndexKey) Sequence() (v *integer.T) {
v = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
f.sequence = v.Val
if f.sequence != nil {
return f.sequence
}
v = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
f.sequence = v
return
}
func (f *FulltextIndexKey) Serial() (v *serial.T) {
if f.serial != nil {
return f.serial
}
v = serial.New(f.Segment(StartOfSerial, len(f.key)))
f.serial = v
return
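All of the accessors above use the same decode-once-and-cache pattern over fixed segments of the key. A minimal standalone sketch of that pattern, with a made-up two-field layout that is not the repository's key format:

package main

import (
	"encoding/binary"
	"fmt"
)

// cachedKey decodes fields of a fixed-layout byte key lazily and caches the
// result, mirroring the FulltextIndexKey accessors. The layout here is
// invented for illustration: [ 8-byte timestamp ][ 8-byte serial ].
type cachedKey struct {
	key       []byte
	timestamp *uint64
	serial    *uint64
}

func (c *cachedKey) Timestamp() uint64 {
	if c.timestamp != nil {
		return *c.timestamp
	}
	v := binary.BigEndian.Uint64(c.key[0:8])
	c.timestamp = &v
	return v
}

func (c *cachedKey) Serial() uint64 {
	if c.serial != nil {
		return *c.serial
	}
	v := binary.BigEndian.Uint64(c.key[8:16])
	c.serial = &v
	return v
}

func main() {
	k := make([]byte, 16)
	binary.BigEndian.PutUint64(k[0:8], 1715900000)
	binary.BigEndian.PutUint64(k[8:16], 42)
	ck := &cachedKey{key: k}
	fmt.Println(ck.Timestamp(), ck.Serial()) // decoded once, cached thereafter
}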

View File

@@ -132,6 +132,14 @@ const (
//
// [ 16 ][ ISO639-2 code ][ serial ]
LangIndex
// TagEventId is an index used to search for events containing e tag references to
// other events. This will greatly accelerate searches for threaded discussion and enable
// the creation of composite documents with a directed acyclic graph structure, such as git
// commits.
//
// [ 17 ][ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
TagEventId
)
// FilterPrefixes is a slice of the prefixes used by filter index to enable a loop
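Based on the documented layout above, a TagEventId key is just the prefix, the first 8 bytes of the referenced event id, and the 8-byte serial. A byte-level sketch of composing and splitting such a key; the single-byte prefix value and the big-endian serial encoding are assumptions for illustration, not the repository's actual encoding:

package main

import (
	"encoding/binary"
	"encoding/hex"
	"fmt"
)

// Illustrative only: composes a TagEventId-style key as
// [ prefix ][ 8-byte event id prefix ][ 8-byte serial ].
const tagEventIdPrefix byte = 17 // assumed single-byte prefix value

func tagEventIdKey(eventId []byte, serial uint64) []byte {
	key := make([]byte, 0, 1+8+8)
	key = append(key, tagEventIdPrefix)
	key = append(key, eventId[:8]...) // only the first 8 bytes of the 32-byte id
	key = binary.BigEndian.AppendUint64(key, serial)
	return key
}

func main() {
	// dummy 32-byte event id
	id, _ := hex.DecodeString("00a1b2c3d4e5f60718293a4b5c6d7e8f00a1b2c3d4e5f60718293a4b5c6d7e8f")
	key := tagEventIdKey(id, 42)
	fmt.Printf("%x\n", key)
	// splitting it back apart: skip the prefix byte, then 8 bytes of id, then the serial
	fmt.Printf("id prefix %x serial %d\n", key[1:9], binary.BigEndian.Uint64(key[9:17]))
}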

View File

@@ -126,15 +126,19 @@ func PrepareQueries(f *filter.T) (
ext = &filter.T{Kinds: f.Kinds}
i := 0
for _, values := range f.Tags.ToSliceOfTags() {
for _, value := range values.ToSliceOfBytes()[1:] {
// get key prefix (with full length) and offset where to write the last parts
var prf []byte
if prf, err = GetTagKeyPrefix(string(value)); chk.E(err) {
continue
tsb := values.ToSliceOfBytes()
// indexable tags can only have 1 character in the key field.
if len(tsb[0]) == 1 {
for _, value := range tsb[1:] {
// get key prefix (with full length) and offset where to write the last parts
var prf []byte
if prf, err = GetTagKeyPrefix(tsb[0][0], value); chk.E(err) {
continue
}
// remove the last part to get just the prefix we want here
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
i++
}
// remove the last part to get just the prefix we want here
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
i++
}
}
// log.T.S("tags", qs)

View File

@@ -44,6 +44,9 @@ func (s *Server) acceptEvent(c context.T, evt *event.T, authedPubkey []byte,
}
// check the mute list, and reject events authored by muted pubkeys, even if
// they come from a pubkey that is on the follow list.
//
// note that some clients, such as jumble, unfortunately hide this info in the kind 10000
// mute list; use old nostrudel or similar, which still keeps it publicly readable.
for pk := range s.muted {
if bytes.Equal(evt.Pubkey, []byte(pk)) {
notice = "rejecting event with pubkey " + hex.Enc(evt.Pubkey) +