theoretically implemented fulltext search

2025-05-17 06:33:49 -01:06
parent b39d47b2d5
commit 09fc0a1029
9 changed files with 365 additions and 34 deletions

View File

@@ -1,21 +1,22 @@
package addresstag
import (
"bytes"
"strconv"
"strings"
"realy.lol/chk"
"realy.lol/hex"
)
// DecodeAddressTag unpacks the contents of an `a` tag.
func DecodeAddressTag(tagValue string) (k uint16, pkb []byte, d string) {
split := strings.Split(tagValue, ":")
func DecodeAddressTag(tagValue []byte) (k uint16, pkb []byte, d []byte) {
split := bytes.Split(tagValue, []byte(":"))
if len(split) == 3 {
var err error
var key uint64
if pkb, _ = hex.Dec(split[1]); len(pkb) == 32 {
if key, err = strconv.ParseUint(split[0], 10, 16); !chk.E(err) {
if pkb, _ = hex.DecAppend(pkb, split[1]); len(pkb) == 32 {
// todo: use ints package for this
if key, err = strconv.ParseUint(string(split[0]), 10, 16); !chk.E(err) {
return uint16(key), pkb, split[2]
}
}
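To illustrate the new []byte-based signature, here is a minimal usage sketch; the kind, pubkey and identifier are dummy values, and it assumes the realy.lol module is available on the import path:

package main

import (
	"fmt"
	"strings"

	"realy.lol/addresstag"
)

func main() {
	// An `a` tag value has the form "<kind>:<64-char hex pubkey>:<d identifier>".
	// The pubkey below is a dummy 64-character hex string, not a real key.
	aTag := []byte("30023:" + strings.Repeat("ab", 32) + ":my-article")
	k, pkb, d := addresstag.DecodeAddressTag(aTag)
	fmt.Println(k, len(pkb), string(d)) // expected: 30023 32 my-article
}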

View File

@@ -20,6 +20,8 @@ import (
// Create_a_Tag generates tag indexes from a tag key, tag value, created_at
// timestamp and the event serial.
//
// This only covers what are essentially p tags and a tags.
func Create_a_Tag(tagKey, tagValue string, CA *createdat.T,
ser *serial.T) (prf index.P, elems []keys.Element, err error) {

View File

@@ -1 +1,270 @@
package ratel
import (
"bytes"
"sort"
"time"
"github.com/dgraph-io/badger/v4"
"realy.lol/chk"
"realy.lol/context"
"realy.lol/filter"
"realy.lol/log"
"realy.lol/ratel/keys/arb"
"realy.lol/ratel/keys/serial"
"realy.lol/ratel/prefixes"
"realy.lol/store"
"realy.lol/tag"
)
type FulltextSequence struct {
inSequence int
distance int
sequence []int
items []*prefixes.FulltextIndexKey
}
func (r *T) QueryFulltextEvents(c context.T, f *filter.T) (evs []store.IdTsPk, err error) {
start := time.Now()
// just use QueryForIds if there is no fulltext search content.
if len(f.Search) == 0 {
return r.QueryForIds(c, f)
}
split := bytes.Split(f.Search, []byte{' '})
var lang []byte
var terms [][]byte
for i := range split {
if bytes.HasPrefix(split[i], []byte("lang:")) {
lang = split[i][5:]
} else {
terms = append(terms, split[i])
}
}
var fTags []*tag.T
if f.Tags != nil {
fTags = f.Tags.ToSliceOfTags()
}
fAut := f.Authors.ToSliceOfBytes()
fKinds := f.Kinds.K
var matches []*prefixes.FulltextIndexKey
if err = r.View(func(txn *badger.Txn) (err error) {
for _, v := range terms {
// scan only this word's keys by iterating forward over the word-specific prefix
prf := prefixes.FulltextIndex.Key(arb.New(v))
it := txn.NewIterator(badger.IteratorOptions{Prefix: prf})
for it.Rewind(); it.ValidForPrefix(prf); it.Next() {
item := it.Item()
k := item.KeyCopy(nil)
var idx *prefixes.FulltextIndexKey
if idx, err = prefixes.NewFulltextIndexKey(k); chk.E(err) {
// reset err so a malformed key doesn't fail the whole query on return
err = nil
continue
}
if f.Since != nil {
ts := idx.Timestamp()
if ts.I64() < f.Since.I64() {
// event is earlier than since
continue
}
}
if f.Until != nil {
ts := idx.Timestamp()
if ts.I64() > f.Until.I64() {
// event is later than until
continue
}
}
if len(fKinds) != 0 {
var found bool
ki := idx.Kind()
for _, kin := range fKinds {
if ki.Equal(kin) {
found = true
break
}
}
// kinds are present in filter and don't match
if !found {
continue
}
}
if len(fAut) > 0 {
var found bool
pk := idx.Pubkey()
for _, p := range fAut {
if bytes.Equal(p, pk) {
found = true
break
}
}
// pubkey is in filter and doesn't match
if !found {
continue
}
}
// get serial
ser := idx.Serial()
// check language tags
if len(lang) > 0 {
var found bool
func() {
itl := txn.NewIterator(badger.IteratorOptions{
Prefix: prefixes.LangIndex.Key(arb.New(lang)),
})
defer itl.Close()
for itl.Rewind(); itl.Valid(); itl.Next() {
s := serial.FromKey(itl.Item().KeyCopy(nil))
if s.Uint64() == ser.Uint64() {
found = true
return
}
}
}()
// the event does not have the requested language tag
if !found {
continue
}
}
// now we can check the tag filters; tags can't be packed into the fulltext index,
// so they require a second table iteration
if len(fTags) > 0 {
var found bool
for _, ft := range fTags {
if len(ft.Key()) == 2 && ft.Key()[0] == '#' {
var tp []byte
// the tag letter is the byte after the '#'
if tp, err = GetTagKeyPrefix(ft.Key()[1], ft.Value()); chk.E(err) {
err = nil
continue
}
if len(tp) == 0 {
// the tag did not generate an index
continue
}
func() {
itt := txn.NewIterator(badger.IteratorOptions{
Prefix: tp,
})
defer itt.Close()
for itt.Rewind(); itt.Valid(); itt.Next() {
s := serial.FromKey(itt.Item().KeyCopy(nil))
if s.Uint64() == ser.Uint64() {
found = true
return
}
}
}()
// the event does not have any of the required tags
if !found {
continue
}
}
}
if !found {
continue
}
}
// if we got to here, we have a match
matches = append(matches, idx)
}
it.Close()
}
return
}); chk.E(err) {
return
}
if len(matches) == 0 {
// no matches were found
return
}
// next we need to group and sort the results
groups := make(map[uint64]FulltextSequence)
for _, v := range matches {
if g, ok := groups[v.Serial().Uint64()]; !ok {
groups[v.Serial().Uint64()] = FulltextSequence{items: []*prefixes.FulltextIndexKey{v}}
} else {
// FulltextSequence is a value type, so write the updated group back into the map
g.items = append(g.items, v)
groups[v.Serial().Uint64()] = g
}
}
// now we need to convert the map to a slice so we can sort it
var groupS []FulltextSequence
for _, g := range groups {
groupS = append(groupS, g)
}
// first, sort the groups by the number of elements in descending order
sort.Slice(groupS, func(i, j int) (e bool) {
return len(groupS[i].items) > len(groupS[j].items)
})
// get the distance of the groups
for i := range groupS {
// range copies the struct, so index the slice to make the update stick
g := &groupS[i]
g.distance = int(g.items[len(g.items)-1].Sequence().Val - g.items[0].Sequence().Val)
}
// get the sequence as relates to the search terms: for each matched word, record the
// index of the query term it corresponds to, in the order the words were matched
for i := range groupS {
g := &groupS[i]
for _, item := range g.items {
for k := range terms {
if bytes.Equal(item.Word(), terms[k]) {
g.sequence = append(g.sequence, k)
}
}
}
}
// count the number of elements of the sequence that are in ascending order
for i := range groupS {
g := &groupS[i]
for j := 1; j < len(g.sequence); j++ {
if g.sequence[j-1] < g.sequence[j] {
g.inSequence++
}
}
}
// find the boundaries of each length segment of the group
var groupedCounts []int
var lastCount int
lastCount = len(groupS[0].items)
for i, g := range groupS {
if len(g.items) < lastCount {
groupedCounts = append(groupedCounts, i)
lastCount = len(g.items)
}
}
// break the groupS into segments of the same length
var segments [][]FulltextSequence
lastCount = 0
for i := range groupedCounts {
segments = append(segments, groupS[lastCount:groupedCounts[i]])
lastCount = groupedCounts[i]
}
// append the final segment, from the last boundary to the end
segments = append(segments, groupS[lastCount:])
// sort the segments by distance and number in sequence
for _, s := range segments {
sort.Slice(s, func(i, j int) bool {
if s[i].distance != s[j].distance {
return s[i].distance < s[j].distance
}
return s[i].inSequence > s[j].inSequence
})
}
// flatten the segments back into a list
var list []FulltextSequence
for _, seg := range segments {
list = append(list, seg...)
}
// convert into store.IdTsPk
for _, bit := range list {
for _, el := range bit.items {
evs = append(evs, store.IdTsPk{
Ts: el.Timestamp().I64(),
Id: el.EventId().Bytes(),
Pub: el.Pubkey(),
})
}
}
// clamp to the requested or maximum limit without slicing past the end of the results
if f.Limit != nil && len(evs) > int(*f.Limit) {
evs = evs[:*f.Limit]
} else if len(evs) > int(r.MaxLimit) {
evs = evs[:r.MaxLimit]
}
log.I.F("performed search for '%s' in %v", f.Search, time.Since(start))
return
}
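The ranking logic above is the subtle part of this change: matches are grouped per event serial, groups that match more words rank higher, and ties are broken by a tighter span between the first and last matched word and by more terms appearing in query order. Below is a self-contained sketch of that idea only; the hit and group types are illustrative and not part of the repository:

package main

import (
	"fmt"
	"sort"
)

// hit is an illustrative stand-in for one fulltext index match.
type hit struct {
	doc  int // which document (event) the word was found in
	pos  int // the word's position within that document
	term int // index of the query term that matched
}

// group collects all hits for one document, mirroring FulltextSequence.
type group struct {
	doc        int
	hits       []hit
	distance   int // span between the first and last matched position
	inSequence int // adjacent hits whose query-term indexes are in ascending order
}

// rank orders documents by most matched words, then smallest span, then most
// in-order terms. Illustrative only; not the repository's implementation.
func rank(hits []hit) (groups []group) {
	byDoc := map[int]*group{}
	for _, h := range hits {
		g, ok := byDoc[h.doc]
		if !ok {
			g = &group{doc: h.doc}
			byDoc[h.doc] = g
		}
		g.hits = append(g.hits, h)
	}
	for _, g := range byDoc {
		sort.Slice(g.hits, func(i, j int) bool { return g.hits[i].pos < g.hits[j].pos })
		g.distance = g.hits[len(g.hits)-1].pos - g.hits[0].pos
		for i := 1; i < len(g.hits); i++ {
			if g.hits[i-1].term < g.hits[i].term {
				g.inSequence++
			}
		}
		groups = append(groups, *g)
	}
	sort.Slice(groups, func(i, j int) bool {
		if len(groups[i].hits) != len(groups[j].hits) {
			return len(groups[i].hits) > len(groups[j].hits)
		}
		if groups[i].distance != groups[j].distance {
			return groups[i].distance < groups[j].distance
		}
		return groups[i].inSequence > groups[j].inSequence
	})
	return
}

func main() {
	for _, g := range rank([]hit{
		{doc: 1, pos: 4, term: 0}, {doc: 1, pos: 5, term: 1}, // both terms, adjacent, in order
		{doc: 2, pos: 40, term: 1}, {doc: 2, pos: 90, term: 0}, // both terms, far apart, reversed
		{doc: 3, pos: 7, term: 0}, // only one term
	}) {
		fmt.Println(g.doc, len(g.hits), g.distance, g.inSequence)
	}
}

Expected output: doc 1 first (both terms, adjacent, in query order), then doc 2, then doc 3.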

View File

@@ -6,7 +6,7 @@ import (
"realy.lol/chk"
"realy.lol/event"
"realy.lol/eventid"
"realy.lol/log"
"realy.lol/hex"
"realy.lol/ratel/keys"
"realy.lol/ratel/keys/createdat"
"realy.lol/ratel/keys/fullid"
@@ -59,15 +59,19 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
}
// ~ by tag value + date
for i, t := range ev.Tags.ToSliceOfTags() {
// there is no value field
tsb := t.ToSliceOfBytes()
if t.Len() < 2 {
continue
}
tk, tv := tsb[0], tsb[1]
if t.Len() < 2 ||
// the tag is not a-zA-Z probably (this would permit arbitrary other
// single byte chars)
len(t.ToSliceOfBytes()[0]) != 1 ||
// the tag is not a-zA-Z probably (this would permit arbitrary other single byte
// chars)
len(tk) != 1 ||
// the second field is zero length
len(t.ToSliceOfBytes()[1]) == 0 ||
len(tv) == 0 ||
// the second field is more than 100 characters long
len(t.ToSliceOfBytes()[1]) > 100 {
len(tv) > 100 {
// if any of the above is true then the tag is not indexable
continue
}
@@ -82,17 +86,27 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
// duplicate
continue
}
// get key prefix (with full length) and offset where to write the last
// parts
// create index keys for e tags (event references); the optional third value is ignored
// because it can't be searched for anyway (it's only there for clients to render threads)
if bytes.Equal(tk, []byte("e")) {
if len(tv) != 64 {
continue
}
var ei []byte
if ei, err = hex.DecAppend(ei, tv); chk.E(err) {
continue
}
keyz = append(keyz, prefixes.TagEventId.Key(id.New(eventid.NewWith(ei)), ser))
continue
}
// get key prefix (with full length) and offset where to write the last parts.
prf, elems := index.P(0), []keys.Element(nil)
if prf, elems, err = Create_a_Tag(string(t.ToSliceOfBytes()[0]),
string(t.ToSliceOfBytes()[1]), CA,
ser); chk.E(err) {
log.I.F("%v", t.ToStringSlice())
if prf, elems, err = Create_a_Tag(string(tsb[0]),
string(tv), CA, ser); chk.E(err) {
// log.I.F("%v", t.ToStringSlice())
return
}
k := prf.Key(elems...)
// log.T.ToSliceOfBytes("tag '%s': %s key %0x", t.ToSliceOfBytes()[0], t.ToSliceOfBytes()[1:], k)
keyz = append(keyz, k)
}
{ // ~ by date only

View File

@@ -3,9 +3,11 @@ package ratel
import (
eventstore "realy.lol/addresstag"
"realy.lol/chk"
"realy.lol/eventid"
"realy.lol/hex"
"realy.lol/ratel/keys"
"realy.lol/ratel/keys/arb"
"realy.lol/ratel/keys/id"
"realy.lol/ratel/keys/kinder"
"realy.lol/ratel/keys/pubkey"
"realy.lol/ratel/prefixes"
@@ -14,7 +16,7 @@ import (
// GetTagKeyPrefix returns tag index prefixes based on the initial field of a
// tag.
//
// There is 3 types of index tag keys:
// There are 4 types of index tag keys:
//
// - TagAddr: [ 8 ][ 2b Kind ][ 8b Pubkey ][ address/URL ][ 8b Serial ]
//
@@ -22,9 +24,17 @@ import (
//
// - Tag: [ 6 ][ address/URL ][ 8b Serial ]
//
// - TagEventId [ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
//
// This function produces the initial bytes without the index.
func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
func GetTagKeyPrefix(prf byte, tagValue []byte) (key []byte, err error) {
if prf == 'e' {
var eid []byte
if eid, err = hex.DecAppend(eid, tagValue); chk.E(err) {
return
}
key = prefixes.TagEventId.Key(id.New(eventid.NewWith(eid)))
} else if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
// store value in the new special "a" tag index
var pk *pubkey.T
if pk, err = pubkey.NewFromBytes(pkb); chk.E(err) {
@@ -35,7 +45,7 @@ func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
els = append(els, arb.New(d))
}
key = prefixes.TagAddr.Key(els...)
} else if pkb, _ := hex.Dec(tagValue); len(pkb) == 32 {
} else if pkb, _ := hex.DecAppend(nil, tagValue); len(pkb) == 32 {
// store value as bytes
var pkk *pubkey.T
if pkk, err = pubkey.NewFromBytes(pkb); chk.E(err) {

View File

@@ -33,7 +33,7 @@ type FulltextIndexKey struct {
pubkey []byte
timestamp *timestamp.T
kind *kind.T
sequence uint32
sequence *integer.T
serial *serial.T
}
@@ -51,42 +51,62 @@ func (f *FulltextIndexKey) Segment(start, end int) []byte {
}
func (f *FulltextIndexKey) Word() (v []byte) {
if f.word != nil {
return f.word
}
v = f.key[index.Len:f.endOfWord]
f.word = v
return
}
func (f *FulltextIndexKey) EventId() (v *eventid.T) {
if f.eventid != nil {
return f.eventid
}
v = eventid.NewWith(f.Segment(StartOfEventId, StartOfPubkey))
f.eventid = v
return
}
func (f *FulltextIndexKey) Pubkey() (v []byte) {
if f.pubkey != nil {
return f.pubkey
}
v = f.Segment(StartOfPubkey, StartOfTimestamp)
f.pubkey = v
return
}
func (f *FulltextIndexKey) Timestamp() (v *timestamp.T) {
if f.timestamp != nil {
return f.timestamp
}
v = timestamp.FromBytes(f.Segment(StartOfTimestamp, StartOfKind))
f.timestamp = v
return
}
func (f *FulltextIndexKey) Kind() (v *kind.T) {
if f.kind != nil {
return f.kind
}
v = kind.NewFromBytes(f.Segment(StartOfKind, StartOfSequence))
f.kind = v
return
}
func (f *FulltextIndexKey) Sequence() (v *integer.T) {
v = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
f.sequence = v.Val
if f.sequence != nil {
return f.sequence
}
v = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
f.sequence = v
return
}
func (f *FulltextIndexKey) Serial() (v *serial.T) {
if f.serial != nil {
return f.serial
}
v = serial.New(f.Segment(StartOfSerial, len(f.key)))
f.serial = v
return
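All of the accessors above use the same decode-once-and-cache pattern over fixed segments of the key. A minimal standalone sketch of that pattern, with a made-up two-field layout that is not the repository's key format:

package main

import (
	"encoding/binary"
	"fmt"
)

// cachedKey decodes fields of a fixed-layout byte key lazily and caches the
// result, mirroring the FulltextIndexKey accessors. The layout here is
// invented for illustration: [ 8-byte timestamp ][ 8-byte serial ].
type cachedKey struct {
	key       []byte
	timestamp *uint64
	serial    *uint64
}

func (c *cachedKey) Timestamp() uint64 {
	if c.timestamp != nil {
		return *c.timestamp
	}
	v := binary.BigEndian.Uint64(c.key[0:8])
	c.timestamp = &v
	return v
}

func (c *cachedKey) Serial() uint64 {
	if c.serial != nil {
		return *c.serial
	}
	v := binary.BigEndian.Uint64(c.key[8:16])
	c.serial = &v
	return v
}

func main() {
	k := make([]byte, 16)
	binary.BigEndian.PutUint64(k[0:8], 1715900000)
	binary.BigEndian.PutUint64(k[8:16], 42)
	ck := &cachedKey{key: k}
	fmt.Println(ck.Timestamp(), ck.Serial()) // decoded once, cached thereafter
}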

View File

@@ -132,6 +132,14 @@ const (
//
// [ 16 ][ ISO639-2 code ][ serial ]
LangIndex
// TagEventId is an index used to search for events containing e tag references to
// other events. This will greatly accelerate searches for threaded discussion and enable
// the creation of composite documents with a directed acyclic graph structure, such as git
// commits.
//
// [ 17 ][ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
TagEventId
)
// FilterPrefixes is a slice of the prefixes used by filter index to enable a loop
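Based on the documented layout above, a TagEventId key is just the prefix, the first 8 bytes of the referenced event id, and the 8-byte serial. A byte-level sketch of composing and splitting such a key; the single-byte prefix value and the big-endian serial encoding are assumptions for illustration, not the repository's actual encoding:

package main

import (
	"encoding/binary"
	"encoding/hex"
	"fmt"
)

// Illustrative only: composes a TagEventId-style key as
// [ prefix ][ 8-byte event id prefix ][ 8-byte serial ].
const tagEventIdPrefix byte = 17 // assumed single-byte prefix value

func tagEventIdKey(eventId []byte, serial uint64) []byte {
	key := make([]byte, 0, 1+8+8)
	key = append(key, tagEventIdPrefix)
	key = append(key, eventId[:8]...) // only the first 8 bytes of the 32-byte id
	key = binary.BigEndian.AppendUint64(key, serial)
	return key
}

func main() {
	// dummy 32-byte event id
	id, _ := hex.DecodeString("00a1b2c3d4e5f60718293a4b5c6d7e8f00a1b2c3d4e5f60718293a4b5c6d7e8f")
	key := tagEventIdKey(id, 42)
	fmt.Printf("%x\n", key)
	// splitting it back apart: skip the prefix byte, then 8 bytes of id, then the serial
	fmt.Printf("id prefix %x serial %d\n", key[1:9], binary.BigEndian.Uint64(key[9:17]))
}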

View File

@@ -126,15 +126,19 @@ func PrepareQueries(f *filter.T) (
ext = &filter.T{Kinds: f.Kinds}
i := 0
for _, values := range f.Tags.ToSliceOfTags() {
for _, value := range values.ToSliceOfBytes()[1:] {
// get key prefix (with full length) and offset where to write the last parts
var prf []byte
if prf, err = GetTagKeyPrefix(string(value)); chk.E(err) {
continue
tsb := values.ToSliceOfBytes()
// indexable tags can only have 1 character in the key field.
if len(tsb[0]) == 1 {
for _, value := range tsb[1:] {
// get key prefix (with full length) and offset where to write the last parts
var prf []byte
if prf, err = GetTagKeyPrefix(tsb[0][0], value); chk.E(err) {
continue
}
// remove the last part to get just the prefix we want here
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
i++
}
// remove the last part to get just the prefix we want here
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
i++
}
}
// log.T.S("tags", qs)

View File

@@ -44,6 +44,9 @@ func (s *Server) acceptEvent(c context.T, evt *event.T, authedPubkey []byte,
}
// check the mute list, and reject events authored by muted pubkeys, even if
// they come from a pubkey that is on the follow list.
//
// note that some clients, such as jumble, unfortunately hide this info in the kind 10000
// mute list; use old nostrudel or similar, which still keeps it publicly readable.
for pk := range s.muted {
if bytes.Equal(evt.Pubkey, []byte(pk)) {
notice = "rejecting event with pubkey " + hex.Enc(evt.Pubkey) +