theoretically implemented fulltext search
This commit is contained in:
@@ -1,21 +1,22 @@
|
||||
package addresstag
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"realy.lol/chk"
|
||||
"realy.lol/hex"
|
||||
)
|
||||
|
||||
// DecodeAddressTag unpacks the contents of an `a` tag.
|
||||
func DecodeAddressTag(tagValue string) (k uint16, pkb []byte, d string) {
|
||||
split := strings.Split(tagValue, ":")
|
||||
func DecodeAddressTag(tagValue []byte) (k uint16, pkb []byte, d []byte) {
|
||||
split := bytes.Split(tagValue, []byte(":"))
|
||||
if len(split) == 3 {
|
||||
var err error
|
||||
var key uint64
|
||||
if pkb, _ = hex.Dec(split[1]); len(pkb) == 32 {
|
||||
if key, err = strconv.ParseUint(split[0], 10, 16); !chk.E(err) {
|
||||
if pkb, _ = hex.DecAppend(pkb, split[1]); len(pkb) == 32 {
|
||||
// todo: use ints package for this
|
||||
if key, err = strconv.ParseUint(string(split[0]), 10, 16); !chk.E(err) {
|
||||
return uint16(key), pkb, split[2]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,8 @@ import (
|
||||
|
||||
// Create_a_Tag generates tag indexes from a tag key, tag value, created_at
|
||||
// timestamp and the event serial.
|
||||
//
|
||||
// This only covers what are essentially p tags and a tags.
|
||||
func Create_a_Tag(tagKey, tagValue string, CA *createdat.T,
|
||||
ser *serial.T) (prf index.P, elems []keys.Element, err error) {
|
||||
|
||||
|
||||
@@ -1 +1,270 @@
|
||||
package ratel
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/dgraph-io/badger/v4"
|
||||
|
||||
"realy.lol/chk"
|
||||
"realy.lol/context"
|
||||
"realy.lol/filter"
|
||||
"realy.lol/log"
|
||||
"realy.lol/ratel/keys/arb"
|
||||
"realy.lol/ratel/keys/serial"
|
||||
"realy.lol/ratel/prefixes"
|
||||
"realy.lol/store"
|
||||
"realy.lol/tag"
|
||||
)
|
||||
|
||||
type FulltextSequence struct {
|
||||
inSequence int
|
||||
distance int
|
||||
sequence []int
|
||||
items []*prefixes.FulltextIndexKey
|
||||
}
|
||||
|
||||
func (r *T) QueryFulltextEvents(c context.T, f *filter.T) (evs []store.IdTsPk, err error) {
|
||||
start := time.Now()
|
||||
// just use QueryEvents if there isn't actually any fulltext search field content.
|
||||
if len(f.Search) == 0 {
|
||||
return r.QueryForIds(c, f)
|
||||
}
|
||||
split := bytes.Split(f.Search, []byte{' '})
|
||||
var lang []byte
|
||||
var terms [][]byte
|
||||
for i := range split {
|
||||
if bytes.HasPrefix(split[i], []byte("lang:")) {
|
||||
lang = split[i][5:]
|
||||
} else {
|
||||
terms = append(terms, split[i])
|
||||
}
|
||||
}
|
||||
var fTags []*tag.T
|
||||
if f.Tags != nil {
|
||||
fTags = f.Tags.ToSliceOfTags()
|
||||
}
|
||||
fAut := f.Authors.ToSliceOfBytes()
|
||||
fKinds := f.Kinds.K
|
||||
var matches []*prefixes.FulltextIndexKey
|
||||
if err = r.View(func(txn *badger.Txn) (err error) {
|
||||
it := txn.NewIterator(badger.IteratorOptions{
|
||||
Prefix: prefixes.FulltextIndex.Key(),
|
||||
Reverse: true,
|
||||
})
|
||||
defer it.Close()
|
||||
for _, v := range terms {
|
||||
for it.Rewind(); it.ValidForPrefix(prefixes.FulltextIndex.Key(arb.New(v))); it.Next() {
|
||||
item := it.Item()
|
||||
k := item.KeyCopy(nil)
|
||||
var idx *prefixes.FulltextIndexKey
|
||||
if idx, err = prefixes.NewFulltextIndexKey(k); chk.E(err) {
|
||||
continue
|
||||
}
|
||||
if f.Since != nil {
|
||||
ts := idx.Timestamp()
|
||||
if ts.I64() < f.Since.I64() {
|
||||
// event is earlier than since
|
||||
continue
|
||||
}
|
||||
}
|
||||
if f.Until != nil {
|
||||
ts := idx.Timestamp()
|
||||
if ts.I64() > f.Until.I64() {
|
||||
// event is later than until
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(fKinds) != 0 {
|
||||
var found bool
|
||||
ki := idx.Kind()
|
||||
for _, kin := range fKinds {
|
||||
if ki.Equal(kin) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// kinds are present in filter and don't match
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(fAut) > 0 {
|
||||
var found bool
|
||||
pk := idx.Pubkey()
|
||||
for _, p := range fAut {
|
||||
if bytes.Equal(p, pk) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// pubkey is in filter and doesn't match
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
// get serial
|
||||
ser := idx.Serial()
|
||||
// check language tags
|
||||
if len(lang) > 0 {
|
||||
var found bool
|
||||
func() {
|
||||
itl := txn.NewIterator(badger.IteratorOptions{
|
||||
Prefix: prefixes.LangIndex.Key(),
|
||||
})
|
||||
defer itl.Close()
|
||||
for itl.Rewind(); itl.Valid(); itl.Next() {
|
||||
s := serial.FromKey(itl.Item().KeyCopy(nil))
|
||||
if s.Uint64() == ser.Uint64() {
|
||||
found = true
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
// the event does not have an associated language tag
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
// now we can check tags, they can't be squished into a fulltext index, and
|
||||
// require a second table iteration
|
||||
if len(fTags) > 0 {
|
||||
var found bool
|
||||
for _, ft := range fTags {
|
||||
if len(ft.Key()) == 2 && ft.Key()[0] == '#' {
|
||||
var tp []byte
|
||||
if tp, err = GetTagKeyPrefix(ft.Key()[0], ft.Value()); chk.E(err) {
|
||||
continue
|
||||
}
|
||||
if len(tp) == 0 {
|
||||
// the tag did not generate an index
|
||||
continue
|
||||
}
|
||||
func() {
|
||||
itt := txn.NewIterator(badger.IteratorOptions{
|
||||
Prefix: tp,
|
||||
})
|
||||
defer itt.Close()
|
||||
for itt.Rewind(); itt.Valid(); itt.Next() {
|
||||
s := serial.FromKey(itt.Item().KeyCopy(nil))
|
||||
if s.Uint64() == ser.Uint64() {
|
||||
found = true
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
// the event does not have any of the required tags
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
// if we got to here, we have a match
|
||||
matches = append(matches, idx)
|
||||
}
|
||||
}
|
||||
return
|
||||
}); chk.E(err) {
|
||||
return
|
||||
}
|
||||
if len(matches) == 0 {
|
||||
// didn't find any (?)
|
||||
return
|
||||
}
|
||||
// next we need to group and sort the results
|
||||
groups := make(map[uint64]FulltextSequence)
|
||||
for _, v := range matches {
|
||||
if _, ok := groups[v.Serial().Uint64()]; !ok {
|
||||
groups[v.Serial().Uint64()] = FulltextSequence{items: []*prefixes.FulltextIndexKey{v}}
|
||||
} else {
|
||||
g := groups[v.Serial().Uint64()]
|
||||
g.items = append(g.items, v)
|
||||
}
|
||||
}
|
||||
// now we need to convert the map to a slice so we can sort it
|
||||
var groupS []FulltextSequence
|
||||
for _, g := range groups {
|
||||
groupS = append(groupS, g)
|
||||
}
|
||||
// first, sort the groups by the number of elements in descending order
|
||||
sort.Slice(groupS, func(i, j int) (e bool) {
|
||||
return len(groupS[i].items) > len(groupS[j].items)
|
||||
})
|
||||
// get the distance of the groups
|
||||
for _, g := range groupS {
|
||||
g.distance = int(g.items[len(g.items)-1].Sequence().Val - g.items[0].Sequence().Val)
|
||||
}
|
||||
// get the sequence as relates to the search terms
|
||||
for _, g := range groupS {
|
||||
for i := range g.items {
|
||||
if i > 0 {
|
||||
for k := range terms {
|
||||
if bytes.Equal(g.items[i].Word(), terms[k]) {
|
||||
g.sequence = append(g.sequence, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// count the number of elements of the sequence that are in ascending order
|
||||
for _, g := range groupS {
|
||||
for i := range g.sequence {
|
||||
if i > 0 {
|
||||
if g.sequence[i-1] < g.sequence[i] {
|
||||
g.inSequence++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// find the boundaries of each length segment of the group
|
||||
var groupedCounts []int
|
||||
var lastCount int
|
||||
lastCount = len(groupS[0].items)
|
||||
for i, g := range groupS {
|
||||
if len(g.items) < lastCount {
|
||||
groupedCounts = append(groupedCounts, i)
|
||||
lastCount = len(g.items)
|
||||
}
|
||||
}
|
||||
// break the groupS into segments of the same length
|
||||
var segments [][]FulltextSequence
|
||||
lastCount = 0
|
||||
for i := range groupedCounts {
|
||||
segments = append(segments, groupS[lastCount:groupedCounts[i]])
|
||||
}
|
||||
// sort the segments by distance and number in sequence
|
||||
for _, s := range segments {
|
||||
sort.Slice(s, func(i, j int) bool {
|
||||
return (s[i].distance < s[j].distance) && s[i].inSequence > s[i].inSequence
|
||||
})
|
||||
}
|
||||
// flatten the segments back into a list
|
||||
var list []FulltextSequence
|
||||
for _, seg := range segments {
|
||||
for _, bit := range seg {
|
||||
list = append(list, bit)
|
||||
}
|
||||
}
|
||||
// convert into store.IdTsPk
|
||||
for _, bit := range list {
|
||||
for _, el := range bit.items {
|
||||
evs = append(evs, store.IdTsPk{
|
||||
Ts: el.Timestamp().I64(),
|
||||
Id: el.EventId().Bytes(),
|
||||
Pub: el.Pubkey(),
|
||||
})
|
||||
}
|
||||
}
|
||||
if f.Limit != nil {
|
||||
evs = evs[:*f.Limit]
|
||||
} else {
|
||||
evs = evs[:r.MaxLimit]
|
||||
}
|
||||
log.I.F("performed search for '%s' in %v", f.Search, time.Now().Sub(start))
|
||||
return
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
"realy.lol/chk"
|
||||
"realy.lol/event"
|
||||
"realy.lol/eventid"
|
||||
"realy.lol/log"
|
||||
"realy.lol/hex"
|
||||
"realy.lol/ratel/keys"
|
||||
"realy.lol/ratel/keys/createdat"
|
||||
"realy.lol/ratel/keys/fullid"
|
||||
@@ -59,15 +59,19 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
|
||||
}
|
||||
// ~ by tag value + date
|
||||
for i, t := range ev.Tags.ToSliceOfTags() {
|
||||
// there is no value field
|
||||
tsb := t.ToSliceOfBytes()
|
||||
if t.Len() < 2 {
|
||||
continue
|
||||
}
|
||||
tk, tv := tsb[0], tsb[1]
|
||||
if t.Len() < 2 ||
|
||||
// the tag is not a-zA-Z probably (this would permit arbitrary other
|
||||
// single byte chars)
|
||||
len(t.ToSliceOfBytes()[0]) != 1 ||
|
||||
// the tag is not a-zA-Z probably (this would permit arbitrary other single byte
|
||||
// chars)
|
||||
len(tk) != 1 ||
|
||||
// the second field is zero length
|
||||
len(t.ToSliceOfBytes()[1]) == 0 ||
|
||||
len(tv) == 0 ||
|
||||
// the second field is more than 100 characters long
|
||||
len(t.ToSliceOfBytes()[1]) > 100 {
|
||||
len(tv) > 100 {
|
||||
// any of the above is true then the tag is not indexable
|
||||
continue
|
||||
}
|
||||
@@ -82,17 +86,27 @@ func GetIndexKeysForEvent(ev *event.T, ser *serial.T) (keyz [][]byte) {
|
||||
// duplicate
|
||||
continue
|
||||
}
|
||||
// get key prefix (with full length) and offset where to write the last
|
||||
// parts
|
||||
// create tags for e (event references) but we don't care about the optional third value
|
||||
// as it can't be searched for anyway (it's for clients to render threads)
|
||||
if bytes.Equal(tk, []byte("e")) {
|
||||
if len(tv) != 64 {
|
||||
continue
|
||||
}
|
||||
var ei []byte
|
||||
if ei, err = hex.DecAppend(ei, tv); chk.E(err) {
|
||||
continue
|
||||
}
|
||||
keyz = append(keyz, prefixes.TagEventId.Key(id.New(eventid.NewWith(ei)), ser))
|
||||
continue
|
||||
}
|
||||
// get key prefix (with full length) and offset where to write the last parts.
|
||||
prf, elems := index.P(0), []keys.Element(nil)
|
||||
if prf, elems, err = Create_a_Tag(string(t.ToSliceOfBytes()[0]),
|
||||
string(t.ToSliceOfBytes()[1]), CA,
|
||||
ser); chk.E(err) {
|
||||
log.I.F("%v", t.ToStringSlice())
|
||||
if prf, elems, err = Create_a_Tag(string(tsb[0]),
|
||||
string(tv), CA, ser); chk.E(err) {
|
||||
// log.I.F("%v", t.ToStringSlice())
|
||||
return
|
||||
}
|
||||
k := prf.Key(elems...)
|
||||
// log.T.ToSliceOfBytes("tag '%s': %s key %0x", t.ToSliceOfBytes()[0], t.ToSliceOfBytes()[1:], k)
|
||||
keyz = append(keyz, k)
|
||||
}
|
||||
{ // ~ by date only
|
||||
|
||||
@@ -3,9 +3,11 @@ package ratel
|
||||
import (
|
||||
eventstore "realy.lol/addresstag"
|
||||
"realy.lol/chk"
|
||||
"realy.lol/eventid"
|
||||
"realy.lol/hex"
|
||||
"realy.lol/ratel/keys"
|
||||
"realy.lol/ratel/keys/arb"
|
||||
"realy.lol/ratel/keys/id"
|
||||
"realy.lol/ratel/keys/kinder"
|
||||
"realy.lol/ratel/keys/pubkey"
|
||||
"realy.lol/ratel/prefixes"
|
||||
@@ -14,7 +16,7 @@ import (
|
||||
// GetTagKeyPrefix returns tag index prefixes based on the initial field of a
|
||||
// tag.
|
||||
//
|
||||
// There is 3 types of index tag keys:
|
||||
// There are 4 types of index tag keys:
|
||||
//
|
||||
// - TagAddr: [ 8 ][ 2b Kind ][ 8b Pubkey ][ address/URL ][ 8b Serial ]
|
||||
//
|
||||
@@ -22,9 +24,17 @@ import (
|
||||
//
|
||||
// - Tag: [ 6 ][ address/URL ][ 8b Serial ]
|
||||
//
|
||||
// - TagEventId [ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
|
||||
//
|
||||
// This function produces the initial bytes without the index.
|
||||
func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
|
||||
if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
|
||||
func GetTagKeyPrefix(prf byte, tagValue []byte) (key []byte, err error) {
|
||||
if prf == 'e' {
|
||||
var eid []byte
|
||||
if eid, err = hex.DecAppend(eid, tagValue); chk.E(err) {
|
||||
return
|
||||
}
|
||||
key = prefixes.TagEventId.Key(id.New(eventid.NewWith(eid)))
|
||||
} else if k, pkb, d := eventstore.DecodeAddressTag(tagValue); len(pkb) == 32 {
|
||||
// store value in the new special "a" tag index
|
||||
var pk *pubkey.T
|
||||
if pk, err = pubkey.NewFromBytes(pkb); chk.E(err) {
|
||||
@@ -35,7 +45,7 @@ func GetTagKeyPrefix(tagValue string) (key []byte, err error) {
|
||||
els = append(els, arb.New(d))
|
||||
}
|
||||
key = prefixes.TagAddr.Key(els...)
|
||||
} else if pkb, _ := hex.Dec(tagValue); len(pkb) == 32 {
|
||||
} else if pkb, _ := hex.DecAppend(nil, tagValue); len(pkb) == 32 {
|
||||
// store value as bytes
|
||||
var pkk *pubkey.T
|
||||
if pkk, err = pubkey.NewFromBytes(pkb); chk.E(err) {
|
||||
|
||||
@@ -33,7 +33,7 @@ type FulltextIndexKey struct {
|
||||
pubkey []byte
|
||||
timestamp *timestamp.T
|
||||
kind *kind.T
|
||||
sequence uint32
|
||||
sequence *integer.T
|
||||
serial *serial.T
|
||||
}
|
||||
|
||||
@@ -51,42 +51,62 @@ func (f *FulltextIndexKey) Segment(start, end int) []byte {
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Word() (v []byte) {
|
||||
if f.word != nil {
|
||||
return f.word
|
||||
}
|
||||
v = f.key[index.Len:f.endOfWord]
|
||||
f.word = v
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) EventId() (v *eventid.T) {
|
||||
if f.eventid != nil {
|
||||
return f.eventid
|
||||
}
|
||||
v = eventid.NewWith(f.Segment(StartOfEventId, StartOfPubkey))
|
||||
f.eventid = v
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Pubkey() (v []byte) {
|
||||
if f.pubkey != nil {
|
||||
return f.pubkey
|
||||
}
|
||||
v = f.Segment(StartOfPubkey, StartOfTimestamp)
|
||||
f.pubkey = v
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Timestamp() (v *timestamp.T) {
|
||||
if f.timestamp != nil {
|
||||
return f.timestamp
|
||||
}
|
||||
v = timestamp.FromBytes(f.Segment(StartOfTimestamp, StartOfKind))
|
||||
f.timestamp = v
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Kind() (v *kind.T) {
|
||||
if f.kind != nil {
|
||||
return f.kind
|
||||
}
|
||||
v = kind.NewFromBytes(f.Segment(StartOfKind, StartOfSequence))
|
||||
f.kind = v
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Sequence() (v *integer.T) {
|
||||
v = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
|
||||
f.sequence = v.Val
|
||||
if f.sequence != nil {
|
||||
return f.sequence
|
||||
}
|
||||
f.sequence = integer.NewFrom(f.Segment(StartOfSequence, StartOfSerial))
|
||||
return
|
||||
}
|
||||
|
||||
func (f *FulltextIndexKey) Serial() (v *serial.T) {
|
||||
if f.serial != nil {
|
||||
return f.serial
|
||||
}
|
||||
v = serial.New(f.Segment(StartOfSerial, len(f.key)))
|
||||
f.serial = v
|
||||
return
|
||||
|
||||
@@ -132,6 +132,14 @@ const (
|
||||
//
|
||||
// [ 16 ][ ISO639-2 code ][ serial ]
|
||||
LangIndex
|
||||
|
||||
// TagEventId is a tag that is used to search for events containing e tag references to
|
||||
// other events. This will greatly accelerate searches for threaded discussion and enable
|
||||
// the creation of composite documents with a directed acyclic graph structure, such as git
|
||||
// commits.
|
||||
//
|
||||
// [ 17 ][ 8 bytes eventid.T prefix ][ 8 bytes Serial ]
|
||||
TagEventId
|
||||
)
|
||||
|
||||
// FilterPrefixes is a slice of the prefixes used by filter index to enable a loop
|
||||
|
||||
@@ -126,15 +126,19 @@ func PrepareQueries(f *filter.T) (
|
||||
ext = &filter.T{Kinds: f.Kinds}
|
||||
i := 0
|
||||
for _, values := range f.Tags.ToSliceOfTags() {
|
||||
for _, value := range values.ToSliceOfBytes()[1:] {
|
||||
// get key prefix (with full length) and offset where to write the last parts
|
||||
var prf []byte
|
||||
if prf, err = GetTagKeyPrefix(string(value)); chk.E(err) {
|
||||
continue
|
||||
tsb := values.ToSliceOfBytes()
|
||||
// indexable tags can only have 1 character in the key field.
|
||||
if len(tsb[0]) == 1 {
|
||||
for _, value := range tsb[1:] {
|
||||
// get key prefix (with full length) and offset where to write the last parts
|
||||
var prf []byte
|
||||
if prf, err = GetTagKeyPrefix(tsb[0][0], value); chk.E(err) {
|
||||
continue
|
||||
}
|
||||
// remove the last part to get just the prefix we want here
|
||||
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
|
||||
i++
|
||||
}
|
||||
// remove the last part to get just the prefix we want here
|
||||
qs[i] = query{index: i, queryFilter: f, searchPrefix: prf}
|
||||
i++
|
||||
}
|
||||
}
|
||||
// log.T.S("tags", qs)
|
||||
|
||||
@@ -44,6 +44,9 @@ func (s *Server) acceptEvent(c context.T, evt *event.T, authedPubkey []byte,
|
||||
}
|
||||
// check the mute list, and reject events authored by muted pubkeys, even if
|
||||
// they come from a pubkey that is on the follow list.
|
||||
//
|
||||
// note that some clients hide this info in the kind 10000 mute list, unfortunately.
|
||||
// such as jumble. use old nostrudel or similar which still gives public readable info.
|
||||
for pk := range s.muted {
|
||||
if bytes.Equal(evt.Pubkey, []byte(pk)) {
|
||||
notice = "rejecting event with pubkey " + hex.Enc(evt.Pubkey) +
|
||||
|
||||
Reference in New Issue
Block a user