diff --git a/ratel/fulltext.go b/ratel/fulltext.go
index 266d481..d8622bd 100644
--- a/ratel/fulltext.go
+++ b/ratel/fulltext.go
@@ -25,6 +25,10 @@ type Words struct {
 	wordMap map[string]struct{}
 }
 
+// FulltextIndex adds serials to the list of serials associated with a given word so a free text
+// search can find text events with specific words and in sequences in the events. This only has
+// to be done one time per event so it stores the serial of the newest index it has already done
+// in a special key.
 func (r *T) FulltextIndex() (err error) {
 	r.WG.Add(1)
 	defer r.WG.Done()
diff --git a/ratel/keys/kinder/kind.go b/ratel/keys/kinder/kind.go
index 4762790..9be6d37 100644
--- a/ratel/keys/kinder/kind.go
+++ b/ratel/keys/kinder/kind.go
@@ -6,6 +6,8 @@ import (
 	"encoding/binary"
 	"io"
 
+	"golang.org/x/exp/constraints"
+
 	"realy.lol/chk"
 	"realy.lol/kind"
 	"realy.lol/ratel/keys"
@@ -20,7 +22,7 @@ type T struct {
 var _ keys.Element = &T{}
 
 // New creates a new kinder.T for reading/writing kind.T values.
-func New[V uint16 | uint32 | int32 | uint64 | int64 | int](c V) (p *T) { return &T{Val: kind.New(c)} }
+func New[V constraints.Integer](c V) (p *T) { return &T{Val: kind.New(c)} }
 
 func Make(c *kind.T) (v []byte) {
 	v = make([]byte, Len)
diff --git a/ratel/keys/lang/lang.go b/ratel/keys/lang/lang.go
new file mode 100644
index 0000000..62d9cb6
--- /dev/null
+++ b/ratel/keys/lang/lang.go
@@ -0,0 +1,48 @@
+// Package lang implements a keys.Element for an ISO-639-2 3 letter language code.
+package lang
+
+import (
+	"io"
+
+	"realy.lol/chk"
+	"realy.lol/ratel/keys"
+)
+
+// Len is the number of bytes in an ISO-639-2 language code.
+const Len = 3
+
+// T is a keys.Element that holds a three letter language code.
+type T struct {
+	Val []byte
+}
+
+var _ keys.Element = &T{}
+
+// New creates a new lang.T for reading/writing language codes. If c is not exactly Len bytes
+// long the returned T holds Len zero bytes instead, which is also how an empty T for Read is
+// made.
+func New[V string | []byte](c V) (p *T) {
+	if len(c) != Len {
+		return &T{Val: make([]byte, Len)}
+	}
+	return &T{Val: []byte(c)}
+}
+
+// Write puts the language code into buf.
+func (c *T) Write(buf io.Writer) {
+	buf.Write(c.Val)
+}
+
+// Read fills the language code from buf, and returns nil if Len bytes could not be read.
+func (c *T) Read(buf io.Reader) (el keys.Element) {
+	c.Val = make([]byte, Len)
+	// io.ReadFull is used because a single Read call is permitted to return fewer bytes than
+	// requested without an error.
+	if _, err := io.ReadFull(buf, c.Val); chk.E(err) {
+		return nil
+	}
+	return c
+}
+
+// Len returns the encoded length of the language code.
+func (c *T) Len() int { return Len }
diff --git a/ratel/keys/lang/lang_test.go b/ratel/keys/lang/lang_test.go
new file mode 100644
index 0000000..8ead4bc
--- /dev/null
+++ b/ratel/keys/lang/lang_test.go
@@ -0,0 +1,22 @@
+package lang
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestT(t *testing.T) {
+	const code = "eng"
+	v := New(code)
+	buf := new(bytes.Buffer)
+	v.Write(buf)
+	buf2 := bytes.NewBuffer(buf.Bytes())
+	v2 := New("")
+	el, ok := v2.Read(buf2).(*T)
+	if !ok {
+		t.Fatal("failed to read language code")
+	}
+	if string(el.Val) != code {
+		t.Fatalf("expected %s got %s", code, el.Val)
+	}
+}
diff --git a/ratel/langindex.go b/ratel/langindex.go
new file mode 100644
index 0000000..3a76d32
--- /dev/null
+++ b/ratel/langindex.go
@@ -0,0 +1,175 @@
+package ratel
+
+import (
+	"errors"
+	"time"
+
+	"github.com/dgraph-io/badger/v4"
+
+	"realy.lol/chk"
+	"realy.lol/event"
+	"realy.lol/kind"
+	"realy.lol/log"
+	"realy.lol/ratel/keys/lang"
+	"realy.lol/ratel/keys/serial"
+	"realy.lol/ratel/prefixes"
+	"realy.lol/tag"
+)
+
+// Langs is the set of ISO-639-2 language codes found in the tags of the event with the serial
+// ser.
+type Langs struct {
+	ser   *serial.T
+	langs []string
+}
+
+// LangIndex searches through events for language tags and stores a LangIndex key containing the
+// ISO-639-2 language code and serial to search for text events by language.
+//
+// The serial of the newest scanned event is stored under the LangLastIndexed key so that a later
+// call resumes from there instead of scanning every event again.
+func (r *T) LangIndex() (err error) {
+	r.WG.Add(1)
+	defer r.WG.Done()
+	log.I.F("indexing language tags")
+	defer log.I.F("finished indexing language tags")
+	langChan := make(chan Langs)
+	done := make(chan struct{})
+	// The writer ends when langChan is closed and LangIndex waits for it, so that a call leaves
+	// no goroutine behind. SaveEvent calls LangIndex for every event it stores.
+	go func() {
+		defer close(done)
+		for l := range langChan {
+			r.writeLangIndex(l)
+		}
+	}()
+	defer func() {
+		close(langChan)
+		<-done
+	}()
+	var last *serial.T
+	// A cursor that cannot be read is only logged, the scan then starts from the beginning.
+	chk.E(r.View(func(txn *badger.Txn) (err error) {
+		var item *badger.Item
+		if item, err = txn.Get(prefixes.LangLastIndexed.Key()); err != nil {
+			// There is no cursor before the first scan, which is not an error.
+			if errors.Is(err, badger.ErrKeyNotFound) {
+				err = nil
+			}
+			return
+		}
+		var val []byte
+		if val, err = item.ValueCopy(nil); err != nil {
+			return
+		}
+		last = serial.New(val)
+		return
+	}))
+	if last == nil {
+		last = serial.New(serial.Make(0))
+	}
+	lprf := prefixes.LangLastIndexed.Key()
+	if err = r.Update(func(txn *badger.Txn) (err error) {
+		it := txn.NewIterator(badger.IteratorOptions{Prefix: prefixes.Event.Key()})
+		defer it.Close()
+		// The event at the cursor is scanned again, which is harmless because it writes the same
+		// keys.
+		for it.Seek(prefixes.Event.Key(last)); it.Valid(); it.Next() {
+			item := it.Item()
+			ser := serial.New(item.KeyCopy(nil)[1:])
+			log.I.F("lang index scanning %d", ser.Uint64())
+			if ser.Uint64() < last.Uint64() {
+				log.I.F("already done %d", ser.Uint64())
+				continue
+			}
+			var val []byte
+			if val, err = item.ValueCopy(nil); chk.E(err) {
+				// The event is skipped; the error is cleared so it cannot discard the
+				// transaction if this was the last event.
+				err = nil
+				continue
+			}
+			ev := &event.T{}
+			if _, err = r.Unmarshal(ev, val); chk.E(err) {
+				return
+			}
+			langs := r.GetLangTags(ev)
+			if err = txn.Set(lprf, ser.Val); chk.E(err) {
+				return
+			}
+			if len(langs) > 0 {
+				l := Langs{ser: ser, langs: langs}
+				log.I.S(l)
+				// The send gives up when the context is closed so it cannot block forever.
+				select {
+				case langChan <- l:
+				case <-r.Ctx.Done():
+					log.I.F("context closed")
+					return
+				}
+			}
+			select {
+			case <-r.Ctx.Done():
+				log.I.F("context closed")
+				return
+			default:
+			}
+		}
+		return
+	}); chk.E(err) {
+		return
+	}
+	return
+}
+
+// writeLangIndex stores a LangIndex key for each language code in l. If the transaction fails it
+// is tried again after a short delay, until it succeeds or the context is closed.
+func (r *T) writeLangIndex(l Langs) {
+	if len(l.langs) < 1 {
+		return
+	}
+	log.I.F("making lang index for %d %v", l.ser.Uint64(), l.langs)
+	for {
+		// This error is local to the writer so that it does not race with the result of
+		// LangIndex.
+		err := r.Update(func(txn *badger.Txn) (err error) {
+			for _, v := range l.langs {
+				key := prefixes.LangIndex.Key(lang.New(v), l.ser)
+				if err = txn.Set(key, nil); chk.E(err) {
+					return
+				}
+			}
+			return
+		})
+		if !chk.E(err) {
+			return
+		}
+		select {
+		case <-r.Ctx.Done():
+			return
+		case <-time.After(time.Second / 4):
+		}
+	}
+}
+
+// GetLangTags returns the ISO-639-2 code of every language named in the l tags of a text note
+// or article event, where the tag value is either the ISO-639-1 or ISO-639-2 code. Events of
+// other kinds give no result.
+func (r *T) GetLangTags(ev *event.T) (langs []string) {
+	if !ev.Kind.OneOf(kind.TextNote, kind.Article) {
+		return
+	}
+	tgs := ev.Tags.GetAll(tag.New("l"))
+	for _, v := range tgs.ToStringsSlice() {
+		// A tag without a value names no language, and reading v[1] from it would panic.
+		if len(v) < 2 || v[1] == "" {
+			continue
+		}
+		for _, w := range LanguageCodes {
+			if v[1] == w.ISO639_1 || v[1] == w.ISO639_2 {
+				langs = append(langs, w.ISO639_2)
+			}
+		}
+	}
+	return
+}
diff --git a/ratel/prefixes/prefixes.go b/ratel/prefixes/prefixes.go
index 6b0fcb8..61ae6f5 100644
--- a/ratel/prefixes/prefixes.go
+++ b/ratel/prefixes/prefixes.go
@@ -127,10 +127,18 @@ const (
 	// [ 16 ] [ 8 byte serial ]
 	FulltextLastIndexed
 
-	// LangIndex is an index of events with language tags.
+	// LangIndex is an index of events with language tags. These use ISO639-2 3-letter codes
+	// regardless of whether an ISO639-1 code exists, because they cover more languages and are
+	// less ambiguous.
 	//
-	// [ 17 ][
+	// [ 17 ][ ISO639-2 code ][ serial ]
 	LangIndex
+
+	// LangLastIndexed is a progress cursor for the LangIndex. The value stored under this key
+	// is the serial of the newest event that has been scanned for an l language tag.
+	//
+	// [ 18 ]
+	LangLastIndexed
 )
 
 // FilterPrefixes is a slice of the prefixes used by filter index to enable a loop
diff --git a/ratel/saveevent.go b/ratel/saveevent.go
index 63efffd..fff9be7 100644
--- a/ratel/saveevent.go
+++ b/ratel/saveevent.go
@@ -129,6 +129,9 @@ func (r *T) SaveEvent(c context.T, ev *event.T) (err error) {
 	if err = r.FulltextIndex(); chk.E(err) {
 		return
 	}
+	if err = r.LangIndex(); chk.E(err) {
+		return
+	}
 	return
 }
 
diff --git a/realy/init.go b/realy/init.go
index 6656ea5..3a436e0 100644
--- a/realy/init.go
+++ b/realy/init.go
@@ -37,6 +37,7 @@ func (s *Server) Init() {
 	}
 	go func() {
 		chk.E(s.Store.FulltextIndex())
+		chk.E(s.Store.LangIndex())
 	}()
 }
 
diff --git a/store/store_interface.go b/store/store_interface.go
index 47b1058..629e26c 100644
--- a/store/store_interface.go
+++ b/store/store_interface.go
@@ -32,6 +32,7 @@ type I interface {
 	EventIdSerialer
 	Accountant
 	Fulltexter
+	Languager
 }
 
 type Initer interface {
@@ -131,3 +132,8 @@ type EventIdSerialer interface {
 type Fulltexter interface {
 	FulltextIndex() (err error)
 }
+
+// Languager builds the index of events by their language tags.
+type Languager interface {
+	LangIndex() (err error)
+}