completed language tag indexer
This commit is contained in:
@@ -25,6 +25,10 @@ type Words struct {
|
||||
wordMap map[string]struct{}
|
||||
}
|
||||
|
||||
// FulltextIndex adds serials to the list of serials associated with a given word so a free text
|
||||
// search can find text events with specific words and in sequences in the events. This only has
|
||||
// to be done one time per event so it stores the serial of the newest index it has already done
|
||||
// in a special key.
|
||||
func (r *T) FulltextIndex() (err error) {
|
||||
r.WG.Add(1)
|
||||
defer r.WG.Done()
|
||||
|
||||
@@ -6,6 +6,8 @@ import (
|
||||
"encoding/binary"
|
||||
"io"
|
||||
|
||||
"golang.org/x/exp/constraints"
|
||||
|
||||
"realy.lol/chk"
|
||||
"realy.lol/kind"
|
||||
"realy.lol/ratel/keys"
|
||||
@@ -20,7 +22,7 @@ type T struct {
|
||||
var _ keys.Element = &T{}
|
||||
|
||||
// New creates a new kinder.T for reading/writing kind.T values.
|
||||
func New[V uint16 | uint32 | int32 | uint64 | int64 | int](c V) (p *T) { return &T{Val: kind.New(c)} }
|
||||
func New[V constraints.Integer](c V) (p *T) { return &T{Val: kind.New(c)} }
|
||||
|
||||
func Make(c *kind.T) (v []byte) {
|
||||
v = make([]byte, Len)
|
||||
|
||||
39
ratel/keys/lang/lang.go
Normal file
39
ratel/keys/lang/lang.go
Normal file
@@ -0,0 +1,39 @@
|
||||
// Package lang implements a keys.Element for an ISO-639-2 3 letter language code.
|
||||
package lang
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"realy.lol/chk"
|
||||
"realy.lol/ratel/keys"
|
||||
)
|
||||
|
||||
// Len is the number of bytes a language code occupies in a key; New and Read both size Val to
// this length.
const Len = 3
|
||||
|
||||
// T is a keys.Element that carries a language code as raw bytes.
type T struct {
|
||||
// Val holds the code bytes; New and Read always leave it Len bytes long.
Val []byte
|
||||
}
|
||||
|
||||
// Compile-time assertion that *T satisfies keys.Element.
var _ keys.Element = &T{}
|
||||
|
||||
// New creates a new kinder.T for reading/writing kind.T values.
|
||||
func New[V string | []byte](c V) (p *T) {
|
||||
if len(c) != Len {
|
||||
return &T{Val: make([]byte, Len)}
|
||||
}
|
||||
return &T{Val: []byte(c)}
|
||||
}
|
||||
|
||||
func (c *T) Write(buf io.Writer) {
|
||||
buf.Write(c.Val)
|
||||
}
|
||||
|
||||
func (c *T) Read(buf io.Reader) (el keys.Element) {
|
||||
c.Val = make([]byte, Len)
|
||||
if n, err := buf.Read(c.Val); chk.E(err) || n != Len {
|
||||
return nil
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Len returns the encoded size of the element in bytes, which is always the package constant Len.
func (c *T) Len() int { return Len }
|
||||
21
ratel/keys/lang/lang_test.go
Normal file
21
ratel/keys/lang/lang_test.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package lang
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
|
||||
"realy.lol/kind"
|
||||
)
|
||||
|
||||
func TestT(t *testing.T) {
|
||||
n := kind.New(1059)
|
||||
v := New(n.ToU16())
|
||||
buf := new(bytes.Buffer)
|
||||
v.Write(buf)
|
||||
buf2 := bytes.NewBuffer(buf.Bytes())
|
||||
v2 := New(0)
|
||||
el := v2.Read(buf2).(*T)
|
||||
if el.Val.ToU16() != n.ToU16() {
|
||||
t.Fatalf("expected %d got %d", n, el.Val)
|
||||
}
|
||||
}
|
||||
139
ratel/langindex.go
Normal file
139
ratel/langindex.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package ratel
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/dgraph-io/badger/v4"
|
||||
|
||||
"realy.lol/chk"
|
||||
"realy.lol/event"
|
||||
"realy.lol/kind"
|
||||
"realy.lol/log"
|
||||
"realy.lol/ratel/keys/lang"
|
||||
"realy.lol/ratel/keys/serial"
|
||||
"realy.lol/ratel/prefixes"
|
||||
"realy.lol/tag"
|
||||
)
|
||||
|
||||
// Langs pairs an event serial with the language codes found on that event. LangIndex sends these
// over a channel from its scan loop to the goroutine that writes the index keys.
type Langs struct {
|
||||
// ser is the serial taken from the scanned event's key.
ser *serial.T
|
||||
// langs are the codes returned by GetLangTags for that event.
langs []string
|
||||
}
|
||||
|
||||
// LangIndex searches through events for language tags and stores a LangIndex key containing the
|
||||
// ISO-639-2 language code and serial to search for text events by language.
|
||||
func (r *T) LangIndex() (err error) {
|
||||
r.WG.Add(1)
|
||||
defer r.WG.Done()
|
||||
log.I.F("indexing language tags")
|
||||
defer log.I.F("finished indexing language tags")
|
||||
langChan := make(chan Langs)
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-r.Ctx.Done():
|
||||
return
|
||||
case l := <-langChan:
|
||||
if len(l.langs) < 1 {
|
||||
continue
|
||||
}
|
||||
log.I.S("making lang index for %d %v", l.ser.Uint64(), l.langs)
|
||||
retry:
|
||||
if err = r.Update(func(txn *badger.Txn) (err error) {
|
||||
for _, v := range l.langs {
|
||||
select {
|
||||
case <-r.Ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
key := prefixes.LangIndex.Key(lang.New(v), l.ser)
|
||||
if err = txn.Set(key, nil); chk.E(err) {
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
return
|
||||
}); chk.E(err) {
|
||||
time.Sleep(time.Second / 4)
|
||||
goto retry
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}()
|
||||
var last *serial.T
|
||||
if err = r.View(func(txn *badger.Txn) (err error) {
|
||||
var item *badger.Item
|
||||
if item, err = txn.Get(prefixes.LangLastIndexed.Key()); chk.E(err) {
|
||||
return
|
||||
}
|
||||
var val []byte
|
||||
if val, err = item.ValueCopy(nil); chk.E(err) {
|
||||
return
|
||||
}
|
||||
last = serial.New(val)
|
||||
return
|
||||
}); chk.E(err) {
|
||||
}
|
||||
if last == nil {
|
||||
last = serial.New(serial.Make(0))
|
||||
}
|
||||
if err = r.Update(func(txn *badger.Txn) (err error) {
|
||||
it := txn.NewIterator(badger.IteratorOptions{Prefix: prefixes.Event.Key()})
|
||||
defer it.Close()
|
||||
for it.Seek(prefixes.Event.Key(last)); it.Valid(); it.Next() {
|
||||
item := it.Item()
|
||||
k := item.KeyCopy(nil)
|
||||
ser := serial.New(k[1:])
|
||||
log.I.F("lang index scanning %d", ser.Uint64())
|
||||
if ser.Uint64() < last.Uint64() {
|
||||
k = k[:0]
|
||||
log.I.F("already done %d", ser.Uint64())
|
||||
continue
|
||||
}
|
||||
var val []byte
|
||||
if val, err = item.ValueCopy(nil); chk.E(err) {
|
||||
continue
|
||||
}
|
||||
ev := &event.T{}
|
||||
if _, err = r.Unmarshal(ev, val); chk.E(err) {
|
||||
return
|
||||
}
|
||||
langs := r.GetLangTags(ev)
|
||||
lprf := prefixes.LangLastIndexed.Key()
|
||||
if err = txn.Set(lprf, ser.Val); chk.E(err) {
|
||||
return
|
||||
}
|
||||
if len(langs) > 0 {
|
||||
l := Langs{ser: ser, langs: langs}
|
||||
log.I.S(l)
|
||||
langChan <- l
|
||||
}
|
||||
select {
|
||||
case <-r.Ctx.Done():
|
||||
log.I.F("context closed")
|
||||
return
|
||||
default:
|
||||
}
|
||||
}
|
||||
return
|
||||
}); chk.E(err) {
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (r *T) GetLangTags(ev *event.T) (langs []string) {
|
||||
if ev.Kind.OneOf(kind.TextNote, kind.Article) {
|
||||
tgs := ev.Tags.GetAll(tag.New("l"))
|
||||
tgsl := tgs.ToStringsSlice()
|
||||
for _, v := range tgsl {
|
||||
for _, w := range LanguageCodes {
|
||||
if v[1] == w.ISO639_1 || v[1] == w.ISO639_2 {
|
||||
langs = append(langs, w.ISO639_2)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -127,10 +127,16 @@ const (
|
||||
// [ 16 ] [ 8 byte serial ]
|
||||
FulltextLastIndexed
|
||||
|
||||
// LangIndex is an index of events with language tags.
|
||||
// LangIndex is an index of events with language tags. These use ISO639-2 3-letter codes
|
||||
// regardless of if there is ISO639-1 codes because they cover more languages and are less
|
||||
// ambiguous.
|
||||
//
|
||||
// [ 17 ][
|
||||
// [ 17 ][ ISO639-2 code ][ serial ]
|
||||
LangIndex
|
||||
|
||||
// LangLastIndexed is a progress cursor for the LangIndex that marks the newest index that
|
||||
// has been scanned for an l language tag
|
||||
LangLastIndexed
|
||||
)
|
||||
|
||||
// FilterPrefixes is a slice of the prefixes used by filter index to enable a loop
|
||||
|
||||
@@ -129,6 +129,9 @@ func (r *T) SaveEvent(c context.T, ev *event.T) (err error) {
|
||||
if err = r.FulltextIndex(); chk.E(err) {
|
||||
return
|
||||
}
|
||||
if err = r.LangIndex(); chk.E(err) {
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ func (s *Server) Init() {
|
||||
}
|
||||
go func() {
|
||||
chk.E(s.Store.FulltextIndex())
|
||||
chk.E(s.Store.LangIndex())
|
||||
}()
|
||||
}
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ type I interface {
|
||||
EventIdSerialer
|
||||
Accountant
|
||||
Fulltexter
|
||||
Languager
|
||||
}
|
||||
|
||||
type Initer interface {
|
||||
@@ -131,3 +132,7 @@ type EventIdSerialer interface {
|
||||
// Fulltexter is implemented by stores that can build the full-text word index.
type Fulltexter interface {
|
||||
// FulltextIndex runs the full-text indexer and returns any error it encounters.
FulltextIndex() (err error)
|
||||
}
|
||||
|
||||
// Languager is implemented by stores that can index events by their language tags.
type Languager interface {
|
||||
// LangIndex runs the language tag indexer and returns any error it encounters.
LangIndex() (err error)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user