completed language tag indexer

This commit is contained in:
2025-05-13 17:29:19 -01:06
parent af588e24ae
commit 001ccb0d5b
9 changed files with 223 additions and 3 deletions

View File

@@ -25,6 +25,10 @@ type Words struct {
wordMap map[string]struct{} wordMap map[string]struct{}
} }
// FulltextIndex adds serials to the list of serials associated with a given word so a free text
// search can find text events containing specific words, and sequences of words. Each event
// only has to be indexed once, so the serial of the newest event that has already been indexed
// is stored under a special key.
func (r *T) FulltextIndex() (err error) { func (r *T) FulltextIndex() (err error) {
r.WG.Add(1) r.WG.Add(1)
defer r.WG.Done() defer r.WG.Done()

View File

@@ -6,6 +6,8 @@ import (
"encoding/binary" "encoding/binary"
"io" "io"
"golang.org/x/exp/constraints"
"realy.lol/chk" "realy.lol/chk"
"realy.lol/kind" "realy.lol/kind"
"realy.lol/ratel/keys" "realy.lol/ratel/keys"
@@ -20,7 +22,7 @@ type T struct {
var _ keys.Element = &T{} var _ keys.Element = &T{}
// New creates a new kinder.T for reading/writing kind.T values. // New creates a new kinder.T for reading/writing kind.T values.
func New[V uint16 | uint32 | int32 | uint64 | int64 | int](c V) (p *T) { return &T{Val: kind.New(c)} } func New[V constraints.Integer](c V) (p *T) { return &T{Val: kind.New(c)} }
func Make(c *kind.T) (v []byte) { func Make(c *kind.T) (v []byte) {
v = make([]byte, Len) v = make([]byte, Len)

39
ratel/keys/lang/lang.go Normal file
View File

@@ -0,0 +1,39 @@
// Package lang implements a keys.Element for an ISO-639-2 3 letter language code.
package lang
import (
"io"
"realy.lol/chk"
"realy.lol/ratel/keys"
)
// Len is the number of bytes in an encoded language code (a 3 letter ISO-639-2 code).
const Len = 3
// T is a keys.Element that carries a language code as raw bytes.
type T struct {
	// Val holds the code; New and Read always size it to Len bytes.
	Val []byte
}
var _ keys.Element = &T{}
// New creates a new lang.T holding the given 3 letter ISO-639-2 language code, for
// reading/writing the language field of an index key.
//
// The code may be supplied as a string or a byte slice. If it is not exactly Len bytes long the
// returned element holds Len zero bytes instead, which makes New("") a convenient way to get an
// empty element to Read into. The bytes are always copied, so the element never shares memory
// with the caller's slice.
func New[V string | []byte](c V) (p *T) {
	val := make([]byte, Len)
	if len(c) == Len {
		copy(val, c)
	}
	return &T{Val: val}
}
// Write emits the raw bytes of the language code to buf.
//
// The method has no error return, so the byte count and error reported by the underlying
// writer are deliberately discarded.
func (c *T) Write(buf io.Writer) {
	_, _ = buf.Write(c.Val)
}
// Read fills the element with exactly Len bytes taken from buf and returns the element, or
// returns nil if that many bytes could not be read.
//
// io.ReadFull is used rather than a single Read call because an io.Reader is allowed to return
// fewer bytes than requested without reporting an error; a lone Read would make a slow or
// chunked reader look like a truncated key.
func (c *T) Read(buf io.Reader) (el keys.Element) {
	c.Val = make([]byte, Len)
	if _, err := io.ReadFull(buf, c.Val); chk.E(err) {
		return nil
	}
	return c
}
// Len reports the encoded size of the element in bytes, which is always the package constant
// Len.
func (c *T) Len() int {
	return Len
}

View File

@@ -0,0 +1,21 @@
package lang
import (
"bytes"
"testing"
"realy.lol/kind"
)
func TestT(t *testing.T) {
n := kind.New(1059)
v := New(n.ToU16())
buf := new(bytes.Buffer)
v.Write(buf)
buf2 := bytes.NewBuffer(buf.Bytes())
v2 := New(0)
el := v2.Read(buf2).(*T)
if el.Val.ToU16() != n.ToU16() {
t.Fatalf("expected %d got %d", n, el.Val)
}
}

139
ratel/langindex.go Normal file
View File

@@ -0,0 +1,139 @@
package ratel
import (
"time"
"github.com/dgraph-io/badger/v4"
"realy.lol/chk"
"realy.lol/event"
"realy.lol/kind"
"realy.lol/log"
"realy.lol/ratel/keys/lang"
"realy.lol/ratel/keys/serial"
"realy.lol/ratel/prefixes"
"realy.lol/tag"
)
// Langs pairs an event serial with the language codes found on that event. It is the unit of
// work passed over the channel inside LangIndex from the event scan to the index writer.
type Langs struct {
	// ser is the serial of the event the language codes were found on.
	ser *serial.T
	// langs holds the language codes returned by GetLangTags for that event.
	langs []string
}
// LangIndex searches through events for language tags and stores a LangIndex key containing the
// ISO-639-2 language code and serial to search for text events by language.
//
// The scan resumes from the serial stored under the LangLastIndexed key (or from zero when no
// cursor exists yet) and advances that cursor as events are processed. Index keys are written
// by a helper goroutine in their own transactions; the goroutine is stopped and waited for
// before LangIndex returns, so no goroutine outlives the call and the WaitGroup is only
// released once all pending index writes have finished or the context was cancelled.
//
// The returned error is the error of the scanning transaction, if any.
func (r *T) LangIndex() (err error) {
	r.WG.Add(1)
	defer r.WG.Done()
	log.I.F("indexing language tags")
	defer log.I.F("finished indexing language tags")
	// Load the progress cursor. A failure here is logged but not fatal: the scan simply starts
	// from serial zero again, and rewriting existing index keys is harmless.
	var last *serial.T
	chk.E(r.View(func(txn *badger.Txn) (err error) {
		var item *badger.Item
		if item, err = txn.Get(prefixes.LangLastIndexed.Key()); err != nil {
			if err == badger.ErrKeyNotFound {
				// First run: there is no cursor yet, which is not an error.
				err = nil
			}
			return
		}
		var val []byte
		if val, err = item.ValueCopy(nil); err != nil {
			return
		}
		last = serial.New(val)
		return
	}))
	if last == nil {
		last = serial.New(serial.Make(0))
	}
	langChan := make(chan Langs)
	done := make(chan struct{})
	go func() {
		defer close(done)
		for l := range langChan {
			if len(l.langs) < 1 {
				continue
			}
			log.I.F("making lang index for %d %v", l.ser.Uint64(), l.langs)
			for {
				// werr is local to this goroutine; writing to the enclosing function's named
				// result from here would race with the scan below.
				werr := r.Update(func(txn *badger.Txn) (err error) {
					// Every language of the event gets its own key.
					for _, v := range l.langs {
						key := prefixes.LangIndex.Key(lang.New(v), l.ser)
						if err = txn.Set(key, nil); chk.E(err) {
							return
						}
					}
					return
				})
				if !chk.E(werr) {
					break
				}
				// Back off before retrying, but give up as soon as the context is cancelled
				// so a persistent failure cannot spin forever during shutdown.
				select {
				case <-r.Ctx.Done():
					return
				case <-time.After(time.Second / 4):
				}
			}
		}
	}()
	// Closing the channel tells the writer there is no more work; waiting on done makes sure it
	// has finished before the deferred log line and WG.Done above run.
	defer func() {
		close(langChan)
		<-done
	}()
	if err = r.Update(func(txn *badger.Txn) (err error) {
		it := txn.NewIterator(badger.IteratorOptions{Prefix: prefixes.Event.Key()})
		defer it.Close()
		for it.Seek(prefixes.Event.Key(last)); it.Valid(); it.Next() {
			item := it.Item()
			k := item.KeyCopy(nil)
			ser := serial.New(k[1:])
			log.I.F("lang index scanning %d", ser.Uint64())
			if ser.Uint64() < last.Uint64() {
				log.I.F("already done %d", ser.Uint64())
				continue
			}
			var val []byte
			if val, err = item.ValueCopy(nil); chk.E(err) {
				// Skip this event, and clear the error so that skipping the final event does
				// not abort the whole transaction.
				err = nil
				continue
			}
			ev := &event.T{}
			if _, err = r.Unmarshal(ev, val); chk.E(err) {
				return
			}
			langs := r.GetLangTags(ev)
			if len(langs) > 0 {
				l := Langs{ser: ser, langs: langs}
				log.I.S(l)
				// The writer stops receiving once the context is cancelled, so the hand-off
				// must watch the context too or it could block forever.
				select {
				case langChan <- l:
				case <-r.Ctx.Done():
					log.I.F("context closed")
					return
				}
			}
			// Advance the cursor only after the event's languages were handed to the writer,
			// so a cancelled hand-off is retried on the next run.
			lprf := prefixes.LangLastIndexed.Key()
			if err = txn.Set(lprf, ser.Val); chk.E(err) {
				return
			}
			select {
			case <-r.Ctx.Done():
				log.I.F("context closed")
				return
			default:
			}
		}
		return
	}); chk.E(err) {
		return
	}
	return
}
// GetLangTags returns the ISO-639-2 codes named by the "l" tags of ev.
//
// Only text note and article events are considered; any other kind yields nil. The value of
// each "l" tag is matched against both the ISO639_1 and ISO639_2 fields of LanguageCodes and
// the matching entry's ISO639_2 code is returned, each code at most once. Tags that have no
// value, or an empty one, are ignored.
func (r *T) GetLangTags(ev *event.T) (langs []string) {
	if !ev.Kind.OneOf(kind.TextNote, kind.Article) {
		return
	}
	tgs := ev.Tags.GetAll(tag.New("l"))
	for _, v := range tgs.ToStringsSlice() {
		// The code is read from the second element. A tag too short to have one would panic
		// on v[1], and an empty value would match every entry of LanguageCodes that has an
		// empty ISO639_1 field.
		if len(v) < 2 || v[1] == "" {
			continue
		}
		for _, w := range LanguageCodes {
			if v[1] != w.ISO639_1 && v[1] != w.ISO639_2 {
				continue
			}
			// Repeated tags for one language would only produce the same index key again.
			seen := false
			for _, have := range langs {
				if have == w.ISO639_2 {
					seen = true
					break
				}
			}
			if !seen {
				langs = append(langs, w.ISO639_2)
			}
		}
	}
	return
}

View File

@@ -127,10 +127,16 @@ const (
// [ 16 ] [ 8 byte serial ] // [ 16 ] [ 8 byte serial ]
FulltextLastIndexed FulltextLastIndexed
// LangIndex is an index of events with language tags. // LangIndex is an index of events with language tags. These use ISO639-2 3-letter codes
// regardless of whether an ISO639-1 code also exists, because they cover more languages and
// are less ambiguous.
// //
// [ 17 ][ // [ 17 ][ ISO639-2 code ][ serial ]
LangIndex LangIndex
// LangLastIndexed is a progress cursor for the LangIndex that marks the newest event serial
// that has been scanned for an "l" language tag.
LangLastIndexed
) )
// FilterPrefixes is a slice of the prefixes used by filter index to enable a loop // FilterPrefixes is a slice of the prefixes used by filter index to enable a loop

View File

@@ -129,6 +129,9 @@ func (r *T) SaveEvent(c context.T, ev *event.T) (err error) {
if err = r.FulltextIndex(); chk.E(err) { if err = r.FulltextIndex(); chk.E(err) {
return return
} }
if err = r.LangIndex(); chk.E(err) {
return
}
return return
} }

View File

@@ -37,6 +37,7 @@ func (s *Server) Init() {
} }
go func() { go func() {
chk.E(s.Store.FulltextIndex()) chk.E(s.Store.FulltextIndex())
chk.E(s.Store.LangIndex())
}() }()
} }

View File

@@ -32,6 +32,7 @@ type I interface {
EventIdSerialer EventIdSerialer
Accountant Accountant
Fulltexter Fulltexter
Languager
} }
type Initer interface { type Initer interface {
@@ -131,3 +132,7 @@ type EventIdSerialer interface {
type Fulltexter interface { type Fulltexter interface {
FulltextIndex() (err error) FulltextIndex() (err error)
} }
// Languager is the interface for a store that can build the language tag index.
type Languager interface {
	// LangIndex runs the language tag indexer and returns any error it encountered.
	LangIndex() (err error)
}