optimize e and p tags

This commit is contained in:
2025-11-22 19:40:48 +00:00
parent 5c12c467b7
commit ef51382760
15 changed files with 2564 additions and 6 deletions

View File

@@ -16,7 +16,7 @@ import (
)
const (
currentVersion uint32 = 4
currentVersion uint32 = 5
)
func (d *D) RunMigrations() {
@@ -90,6 +90,13 @@ func (d *D) RunMigrations() {
// bump to version 4
_ = d.writeVersionTag(4)
}
if dbVersion < 5 {
log.I.F("migrating to version 5...")
// re-encode events with optimized tag binary format (e/p tags)
d.ReencodeEventsWithOptimizedTags()
// bump to version 5
_ = d.writeVersionTag(5)
}
}
// writeVersionTag writes a new version tag key to the database (no value)
@@ -537,3 +544,140 @@ func (d *D) ConvertSmallEventsToInline() {
log.I.F("migration complete: converted %d events to optimized inline storage, deleted %d old keys", convertedCount, deletedCount)
}
// ReencodeEventsWithOptimizedTags re-encodes all events to use the new binary
// tag format that stores e/p tag values as 33-byte binary (32-byte hash + null)
// instead of 64-byte hex strings. This reduces memory usage by ~48% for these tags.
//
// The migration runs in two phases: a read-only scan of every record under the
// regular event prefix that collects the events whose re-encoded form differs
// from what is stored, followed by batched writes of the new encodings.
//
// It is best-effort: records that fail to load or decode, and writes that
// fail, are logged and skipped. The totals in the final log line only count
// writes that belong to a batch whose transaction committed.
func (d *D) ReencodeEventsWithOptimizedTags() {
	log.I.F("re-encoding events with optimized tag binary format...")
	var err error
	// eventUpdate is one pending rewrite. Only the size delta against the old
	// encoding is kept, so the old value is not held in memory until the end.
	type eventUpdate struct {
		Key     []byte
		NewData []byte
		Saved   int
	}
	var updates []eventUpdate
	var processedCount, savedBytes int

	// hasOptimizableTag reports whether ev carries at least one e or p tag
	// that has a value field.
	hasOptimizableTag := func(ev *event.E) bool {
		if ev.Tags == nil || ev.Tags.Len() == 0 {
			return false
		}
		for _, t := range *ev.Tags {
			if t.Len() < 2 {
				continue
			}
			tagKey := t.Key()
			if len(tagKey) == 1 && (tagKey[0] == 'e' || tagKey[0] == 'p') {
				return true
			}
		}
		return false
	}

	// collectUpdates walks the iterator and queues every event whose
	// re-encoded bytes differ from the stored bytes.
	collectUpdates := func(it *badger.Iterator) error {
		for it.Rewind(); it.Valid(); it.Next() {
			item := it.Item()
			key := item.KeyCopy(nil)
			var val []byte
			if val, err = item.ValueCopy(nil); chk.E(err) {
				continue
			}
			// Regular event storage - data is in value
			if len(val) == 0 {
				continue
			}
			// Decode the event
			ev := new(event.E)
			if err = ev.UnmarshalBinary(bytes.NewBuffer(val)); chk.E(err) {
				continue
			}
			if !hasOptimizableTag(ev) {
				continue
			}
			// Re-encode the event (this will apply the new tag optimization)
			newData := ev.MarshalBinaryToBytes(nil)
			// Only update if the data actually changed
			if bytes.Equal(val, newData) {
				continue
			}
			updates = append(updates, eventUpdate{
				Key:     key,
				NewData: newData,
				Saved:   len(val) - len(newData),
			})
		}
		return nil
	}

	// Only process regular "evt" prefix events (not inline storage)
	// Inline storage (sev, rev, aev) already benefits from the optimization
	// because the binary data is stored directly in the key
	prf := new(bytes.Buffer)
	if err = indexes.EventEnc(nil).MarshalWrite(prf); chk.E(err) {
		return
	}
	evtPrefix := prf.Bytes()

	// Collect updates from regular events only
	if err = d.View(func(txn *badger.Txn) error {
		it := txn.NewIterator(badger.IteratorOptions{Prefix: evtPrefix})
		defer it.Close()
		return collectUpdates(it)
	}); chk.E(err) {
		return
	}
	log.I.F("found %d events with e/p tags to re-encode", len(updates))
	if len(updates) == 0 {
		log.I.F("no events need re-encoding")
		return
	}

	// Apply updates in batches
	const batchSize = 1000
	for i := 0; i < len(updates); i += batchSize {
		end := i + batchSize
		if end > len(updates) {
			end = len(updates)
		}
		batch := updates[i:end]
		// Per-batch tallies are only folded into the totals once the
		// transaction has committed, so a failed commit is not counted.
		var batchCount, batchSaved int
		if err = d.Update(func(txn *badger.Txn) error {
			batchCount, batchSaved = 0, 0
			for _, upd := range batch {
				// Since we're only processing regular events (evt prefix),
				// we just update the value directly
				if setErr := txn.Set(upd.Key, upd.NewData); chk.E(setErr) {
					log.W.F("failed to update event: %v", setErr)
					continue
				}
				batchCount++
				batchSaved += upd.Saved
			}
			return nil
		}); chk.E(err) {
			log.W.F("batch update failed: %v", err)
			continue
		}
		processedCount += batchCount
		savedBytes += batchSaved
		if (i/batchSize)%10 == 0 && i > 0 {
			// Report the number of queued updates handled so far, not the
			// index at which the current batch started.
			log.I.F("progress: %d/%d events re-encoded", end, len(updates))
		}
	}
	log.I.F("migration complete: re-encoded %d events, saved approximately %d bytes (%.2f KB)",
		processedCount, savedBytes, float64(savedBytes)/1024.0)
}

View File

@@ -291,3 +291,43 @@ func BenchmarkTagsToSliceOfSliceOfStrings(b *testing.B) {
})
}
func BenchmarkTagEquals(b *testing.B) {
b.Run("BinaryToBinary", func(b *testing.B) {
b.ReportAllocs()
// Create two tags with same binary-encoded value
tag1 := New()
_, _ = tag1.Unmarshal([]byte(`["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`))
tag2 := New()
_, _ = tag2.Unmarshal([]byte(`["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`))
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = tag1.Equals(tag2)
}
})
b.Run("BinaryToHex", func(b *testing.B) {
b.ReportAllocs()
// One binary-encoded, one hex (simulate comparison with non-optimized tag)
tag1 := New()
_, _ = tag1.Unmarshal([]byte(`["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`))
// Create hex version manually (simulating older format)
tag2 := NewFromBytesSlice([]byte("e"), []byte("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"))
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = tag1.Equals(tag2)
}
})
b.Run("HexToHex", func(b *testing.B) {
b.ReportAllocs()
// Both hex (non-optimized tags)
tag1 := NewFromBytesSlice([]byte("t"), []byte("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"))
tag2 := NewFromBytesSlice([]byte("t"), []byte("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"))
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = tag1.Equals(tag2)
}
})
}

View File

@@ -7,6 +7,7 @@ import (
"bytes"
"lol.mleku.dev/errorf"
"next.orly.dev/pkg/encoders/hex"
"next.orly.dev/pkg/encoders/text"
"next.orly.dev/pkg/utils"
)
@@ -18,6 +19,22 @@ const (
Relay
)
// Binary encoding constants for optimized storage of hex-encoded identifiers.
// A 64-character hex identifier is held internally as its 32 raw bytes followed
// by a single null byte, which doubles as the marker for the binary form.
const (
// BinaryEncodedLen is the length in bytes of a binary-encoded 32-byte hash
// including its trailing null terminator (HashLen + 1).
BinaryEncodedLen = 33
// HexEncodedLen is the length in characters of a hex-encoded 32-byte hash
// (two hex digits per byte).
HexEncodedLen = 64
// HashLen is the raw length in bytes of a hash (pubkey/event ID). It is also
// the index of the null terminator inside a binary-encoded value.
HashLen = 32
)
// binaryOptimizedTags is the set of tag keys whose value field is eligible for
// the binary encoding optimization. It is consulted by shouldOptimize; keys
// that are absent read as false.
var binaryOptimizedTags = map[string]bool{
"e": true, // event references
"p": true, // pubkey references
}
// T is a single tag: an ordered list of fields, each held as a byte slice.
// The field at index Value may be held in the 33-byte binary form (see
// isBinaryEncoded) rather than as hex text; Marshal converts it back to hex.
type T struct {
T [][]byte
}
@@ -76,6 +93,7 @@ func (t *T) Contains(s []byte) (b bool) {
}
// Marshal encodes a tag.T as standard minified JSON array of strings.
// Binary-encoded values (e/p tags) are automatically converted back to hex.
func (t *T) Marshal(dst []byte) (b []byte) {
b = dst
// Pre-allocate buffer if nil to reduce reallocations
@@ -83,14 +101,25 @@ func (t *T) Marshal(dst []byte) (b []byte) {
// Each field might be escaped, so estimate len(field) * 1.5 + 2 quotes + comma
if b == nil && len(t.T) > 0 {
estimatedSize := 2 // brackets
for _, s := range t.T {
estimatedSize += len(s)*3/2 + 4 // escaped field + quotes + comma
for i, s := range t.T {
fieldLen := len(s)
// Binary-encoded fields become hex (33 -> 64 chars)
if i == Value && isBinaryEncoded(s) {
fieldLen = HexEncodedLen
}
estimatedSize += fieldLen*3/2 + 4 // escaped field + quotes + comma
}
b = make([]byte, 0, estimatedSize)
}
b = append(b, '[')
for i, s := range t.T {
b = text.AppendQuote(b, s, text.NostrEscape)
// Convert binary-encoded value fields back to hex for JSON
if i == Value && isBinaryEncoded(s) {
hexVal := hex.EncAppend(nil, s[:HashLen])
b = text.AppendQuote(b, hexVal, text.NostrEscape)
} else {
b = text.AppendQuote(b, s, text.NostrEscape)
}
if i < len(t.T)-1 {
b = append(b, ',')
}
@@ -112,6 +141,8 @@ func (t *T) MarshalJSON() (b []byte, err error) {
}
// Unmarshal decodes a standard minified JSON array of strings to a tags.T.
// For "e" and "p" tags with 64-character hex values, it converts them to
// 33-byte binary format (32 bytes hash + null terminator) for efficiency.
func (t *T) Unmarshal(b []byte) (r []byte, err error) {
var inQuotes, openedBracket bool
var quoteStart int
@@ -135,7 +166,23 @@ func (t *T) Unmarshal(b []byte) (r []byte, err error) {
// original JSON buffer in-place (which would corrupt subsequent parsing).
copyBuf := make([]byte, i-quoteStart)
copy(copyBuf, b[quoteStart:i])
t.T = append(t.T, text.NostrUnescape(copyBuf))
unescaped := text.NostrUnescape(copyBuf)
// Optimize e/p tag values by converting hex to binary
fieldIdx := len(t.T)
if fieldIdx == Value && len(t.T) > 0 && shouldOptimize(t.T[Key], unescaped) {
// Decode hex to binary format: 32 bytes + null terminator
binVal := make([]byte, BinaryEncodedLen)
if _, err = hex.DecBytes(binVal[:HashLen], unescaped); err == nil {
binVal[HashLen] = 0 // null terminator
t.T = append(t.T, binVal)
} else {
// If decode fails, store as-is
t.T = append(t.T, unescaped)
}
} else {
t.T = append(t.T, unescaped)
}
}
}
if !openedBracket || inQuotes {
@@ -193,3 +240,148 @@ func (t *T) ToSliceOfStrings() (s []string) {
}
return
}
// isBinaryEncoded reports whether val has the optimized binary layout: exactly
// BinaryEncodedLen bytes, with the byte at index HashLen (the last one) being
// the null terminator.
func isBinaryEncoded(val []byte) bool {
	if len(val) != BinaryEncodedLen {
		return false
	}
	return val[HashLen] == 0
}
// shouldOptimize reports whether the value of a tag with the given key should
// be stored in the binary encoding.
//
// The key must be a single byte listed in binaryOptimizedTags, and val must be
// exactly HexEncodedLen lowercase hex characters. Values containing uppercase
// hex digits are deliberately left as text: Marshal rebuilds the hex string
// from the raw bytes (via hex.EncAppend, presumed to emit lowercase — confirm),
// so the original letter case could not be reproduced and the tag would no
// longer round-trip byte-for-byte.
func shouldOptimize(key []byte, val []byte) bool {
	if len(key) != 1 || !binaryOptimizedTags[string(key)] {
		return false
	}
	// Only optimize if it's a valid 64-character hex string
	if len(val) != HexEncodedLen || !isValidHex(val) {
		return false
	}
	// isValidHex accepts both cases; reject uppercase so that the
	// hex -> binary -> hex conversion is lossless.
	for _, c := range val {
		if c >= 'A' && c <= 'F' {
			return false
		}
	}
	return true
}
// isValidHex reports whether every byte of b is an ASCII hex digit
// (0-9, a-f or A-F). An empty or nil slice yields true.
func isValidHex(b []byte) bool {
	for i := 0; i < len(b); i++ {
		switch c := b[i]; {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		case 'A' <= c && c <= 'F':
		default:
			return false
		}
	}
	return true
}
// ValueHex returns the tag's value field in hex form. A binary-encoded value
// is converted to hex from its first HashLen bytes; any other value is returned
// unchanged. It returns nil for a nil tag or a tag without a value field.
func (t *T) ValueHex() []byte {
	if t == nil {
		return nil
	}
	if len(t.T) <= Value {
		return nil
	}
	stored := t.T[Value]
	if !isBinaryEncoded(stored) {
		return stored
	}
	// Convert binary back to hex
	return hex.EncAppend(nil, stored[:HashLen])
}
// ValueBinary returns the raw binary value if it's binary-encoded, or nil otherwise.
// This is useful for database operations that need the raw hash bytes.
//
// The result aliases the tag's internal storage (no copy is made), so callers
// must not modify its contents. Its capacity is capped at HashLen so that an
// append by the caller reallocates instead of overwriting the null terminator
// that marks the stored value as binary-encoded.
func (t *T) ValueBinary() []byte {
	if t == nil || len(t.T) <= Value {
		return nil
	}
	val := t.T[Value]
	if !isBinaryEncoded(val) {
		return nil
	}
	return val[:HashLen:HashLen]
}
// Equals reports whether two tags hold the same fields, treating a
// binary-encoded value and its 64-character hex spelling (either letter case)
// as equal. Two nil tags are equal; a nil tag never equals a non-nil one.
// Fields other than the value field are compared byte-for-byte. The comparison
// is done in place and does not allocate.
func (t *T) Equals(other *T) bool {
	if t == nil || other == nil {
		return t == nil && other == nil
	}
	if len(t.T) != len(other.T) {
		return false
	}
	for idx, mine := range t.T {
		theirs := other.T[idx]
		if idx != Value {
			if !bytes.Equal(mine, theirs) {
				return false
			}
			continue
		}
		// The value field may be binary on one side and hex on the other.
		mineBinary, theirsBinary := isBinaryEncoded(mine), isBinaryEncoded(theirs)
		switch {
		case mineBinary && theirsBinary:
			// Both binary - only the hash bytes matter.
			if !bytes.Equal(mine[:HashLen], theirs[:HashLen]) {
				return false
			}
		case mineBinary:
			if !hashEqualsHex(mine[:HashLen], theirs) {
				return false
			}
		case theirsBinary:
			if !hashEqualsHex(theirs[:HashLen], mine) {
				return false
			}
		default:
			// Neither side is binary - plain byte comparison.
			if !bytes.Equal(mine, theirs) {
				return false
			}
		}
	}
	return true
}

// hashEqualsHex reports whether hexText is a HexEncodedLen-character hex
// spelling of the HashLen raw bytes in hash. Decoding happens digit by digit so
// no buffer is allocated; any non-hex character makes the result false.
func hashEqualsHex(hash, hexText []byte) bool {
	if len(hexText) != HexEncodedLen {
		return false
	}
	for j := 0; j < HashLen; j++ {
		var nibbles [2]byte
		for k := range nibbles {
			c := hexText[j*2+k]
			switch {
			case c >= '0' && c <= '9':
				nibbles[k] = c - '0'
			case c >= 'a' && c <= 'f':
				nibbles[k] = c - 'a' + 10
			case c >= 'A' && c <= 'F':
				nibbles[k] = c - 'A' + 10
			default:
				return false
			}
		}
		if hash[j] != (nibbles[0]<<4)|nibbles[1] {
			return false
		}
	}
	return true
}

View File

@@ -30,3 +30,207 @@ func TestMarshalUnmarshal(t *testing.T) {
}
}
}
// TestBinaryEncodingOptimization checks, per case, the internal length of the
// parsed value field, whether it is held in binary form, and that marshaling
// reproduces the input JSON exactly.
func TestBinaryEncodingOptimization(t *testing.T) {
	type scenario struct {
		name         string
		json         string
		expectBinary bool
		internalLen  int // expected length of Value() field in internal storage
	}
	scenarios := []scenario{
		{
			name:         "e tag with hex value",
			json:         `["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`,
			expectBinary: true,
			internalLen:  BinaryEncodedLen, // 33 bytes (32 + null terminator)
		},
		{
			name:         "p tag with hex value",
			json:         `["p","fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210"]`,
			expectBinary: true,
			internalLen:  BinaryEncodedLen,
		},
		{
			name:         "e tag with relay",
			json:         `["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef","wss://relay.example.com"]`,
			expectBinary: true,
			internalLen:  BinaryEncodedLen,
		},
		{
			name:         "t tag not optimized",
			json:         `["t","bitcoin"]`,
			expectBinary: false,
			internalLen:  7, // "bitcoin" as-is
		},
		{
			name:         "e tag with short value not optimized",
			json:         `["e","short"]`,
			expectBinary: false,
			internalLen:  5,
		},
		{
			name:         "e tag with invalid hex not optimized",
			json:         `["e","zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"]`,
			expectBinary: false,
			internalLen:  64,
		},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			parsed := New()
			if _, err := parsed.Unmarshal([]byte(sc.json)); err != nil {
				t.Fatalf("Unmarshal failed: %v", err)
			}
			// The value field must exist before it can be inspected.
			if parsed.Len() < 2 {
				t.Fatal("Tag should have at least 2 fields")
			}
			stored := parsed.T[Value]
			if got := len(stored); got != sc.internalLen {
				t.Errorf("Expected internal value length %d, got %d", sc.internalLen, got)
			}
			// Encoding expectation
			binary := isBinaryEncoded(stored)
			switch {
			case sc.expectBinary:
				if !binary {
					t.Error("Expected binary encoding, but tag is not binary encoded")
				}
				// Verify null terminator
				if stored[HashLen] != 0 {
					t.Error("Binary encoded value should have null terminator at position 32")
				}
			case binary:
				t.Error("Did not expect binary encoding, but tag is binary encoded")
			}
			// Round trip back to JSON
			if out := string(parsed.Marshal(nil)); out != sc.json {
				t.Errorf("Marshaled JSON doesn't match original.\nExpected: %s\nGot: %s", sc.json, out)
			}
		})
	}
}
// TestValueHexMethod parses an e tag, confirms the value is held in binary
// form, and checks that ValueHex returns the hex text that was parsed.
func TestValueHexMethod(t *testing.T) {
	const expectedHex = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
	src := []byte(`["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`)
	parsed := New()
	if _, err := parsed.Unmarshal(src); err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	// Internal storage should be binary
	if !isBinaryEncoded(parsed.T[Value]) {
		t.Fatal("Expected binary encoding")
	}
	// ValueHex should return the original hex string
	if got := string(parsed.ValueHex()); got != expectedHex {
		t.Errorf("ValueHex returned wrong value.\nExpected: %s\nGot: %s", expectedHex, got)
	}
}
// TestValueBinaryMethod verifies ValueBinary() returns the raw hash bytes:
// HashLen bytes that equal both the decoded form of the hex in the JSON input
// and the leading bytes of the internal storage.
func TestValueBinaryMethod(t *testing.T) {
	json := `["p","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`
	tag := New()
	_, err := tag.Unmarshal([]byte(json))
	if err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	// ValueBinary should return 32 bytes
	binVal := tag.ValueBinary()
	if binVal == nil {
		t.Fatal("ValueBinary returned nil")
	}
	if len(binVal) != HashLen {
		t.Fatalf("Expected %d bytes, got %d", HashLen, len(binVal))
	}
	// The hex input is "0123456789abcdef" repeated four times, so the decoded
	// hash is this 8-byte pattern repeated until HashLen bytes are filled.
	// Comparing against it checks the content independently of the storage.
	pattern := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef}
	want := make([]byte, 0, HashLen)
	for len(want) < HashLen {
		want = append(want, pattern...)
	}
	if !utils.FastEqual(binVal, want) {
		t.Errorf("ValueBinary returned wrong bytes.\nExpected: %x\nGot: %x", want, binVal)
	}
	// It should match the first 32 bytes of the internal storage
	if !utils.FastEqual(binVal, tag.T[Value][:HashLen]) {
		t.Error("ValueBinary doesn't match internal storage")
	}
}
// TestEqualsMethod verifies Equals() for tags parsed from JSON, for a parsed
// tag compared against one built directly from hex bytes (in both directions
// and with uppercase hex), for mismatching and malformed hex, and for nil tags.
func TestEqualsMethod(t *testing.T) {
	const hexID = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
	const upperHexID = "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"
	const otherHexID = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210"
	const notHex = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"

	// Create tag from JSON (will be binary internally)
	tag1 := New()
	_, err := tag1.Unmarshal([]byte(`["e","` + hexID + `"]`))
	if err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	// Create identical tag
	tag2 := New()
	_, err = tag2.Unmarshal([]byte(`["e","` + hexID + `"]`))
	if err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	if !tag1.Equals(tag2) {
		t.Error("Identical tags should be equal")
	}
	// Create tag with different value
	tag3 := New()
	_, err = tag3.Unmarshal([]byte(`["e","` + otherHexID + `"]`))
	if err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	if tag1.Equals(tag3) {
		t.Error("Different tags should not be equal")
	}
	// Test with relay field
	tag4 := New()
	_, err = tag4.Unmarshal([]byte(`["e","` + hexID + `","wss://relay.example.com"]`))
	if err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	if tag1.Equals(tag4) {
		t.Error("Tags with different lengths should not be equal")
	}

	// Parsed tag versus tags built directly from hex bytes, checked in both
	// directions because Equals handles each side's encoding separately.
	sameHex := NewFromBytesSlice([]byte("e"), []byte(hexID))
	if !tag1.Equals(sameHex) || !sameHex.Equals(tag1) {
		t.Error("Parsed tag and hex-built tag with the same hash should be equal")
	}
	upperHex := NewFromBytesSlice([]byte("e"), []byte(upperHexID))
	if !tag1.Equals(upperHex) || !upperHex.Equals(tag1) {
		t.Error("Hex comparison should ignore letter case")
	}
	differentHex := NewFromBytesSlice([]byte("e"), []byte(otherHexID))
	if tag1.Equals(differentHex) || differentHex.Equals(tag1) {
		t.Error("Parsed tag and hex-built tag with different hashes should not be equal")
	}
	malformed := NewFromBytesSlice([]byte("e"), []byte(notHex))
	if tag1.Equals(malformed) || malformed.Equals(tag1) {
		t.Error("A value that is not valid hex should never equal a parsed hash")
	}
	differentKey := NewFromBytesSlice([]byte("p"), []byte(hexID))
	if tag1.Equals(differentKey) {
		t.Error("Tags with different keys should not be equal")
	}

	// Nil handling
	var nilTag *T
	if !nilTag.Equals(nil) {
		t.Error("Two nil tags should be equal")
	}
	if tag1.Equals(nil) || nilTag.Equals(tag1) {
		t.Error("A nil tag should not equal a non-nil tag")
	}
}
// TestBinaryEncodingSavesSpace verifies the optimization reduces memory usage
// by comparing the stored length of the same 64-character hex value under a
// "t" key and under an "e" key.
func TestBinaryEncodingSavesSpace(t *testing.T) {
	// Tag without optimization (non-hex or non e/p tag)
	tagNonOpt := New()
	if _, err := tagNonOpt.Unmarshal([]byte(`["t","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`)); err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	// Tag with optimization
	tagOpt := New()
	if _, err := tagOpt.Unmarshal([]byte(`["e","0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"]`)); err != nil {
		t.Fatalf("Unmarshal failed: %v", err)
	}
	// Guard the indexing below so a parsing regression fails the test
	// instead of panicking.
	if tagNonOpt.Len() < 2 || tagOpt.Len() < 2 {
		t.Fatal("Tag should have at least 2 fields")
	}
	nonOptSize := len(tagNonOpt.T[Value])
	optSize := len(tagOpt.T[Value])
	// Binary should be smaller (33 vs 64 bytes)
	if optSize >= nonOptSize {
		t.Errorf("Binary encoding should save space. Non-opt: %d bytes, Opt: %d bytes", nonOptSize, optSize)
	}
	expectedSavings := HexEncodedLen - BinaryEncodedLen // 64 - 33 = 31 bytes
	actualSavings := nonOptSize - optSize
	if actualSavings != expectedSavings {
		t.Errorf("Expected to save %d bytes, actually saved %d bytes", expectedSavings, actualSavings)
	}
	t.Logf("Space savings: %d bytes per e/p tag value (%.1f%% reduction)",
		actualSavings, float64(actualSavings)/float64(nonOptSize)*100)
}