Implement comprehensive WebSocket subscription stability fixes
Some checks failed
Go / build (push) Has been cancelled
Go / release (push) Has been cancelled

- Resolved critical issues causing subscriptions to drop after 30-60 seconds due to unconsumed receiver channels.
- Introduced per-subscription consumer goroutines to ensure continuous event delivery and prevent channel overflow.
- Enhanced REQ parsing to handle both wrapped and unwrapped filter arrays, eliminating EOF errors.
- Updated publisher logic to correctly send events to receiver channels, ensuring proper event delivery to subscribers.
- Added extensive documentation and testing tools to verify subscription stability and performance.
- Bumped version to v0.26.2 to reflect these significant improvements.
This commit is contained in:
2025-11-06 18:21:00 +00:00
parent d604341a27
commit 581e0ec588
23 changed files with 3054 additions and 81 deletions

View File

@@ -0,0 +1,268 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"time"
"github.com/gorilla/websocket"
)
var (
relayURL = flag.String("url", "ws://localhost:3334", "Relay WebSocket URL")
duration = flag.Int("duration", 120, "Test duration in seconds")
)
func main() {
flag.Parse()
log.SetFlags(log.Ltime)
fmt.Println("===================================")
fmt.Println("Simple Subscription Stability Test")
fmt.Println("===================================")
fmt.Printf("Relay: %s\n", *relayURL)
fmt.Printf("Duration: %d seconds\n", *duration)
fmt.Println()
fmt.Println("This test verifies that subscriptions remain")
fmt.Println("active without dropping over the test period.")
fmt.Println()
// Connect to relay
log.Printf("Connecting to %s...", *relayURL)
conn, _, err := websocket.DefaultDialer.Dial(*relayURL, nil)
if err != nil {
log.Fatalf("Failed to connect: %v", err)
}
defer conn.Close()
log.Printf("✓ Connected")
// Context for the test
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*duration+10)*time.Second)
defer cancel()
// Handle interrupts
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
go func() {
<-sigChan
log.Println("\nInterrupted, shutting down...")
cancel()
}()
// Subscribe
subID := fmt.Sprintf("stability-test-%d", time.Now().Unix())
reqMsg := []interface{}{"REQ", subID, map[string]interface{}{"kinds": []int{1}}}
reqMsgBytes, _ := json.Marshal(reqMsg)
log.Printf("Sending subscription: %s", subID)
if err := conn.WriteMessage(websocket.TextMessage, reqMsgBytes); err != nil {
log.Fatalf("Failed to send REQ: %v", err)
}
// Track connection health
lastMessageTime := time.Now()
gotEOSE := false
messageCount := 0
pingCount := 0
// Read goroutine
readDone := make(chan struct{})
go func() {
defer close(readDone)
for {
select {
case <-ctx.Done():
return
default:
}
conn.SetReadDeadline(time.Now().Add(10 * time.Second))
msgType, msg, err := conn.ReadMessage()
if err != nil {
if ctx.Err() != nil {
return
}
if netErr, ok := err.(interface{ Timeout() bool }); ok && netErr.Timeout() {
continue
}
log.Printf("Read error: %v", err)
return
}
lastMessageTime = time.Now()
messageCount++
// Handle PING
if msgType == websocket.PingMessage {
pingCount++
log.Printf("Received PING #%d, sending PONG", pingCount)
conn.WriteMessage(websocket.PongMessage, nil)
continue
}
// Parse message
var envelope []json.RawMessage
if err := json.Unmarshal(msg, &envelope); err != nil {
continue
}
if len(envelope) < 2 {
continue
}
var msgTypeStr string
json.Unmarshal(envelope[0], &msgTypeStr)
switch msgTypeStr {
case "EOSE":
var recvSubID string
json.Unmarshal(envelope[1], &recvSubID)
if recvSubID == subID && !gotEOSE {
gotEOSE = true
log.Printf("✓ Received EOSE - subscription is active")
}
case "EVENT":
var recvSubID string
json.Unmarshal(envelope[1], &recvSubID)
if recvSubID == subID {
log.Printf("Received EVENT (subscription still active)")
}
case "CLOSED":
var recvSubID string
json.Unmarshal(envelope[1], &recvSubID)
if recvSubID == subID {
log.Printf("⚠ Subscription CLOSED by relay!")
cancel()
return
}
case "NOTICE":
var notice string
json.Unmarshal(envelope[1], &notice)
log.Printf("NOTICE: %s", notice)
}
}
}()
// Wait for EOSE
log.Println("Waiting for EOSE...")
for !gotEOSE && ctx.Err() == nil {
time.Sleep(100 * time.Millisecond)
}
if !gotEOSE {
log.Fatal("Did not receive EOSE")
}
// Monitor loop
startTime := time.Now()
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
log.Println()
log.Printf("Subscription is active. Monitoring for %d seconds...", *duration)
log.Println("(Subscription should stay active even without events)")
log.Println()
for {
select {
case <-ctx.Done():
goto done
case <-ticker.C:
elapsed := time.Since(startTime)
timeSinceMessage := time.Since(lastMessageTime)
log.Printf("[%3.0fs/%ds] Messages: %d | Last message: %.0fs ago | Status: %s",
elapsed.Seconds(),
*duration,
messageCount,
timeSinceMessage.Seconds(),
getStatus(timeSinceMessage),
)
// Check if we've reached duration
if elapsed >= time.Duration(*duration)*time.Second {
goto done
}
}
}
done:
cancel()
// Wait for reader
select {
case <-readDone:
case <-time.After(2 * time.Second):
}
// Send CLOSE
closeMsg := []interface{}{"CLOSE", subID}
closeMsgBytes, _ := json.Marshal(closeMsg)
conn.WriteMessage(websocket.TextMessage, closeMsgBytes)
// Results
elapsed := time.Since(startTime)
timeSinceMessage := time.Since(lastMessageTime)
fmt.Println()
fmt.Println("===================================")
fmt.Println("Test Results")
fmt.Println("===================================")
fmt.Printf("Duration: %.1f seconds\n", elapsed.Seconds())
fmt.Printf("Total messages: %d\n", messageCount)
fmt.Printf("Last message: %.0f seconds ago\n", timeSinceMessage.Seconds())
fmt.Println()
// Determine success
if timeSinceMessage < 15*time.Second {
// Recent message - subscription is alive
fmt.Println("✓ TEST PASSED")
fmt.Println("Subscription remained active throughout test period.")
fmt.Println("Recent messages indicate healthy connection.")
} else if timeSinceMessage < 30*time.Second {
// Somewhat recent - probably OK
fmt.Println("✓ TEST LIKELY PASSED")
fmt.Println("Subscription appears active (message received recently).")
fmt.Println("Some delay is normal if relay is idle.")
} else if messageCount > 0 {
// Got EOSE but nothing since
fmt.Println("⚠ INCONCLUSIVE")
fmt.Println("Subscription was established but no activity since.")
fmt.Println("This is expected if relay has no events and doesn't send pings.")
fmt.Println("To properly test, publish events during the test period.")
} else {
// No messages at all
fmt.Println("✗ TEST FAILED")
fmt.Println("No messages received - subscription may have failed.")
}
fmt.Println()
fmt.Println("Note: This test verifies the subscription stays registered.")
fmt.Println("For full testing, publish events while this runs and verify")
fmt.Println("they are received throughout the entire test duration.")
}
func getStatus(timeSince time.Duration) string {
seconds := timeSince.Seconds()
switch {
case seconds < 10:
return "ACTIVE (recent message)"
case seconds < 30:
return "IDLE (normal)"
case seconds < 60:
return "QUIET (possibly normal)"
default:
return "STALE (may have dropped)"
}
}

View File

@@ -0,0 +1,347 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"os"
"os/signal"
"sync/atomic"
"syscall"
"time"
"github.com/gorilla/websocket"
)
var (
relayURL = flag.String("url", "ws://localhost:3334", "Relay WebSocket URL")
duration = flag.Int("duration", 60, "Test duration in seconds")
eventKind = flag.Int("kind", 1, "Event kind to subscribe to")
verbose = flag.Bool("v", false, "Verbose output")
subID = flag.String("sub", "", "Subscription ID (default: auto-generated)")
)
type NostrEvent struct {
ID string `json:"id"`
PubKey string `json:"pubkey"`
CreatedAt int64 `json:"created_at"`
Kind int `json:"kind"`
Tags [][]string `json:"tags"`
Content string `json:"content"`
Sig string `json:"sig"`
}
func main() {
flag.Parse()
log.SetFlags(log.Ltime | log.Lmicroseconds)
// Generate subscription ID if not provided
subscriptionID := *subID
if subscriptionID == "" {
subscriptionID = fmt.Sprintf("test-%d", time.Now().Unix())
}
log.Printf("Starting subscription stability test")
log.Printf("Relay: %s", *relayURL)
log.Printf("Duration: %d seconds", *duration)
log.Printf("Event kind: %d", *eventKind)
log.Printf("Subscription ID: %s", subscriptionID)
log.Println()
// Connect to relay
log.Printf("Connecting to %s...", *relayURL)
conn, _, err := websocket.DefaultDialer.Dial(*relayURL, nil)
if err != nil {
log.Fatalf("Failed to connect: %v", err)
}
defer conn.Close()
log.Printf("✓ Connected")
log.Println()
// Context for the test
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*duration+10)*time.Second)
defer cancel()
// Handle interrupts
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
go func() {
<-sigChan
log.Println("\nInterrupted, shutting down...")
cancel()
}()
// Counters
var receivedCount atomic.Int64
var lastEventTime atomic.Int64
lastEventTime.Store(time.Now().Unix())
// Subscribe
reqMsg := map[string]interface{}{
"kinds": []int{*eventKind},
}
reqMsgBytes, _ := json.Marshal(reqMsg)
subscribeMsg := []interface{}{"REQ", subscriptionID, json.RawMessage(reqMsgBytes)}
subscribeMsgBytes, _ := json.Marshal(subscribeMsg)
log.Printf("Sending REQ: %s", string(subscribeMsgBytes))
if err := conn.WriteMessage(websocket.TextMessage, subscribeMsgBytes); err != nil {
log.Fatalf("Failed to send REQ: %v", err)
}
// Read messages
gotEOSE := false
readDone := make(chan struct{})
consecutiveTimeouts := 0
maxConsecutiveTimeouts := 20 // Exit if we get too many consecutive timeouts
go func() {
defer close(readDone)
for {
select {
case <-ctx.Done():
return
default:
}
conn.SetReadDeadline(time.Now().Add(5 * time.Second))
_, msg, err := conn.ReadMessage()
if err != nil {
// Check for normal close
if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) {
log.Println("Connection closed normally")
return
}
// Check if context was cancelled
if ctx.Err() != nil {
return
}
// Check for timeout errors (these are expected during idle periods)
if netErr, ok := err.(interface{ Timeout() bool }); ok && netErr.Timeout() {
consecutiveTimeouts++
if consecutiveTimeouts >= maxConsecutiveTimeouts {
log.Printf("Too many consecutive read timeouts (%d), connection may be dead", consecutiveTimeouts)
return
}
// Only log every 5th timeout to avoid spam
if *verbose && consecutiveTimeouts%5 == 0 {
log.Printf("Read timeout (idle period, %d consecutive)", consecutiveTimeouts)
}
continue
}
// For any other error, log and exit
log.Printf("Read error: %v", err)
return
}
// Reset timeout counter on successful read
consecutiveTimeouts = 0
// Parse message
var envelope []json.RawMessage
if err := json.Unmarshal(msg, &envelope); err != nil {
if *verbose {
log.Printf("Failed to parse message: %v", err)
}
continue
}
if len(envelope) < 2 {
continue
}
var msgType string
json.Unmarshal(envelope[0], &msgType)
// Check message type
switch msgType {
case "EOSE":
var recvSubID string
json.Unmarshal(envelope[1], &recvSubID)
if recvSubID == subscriptionID {
if !gotEOSE {
gotEOSE = true
log.Printf("✓ Received EOSE - subscription is active")
log.Println()
log.Println("Waiting for real-time events...")
log.Println()
}
}
case "EVENT":
var recvSubID string
json.Unmarshal(envelope[1], &recvSubID)
if recvSubID == subscriptionID {
var event NostrEvent
if err := json.Unmarshal(envelope[2], &event); err == nil {
count := receivedCount.Add(1)
lastEventTime.Store(time.Now().Unix())
eventIDShort := event.ID
if len(eventIDShort) > 8 {
eventIDShort = eventIDShort[:8]
}
log.Printf("[EVENT #%d] id=%s kind=%d created=%d",
count, eventIDShort, event.Kind, event.CreatedAt)
if *verbose {
log.Printf(" content: %s", event.Content)
}
}
}
case "NOTICE":
var notice string
json.Unmarshal(envelope[1], &notice)
log.Printf("[NOTICE] %s", notice)
case "CLOSED":
var recvSubID, reason string
json.Unmarshal(envelope[1], &recvSubID)
if len(envelope) > 2 {
json.Unmarshal(envelope[2], &reason)
}
if recvSubID == subscriptionID {
log.Printf("⚠ Subscription CLOSED by relay: %s", reason)
cancel()
return
}
case "OK":
// Ignore OK messages for this test
default:
if *verbose {
log.Printf("Unknown message type: %s", msgType)
}
}
}
}()
// Wait for EOSE with timeout
eoseTimeout := time.After(10 * time.Second)
for !gotEOSE {
select {
case <-eoseTimeout:
log.Printf("⚠ Warning: No EOSE received within 10 seconds")
gotEOSE = true // Continue anyway
case <-ctx.Done():
log.Println("Test cancelled before EOSE")
return
case <-time.After(100 * time.Millisecond):
// Keep waiting
}
}
// Monitor for subscription drops
startTime := time.Now()
endTime := startTime.Add(time.Duration(*duration) * time.Second)
// Start monitoring goroutine
go func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
elapsed := time.Since(startTime).Seconds()
lastEvent := lastEventTime.Load()
timeSinceLastEvent := time.Now().Unix() - lastEvent
log.Printf("[STATUS] Elapsed: %.0fs/%ds | Events: %d | Last event: %ds ago",
elapsed, *duration, receivedCount.Load(), timeSinceLastEvent)
// Warn if no events for a while (but only if we've seen events before)
if receivedCount.Load() > 0 && timeSinceLastEvent > 30 {
log.Printf("⚠ Warning: No events received for %ds - subscription may have dropped", timeSinceLastEvent)
}
}
}
}()
// Wait for test duration
log.Printf("Test running for %d seconds...", *duration)
log.Println("(You can publish events to the relay in another terminal)")
log.Println()
select {
case <-ctx.Done():
// Test completed or interrupted
case <-time.After(time.Until(endTime)):
// Duration elapsed
}
// Wait a bit for final events
time.Sleep(2 * time.Second)
cancel()
// Wait for reader to finish
select {
case <-readDone:
case <-time.After(5 * time.Second):
log.Println("Reader goroutine didn't exit cleanly")
}
// Send CLOSE
closeMsg := []interface{}{"CLOSE", subscriptionID}
closeMsgBytes, _ := json.Marshal(closeMsg)
conn.WriteMessage(websocket.TextMessage, closeMsgBytes)
// Print results
log.Println()
log.Println("===================================")
log.Println("Test Results")
log.Println("===================================")
log.Printf("Duration: %.1f seconds", time.Since(startTime).Seconds())
log.Printf("Events received: %d", receivedCount.Load())
log.Printf("Subscription ID: %s", subscriptionID)
lastEvent := lastEventTime.Load()
if lastEvent > startTime.Unix() {
log.Printf("Last event: %ds ago", time.Now().Unix()-lastEvent)
}
log.Println()
// Determine pass/fail
received := receivedCount.Load()
testDuration := time.Since(startTime).Seconds()
if received == 0 {
log.Println("⚠ No events received during test")
log.Println("This is expected if no events were published")
log.Println("To test properly, publish events while this is running:")
log.Println()
log.Println(" # In another terminal:")
log.Printf(" ./orly # Make sure relay is running\n")
log.Println()
log.Println(" # Then publish test events (implementation-specific)")
} else {
eventsPerSecond := float64(received) / testDuration
log.Printf("Rate: %.2f events/second", eventsPerSecond)
lastEvent := lastEventTime.Load()
timeSinceLastEvent := time.Now().Unix() - lastEvent
if timeSinceLastEvent < 10 {
log.Println()
log.Println("✓ TEST PASSED - Subscription remained stable")
log.Println("Events were received recently, indicating subscription is still active")
} else {
log.Println()
log.Printf("⚠ Potential issue - Last event was %ds ago", timeSinceLastEvent)
log.Println("Subscription may have dropped if events were still being published")
}
}
}