From b70f03bce07530e8279ff0e9e18af4dbc29de9c0 Mon Sep 17 00:00:00 2001
From: mleku <me@mleku.dev>
Date: Thu, 30 Oct 2025 18:22:56 +0000
Subject: [PATCH] Refactor policy script handling and improve fallback logic

- Renamed test functions for clarity, changing "NotRunning" to "Disabled" to better reflect the policy state.
- Updated policy checks to ensure that if the policy is disabled, it falls back to the default policy immediately.
- Enhanced error handling in the policy manager to ensure proper startup and running state management.
- Introduced a new method to ensure the policy is running, with timeout handling for startup completion.
- Bumped version to v0.20.3 to reflect these changes.
---
 pkg/policy/policy.go      | 144 ++++++++++++++++++++++++++++++++++++--
 pkg/policy/policy_test.go |  29 ++++----
 pkg/version/version       |   2 +-
 3 files changed, 154 insertions(+), 21 deletions(-)

diff --git a/pkg/policy/policy.go b/pkg/policy/policy.go
index e6e081f..9ed227f 100644
--- a/pkg/policy/policy.go
+++ b/pkg/policy/policy.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"sync"
 	"time"
 
@@ -131,11 +132,13 @@ type PolicyManager struct {
 	currentCancel context.CancelFunc
 	mutex         sync.RWMutex
 	isRunning     bool
+	isStarting    bool
 	enabled       bool
 	stdin         io.WriteCloser
 	stdout        io.ReadCloser
 	stderr        io.ReadCloser
 	responseChan  chan PolicyResponse
+	startupChan   chan error
 }
 
 // P represents a complete policy configuration for a Nostr relay.
@@ -203,6 +206,7 @@ func NewWithManager(ctx context.Context, appName string, enabled bool) *P {
 		scriptPath:   scriptPath,
 		enabled:      enabled,
 		responseChan: make(chan PolicyResponse, 100), // Buffered channel for responses
+		startupChan:  make(chan error, 1),            // Channel for startup completion
 	}
 
 	// Load policy configuration from JSON file
@@ -279,8 +283,19 @@ func (p *P) CheckPolicy(access string, ev *event.E, loggedInPubkey []byte, ipAdd
 	}
 
 	// Check if script is present and enabled
-	if rule.Script != "" && p.Manager != nil && p.Manager.IsEnabled() {
-		return p.checkScriptPolicy(access, ev, rule.Script, loggedInPubkey, ipAddress)
+	if rule.Script != "" && p.Manager != nil {
+		if p.Manager.IsEnabled() {
+			return p.checkScriptPolicy(access, ev, rule.Script, loggedInPubkey, ipAddress)
+		}
+		// Script is configured but policy is disabled - use default policy if rule has no other restrictions
+		hasOtherRestrictions := len(rule.WriteAllow) > 0 || len(rule.WriteDeny) > 0 || len(rule.ReadAllow) > 0 || len(rule.ReadDeny) > 0 ||
+			rule.SizeLimit != nil || rule.ContentLimit != nil || len(rule.MustHaveTags) > 0 ||
+			rule.MaxExpiry != nil || rule.Privileged || rule.RateLimit != nil ||
+			rule.MaxAgeOfEvent != nil || rule.MaxAgeEventInFuture != nil
+		if !hasOtherRestrictions {
+			// No other restrictions, use default policy
+			return p.getDefaultPolicyAction(), nil
+		}
 	}
 
 	// Apply rule-based filtering
@@ -452,12 +467,41 @@ func (p *P) checkRulePolicy(access string, ev *event.E, rule Rule, loggedInPubke
 
 // checkScriptPolicy runs the policy script to determine if event should be allowed
 func (p *P) checkScriptPolicy(access string, ev *event.E, scriptPath string, loggedInPubkey []byte, ipAddress string) (allowed bool, err error) {
-	if p.Manager == nil || !p.Manager.IsRunning() {
-		// If script is not running, fall back to default policy
-		log.W.F("policy rule for kind %d is inactive (script not running), falling back to default policy (%s)", ev.Kind, p.DefaultPolicy)
+	if p.Manager == nil {
+		return false, fmt.Errorf("policy manager is not initialized")
+	}
+
+	// If policy is disabled, fall back to default policy immediately
+	if !p.Manager.IsEnabled() {
+		log.W.F("policy rule for kind %d is inactive (policy disabled), falling back to default policy (%s)", ev.Kind, p.DefaultPolicy)
 		return p.getDefaultPolicyAction(), nil
 	}
 
+	// Policy is enabled, check if it's running
+	if !p.Manager.IsRunning() {
+		// Check if script file exists
+		if _, err := os.Stat(p.Manager.GetScriptPath()); os.IsNotExist(err) {
+			// Script doesn't exist, this is a fatal error
+			buf := make([]byte, 1024*1024)
+			n := runtime.Stack(buf, true)
+			log.E.F("policy script does not exist at %s", p.Manager.GetScriptPath())
+			fmt.Fprintf(os.Stderr, "FATAL: Policy script required but not found at %s\n", p.Manager.GetScriptPath())
+			fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", buf[:n])
+			os.Exit(1)
+		}
+
+		// Try to start the policy and wait for it
+		if err := p.Manager.ensureRunning(); err != nil {
+			// Startup failed, this is a fatal error
+			buf := make([]byte, 1024*1024)
+			n := runtime.Stack(buf, true)
+			log.E.F("failed to start policy script: %v", err)
+			fmt.Fprintf(os.Stderr, "FATAL: Failed to start policy script: %v\n", err)
+			fmt.Fprintf(os.Stderr, "Stack trace:\n%s\n", buf[:n])
+			os.Exit(1)
+		}
+	}
+
 	// Create policy event with additional context
 	policyEvent := &PolicyEvent{
 		E:              ev,
@@ -535,6 +579,91 @@ func (pm *PolicyManager) startPolicyIfExists() {
 	}
 }
 
+// ensureRunning ensures the policy is running, starting it if necessary.
+// It waits for startup to complete with a timeout and returns an error if startup fails.
+func (pm *PolicyManager) ensureRunning() error {
+	pm.mutex.Lock()
+	// Check if already running
+	if pm.isRunning {
+		pm.mutex.Unlock()
+		return nil
+	}
+
+	// Check if already starting
+	if pm.isStarting {
+		pm.mutex.Unlock()
+		// Wait for startup to complete
+		select {
+		case err := <-pm.startupChan:
+			if err != nil {
+				return fmt.Errorf("policy startup failed: %v", err)
+			}
+			// Double-check it's actually running after receiving signal
+			pm.mutex.RLock()
+			running := pm.isRunning
+			pm.mutex.RUnlock()
+			if !running {
+				return fmt.Errorf("policy startup completed but process is not running")
+			}
+			return nil
+		case <-time.After(10 * time.Second):
+			return fmt.Errorf("policy startup timeout")
+		case <-pm.ctx.Done():
+			return fmt.Errorf("policy context cancelled")
+		}
+	}
+
+	// Mark as starting
+	pm.isStarting = true
+	pm.mutex.Unlock()
+
+	// Start the policy in a goroutine
+	go func() {
+		err := pm.StartPolicy()
+		pm.mutex.Lock()
+		pm.isStarting = false
+		pm.mutex.Unlock()
+		// Signal startup completion (non-blocking)
+		// Drain any stale value first, then send
+		select {
+		case <-pm.startupChan:
+		default:
+		}
+		select {
+		case pm.startupChan <- err:
+		default:
+			// Channel should be empty now, but if it's full, try again
+			pm.startupChan <- err
+		}
+	}()
+
+	// Wait for startup to complete
+	select {
+	case err := <-pm.startupChan:
+		if err != nil {
+			return fmt.Errorf("policy startup failed: %v", err)
+		}
+		// Double-check it's actually running after receiving signal
+		pm.mutex.RLock()
+		running := pm.isRunning
+		pm.mutex.RUnlock()
+		if !running {
+			return fmt.Errorf("policy startup completed but process is not running")
+		}
+		return nil
+	case <-time.After(10 * time.Second):
+		pm.mutex.Lock()
+		pm.isStarting = false
+		pm.mutex.Unlock()
+		return fmt.Errorf("policy startup timeout")
+	case <-pm.ctx.Done():
+		pm.mutex.Lock()
+		pm.isStarting = false
+		pm.mutex.Unlock()
+		return fmt.Errorf("policy context cancelled")
+	}
+}
+
 // StartPolicy starts the policy script process.
 // Returns an error if the script doesn't exist, can't be executed, or is already running.
 func (pm *PolicyManager) StartPolicy() error {
@@ -800,6 +929,11 @@ func (pm *PolicyManager) IsRunning() bool {
 	return pm.isRunning
 }
 
+// GetScriptPath returns the path to the policy script.
+func (pm *PolicyManager) GetScriptPath() string {
+	return pm.scriptPath
+}
+
 // Shutdown gracefully shuts down the policy manager.
 // It cancels the context and stops any running policy script.
 func (pm *PolicyManager) Shutdown() {
diff --git a/pkg/policy/policy_test.go b/pkg/policy/policy_test.go
index b13febb..c56d874 100644
--- a/pkg/policy/policy_test.go
+++ b/pkg/policy/policy_test.go
@@ -1136,11 +1136,11 @@ func TestMaxAgeChecks(t *testing.T) {
 	}
 }
 
-func TestScriptPolicyNotRunningFallsBackToDefault(t *testing.T) {
+func TestScriptPolicyDisabledFallsBackToDefault(t *testing.T) {
 	// Generate real keypair for testing
 	eventSigner, eventPubkey := generateTestKeypair(t)
 
-	// Create a policy with a script rule but no running manager, default policy is "allow"
+	// Create a policy with a script rule but policy is disabled, default policy is "allow"
 	policy := &P{
 		DefaultPolicy: "allow",
 		Rules: map[int]Rule{
@@ -1150,21 +1150,21 @@ func TestScriptPolicyNotRunningFallsBackToDefault(t *testing.T) {
 			},
 		},
 		Manager: &PolicyManager{
-			enabled:   true,
-			isRunning: false, // Script is not running
+			enabled:   false, // Policy is disabled
+			isRunning: false,
 		},
 	}
 
 	// Create real test event with proper signing
 	testEvent := createTestEvent(t, eventSigner, "test content", 1)
 
-	// Should allow the event when script is configured but not running (falls back to default "allow")
+	// Should allow the event when policy is disabled (falls back to default "allow")
 	allowed, err := policy.CheckPolicy("write", testEvent, eventPubkey, "127.0.0.1")
 	if err != nil {
 		t.Errorf("Unexpected error: %v", err)
 	}
 	if !allowed {
-		t.Error("Expected event to be allowed when script is not running (should fall back to default policy 'allow')")
+		t.Error("Expected event to be allowed when policy is disabled (should fall back to default policy 'allow')")
 	}
 
 	// Test with default policy "deny"
@@ -1174,7 +1174,7 @@ func TestScriptPolicyNotRunningFallsBackToDefault(t *testing.T) {
 		t.Errorf("Unexpected error: %v", err2)
 	}
 	if allowed2 {
-		t.Error("Expected event to be denied when script is not running and default policy is 'deny'")
+		t.Error("Expected event to be denied when policy is disabled and default policy is 'deny'")
 	}
 }
 
@@ -1340,12 +1340,11 @@ func TestNewPolicyWithDefaultPolicyJSON(t *testing.T) {
 	}
 }
 
-func TestScriptProcessingFailureFallsBackToDefault(t *testing.T) {
+func TestScriptProcessingDisabledFallsBackToDefault(t *testing.T) {
 	// Generate real keypair for testing
 	eventSigner, eventPubkey := generateTestKeypair(t)
 
-	// Test that script processing failures fall back to default policy
-	// We'll test this by using a manager that's not running (simulating failure)
+	// Test that when policy is disabled, it falls back to default policy
 	policy := &P{
 		DefaultPolicy: "allow",
 		Rules: map[int]Rule{
@@ -1355,21 +1354,21 @@ func TestScriptProcessingFailureFallsBackToDefault(t *testing.T) {
 			},
 		},
 		Manager: &PolicyManager{
-			enabled:   true,
-			isRunning: false, // Script is not running (simulating failure)
+			enabled:   false, // Policy is disabled
+			isRunning: false,
 		},
 	}
 
 	// Create real test event with proper signing
 	testEvent := createTestEvent(t, eventSigner, "test content", 1)
 
-	// Should allow the event when script is not running (falls back to default "allow")
+	// Should allow the event when policy is disabled (falls back to default "allow")
 	allowed, err := policy.checkScriptPolicy("write", testEvent, "policy.sh", eventPubkey, "127.0.0.1")
 	if err != nil {
 		t.Errorf("Unexpected error: %v", err)
 	}
 	if !allowed {
-		t.Error("Expected event to be allowed when script is not running (should fall back to default policy 'allow')")
+		t.Error("Expected event to be allowed when policy is disabled (should fall back to default policy 'allow')")
 	}
 
 	// Test with default policy "deny"
@@ -1379,7 +1378,7 @@ func TestScriptProcessingFailureFallsBackToDefault(t *testing.T) {
 		t.Errorf("Unexpected error: %v", err2)
 	}
 	if allowed2 {
-		t.Error("Expected event to be denied when script is not running and default policy is 'deny'")
+		t.Error("Expected event to be denied when policy is disabled and default policy is 'deny'")
 	}
 }
 
diff --git a/pkg/version/version b/pkg/version/version
index 014ec61..4646fd9 100644
--- a/pkg/version/version
+++ b/pkg/version/version
@@ -1 +1 @@
-v0.20.2
\ No newline at end of file
+v0.20.3
\ No newline at end of file