fix(plugin_manager): atomic state+error write via set_state_with_error

The two-step set_state() / set_error_info() sequence left a window where readers could observe ENABLED state without the accompanying error context. Add threading.RLock to PluginStateManager and a new set_state_with_error() method that holds the lock for both the state-transition write and the _error_info write together. The method inlines the state-transition logic rather than calling set_state() internally to intentionally skip the "clear _error_info for non-ERROR states" side effect — the recoverable error dict is exactly what we want stored. Replace both paired set_state / set_error_info call sites in run_scheduled_updates() with the single atomic method. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 12:33:01 +00:00 · 2026-04-29 09:31:04 -04:00
parent 37566d93ac
commit 09c7940986
2 changed files with 49 additions and 4 deletions
--- a/src/plugin_system/plugin_manager.py
+++ b/src/plugin_system/plugin_manager.py
@@ -747,8 +747,7 @@ class PluginManager:
                        }
                        self.logger.warning("Plugin %s update() failed; will retry after interval", plugin_id)
                        self.plugin_last_update[plugin_id] = failure_time
-                        self.state_manager.set_state(plugin_id, PluginState.ENABLED)
-                        self.state_manager.set_error_info(plugin_id, error_info)
+                        self.state_manager.set_state_with_error(plugin_id, PluginState.ENABLED, error_info, error=err)
                        if self.health_tracker:
                            self.health_tracker.record_failure(plugin_id, err)
                except Exception as exc:  # pylint: disable=broad-except
@@ -761,8 +760,7 @@ class PluginManager:
                        'recoverable': True,
                    }
                    self.plugin_last_update[plugin_id] = failure_time
-                    self.state_manager.set_state(plugin_id, PluginState.ENABLED)
-                    self.state_manager.set_error_info(plugin_id, error_info)
+                    self.state_manager.set_state_with_error(plugin_id, PluginState.ENABLED, error_info, error=exc)
                    if self.health_tracker:
                        self.health_tracker.record_failure(plugin_id, exc)

--- a/src/plugin_system/plugin_state.py
+++ b/src/plugin_system/plugin_state.py
@@ -5,6 +5,7 @@ Manages plugin state machine (loaded → enabled → running → error)
 with state transitions and queries.
 """

+import threading
 from enum import Enum
 from typing import Optional, Dict, Any
 from datetime import datetime
@@ -34,6 +35,7 @@ class PluginStateManager:
            logger: Optional logger instance
        """
        self.logger = logger or get_logger(__name__)
+        self._lock = threading.RLock()
        self._states: Dict[str, PluginState] = {}
        self._state_history: Dict[str, list] = {}
        self._error_info: Dict[str, Dict[str, Any]] = {}
@@ -149,6 +151,51 @@ class PluginStateManager:
        """
        self._error_info[plugin_id] = error_info

+    def set_state_with_error(
+        self,
+        plugin_id: str,
+        state: PluginState,
+        error_info: Dict[str, Any],
+        error: Optional[Exception] = None,
+    ) -> None:
+        """Set plugin state and persist error context atomically.
+
+        Unlike calling set_state() then set_error_info() separately, this
+        method holds ``_lock`` for both writes so no reader can observe the
+        new state without the accompanying error context.
+
+        Intentionally does not clear ``_error_info`` the way set_state() does
+        for non-ERROR transitions — this is the recoverable-failure path where
+        the error dict is the entire point.
+
+        Args:
+            plugin_id: Plugin identifier
+            state: New state
+            error_info: Structured error dict to persist alongside the state
+            error: Optional exception recorded in the transition history
+        """
+        with self._lock:
+            old_state = self._states.get(plugin_id, PluginState.UNLOADED)
+            self._states[plugin_id] = state
+
+            if plugin_id not in self._state_history:
+                self._state_history[plugin_id] = []
+            self._state_history[plugin_id].append({
+                'timestamp': datetime.now(),
+                'from': old_state.value,
+                'to': state.value,
+                'error': str(error) if error else None,
+            })
+
+            self._error_info[plugin_id] = error_info
+
+            self.logger.debug(
+                "Plugin %s state transition: %s → %s (recoverable error stored)",
+                plugin_id,
+                old_state.value,
+                state.value,
+            )
+
    def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]:
        """
        Get error information for a plugin.