fix(plugin_manager): atomic state+error write via set_state_with_error

The two-step set_state() / set_error_info() sequence left a window where
readers could observe ENABLED state without the accompanying error context.

Add threading.RLock to PluginStateManager and a new set_state_with_error()
method that holds the lock for both the state-transition write and the
_error_info write together. The method inlines the state-transition logic
rather than calling set_state() internally to intentionally skip the
"clear _error_info for non-ERROR states" side effect — the recoverable
error dict is exactly what we want stored.

Replace both paired set_state / set_error_info call sites in
run_scheduled_updates() with the single atomic method.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chuck
2026-04-29 09:31:04 -04:00
parent 37566d93ac
commit 09c7940986
2 changed files with 49 additions and 4 deletions

View File

@@ -747,8 +747,7 @@ class PluginManager:
} }
self.logger.warning("Plugin %s update() failed; will retry after interval", plugin_id) self.logger.warning("Plugin %s update() failed; will retry after interval", plugin_id)
self.plugin_last_update[plugin_id] = failure_time self.plugin_last_update[plugin_id] = failure_time
self.state_manager.set_state(plugin_id, PluginState.ENABLED) self.state_manager.set_state_with_error(plugin_id, PluginState.ENABLED, error_info, error=err)
self.state_manager.set_error_info(plugin_id, error_info)
if self.health_tracker: if self.health_tracker:
self.health_tracker.record_failure(plugin_id, err) self.health_tracker.record_failure(plugin_id, err)
except Exception as exc: # pylint: disable=broad-except except Exception as exc: # pylint: disable=broad-except
@@ -761,8 +760,7 @@ class PluginManager:
'recoverable': True, 'recoverable': True,
} }
self.plugin_last_update[plugin_id] = failure_time self.plugin_last_update[plugin_id] = failure_time
self.state_manager.set_state(plugin_id, PluginState.ENABLED) self.state_manager.set_state_with_error(plugin_id, PluginState.ENABLED, error_info, error=exc)
self.state_manager.set_error_info(plugin_id, error_info)
if self.health_tracker: if self.health_tracker:
self.health_tracker.record_failure(plugin_id, exc) self.health_tracker.record_failure(plugin_id, exc)

View File

@@ -5,6 +5,7 @@ Manages plugin state machine (loaded → enabled → running → error)
with state transitions and queries. with state transitions and queries.
""" """
import threading
from enum import Enum from enum import Enum
from typing import Optional, Dict, Any from typing import Optional, Dict, Any
from datetime import datetime from datetime import datetime
@@ -34,6 +35,7 @@ class PluginStateManager:
logger: Optional logger instance logger: Optional logger instance
""" """
self.logger = logger or get_logger(__name__) self.logger = logger or get_logger(__name__)
self._lock = threading.RLock()
self._states: Dict[str, PluginState] = {} self._states: Dict[str, PluginState] = {}
self._state_history: Dict[str, list] = {} self._state_history: Dict[str, list] = {}
self._error_info: Dict[str, Dict[str, Any]] = {} self._error_info: Dict[str, Dict[str, Any]] = {}
@@ -149,6 +151,51 @@ class PluginStateManager:
""" """
self._error_info[plugin_id] = error_info self._error_info[plugin_id] = error_info
def set_state_with_error(
self,
plugin_id: str,
state: PluginState,
error_info: Dict[str, Any],
error: Optional[Exception] = None,
) -> None:
"""Set plugin state and persist error context atomically.
Unlike calling set_state() then set_error_info() separately, this
method holds ``_lock`` for both writes so no reader can observe the
new state without the accompanying error context.
Intentionally does not clear ``_error_info`` the way set_state() does
for non-ERROR transitions — this is the recoverable-failure path where
the error dict is the entire point.
Args:
plugin_id: Plugin identifier
state: New state
error_info: Structured error dict to persist alongside the state
error: Optional exception recorded in the transition history
"""
with self._lock:
old_state = self._states.get(plugin_id, PluginState.UNLOADED)
self._states[plugin_id] = state
if plugin_id not in self._state_history:
self._state_history[plugin_id] = []
self._state_history[plugin_id].append({
'timestamp': datetime.now(),
'from': old_state.value,
'to': state.value,
'error': str(error) if error else None,
})
self._error_info[plugin_id] = error_info
self.logger.debug(
"Plugin %s state transition: %s%s (recoverable error stored)",
plugin_id,
old_state.value,
state.value,
)
def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]: def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]:
""" """
Get error information for a plugin. Get error information for a plugin.