From 65e3e8319bdeb8514d90785521598749134bce6a Mon Sep 17 00:00:00 2001 From: Chuck <33324927+ChuckBuilds@users.noreply.github.com> Date: Wed, 29 Apr 2026 15:22:12 -0400 Subject: [PATCH] fix(plugin_manager): prevent permanent ERROR state after update timeout (#316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(plugin_manager): prevent permanent ERROR state after update timeout When execute_update() fails (timeout or unhandled exception), the plugin state was set to ERROR with no recovery path. can_execute() returns False for ERROR state, so the plugin's update() was never called again, leaving it showing stale data indefinitely. Instead, update plugin_last_update so the plugin waits one configured interval before retrying, and keep the state ENABLED so recovery is automatic on the next cycle. Co-Authored-By: Claude Sonnet 4.6 * fix(plugin_manager): address PR review — failure timestamp and error context - Use time.time() at the point of failure instead of reusing current_time (captured before execution), so the full retry interval always elapses after a timeout rather than one execution-duration shorter - Add PluginStateManager.set_error_info() to persist structured error context without changing plugin state; call it in both failure branches so get_error_info() / get_state_info() surface recoverable errors alongside ERROR-state errors - Add warning log on the success=False branch (was previously silent) - Pass a descriptive Exception (not a generic "Plugin execution failed") to health_tracker.record_failure() in the timeout/executor-error path Co-Authored-By: Claude Sonnet 4.6 * fix(plugin_manager): atomic state+error write via set_state_with_error The two-step set_state() / set_error_info() sequence left a window where readers could observe ENABLED state without the accompanying error context. Add threading.RLock to PluginStateManager and a new set_state_with_error() method that holds the lock for both the state-transition write and the _error_info write together. The method inlines the state-transition logic rather than calling set_state() internally to intentionally skip the "clear _error_info for non-ERROR states" side effect — the recoverable error dict is exactly what we want stored. Replace both paired set_state / set_error_info call sites in run_scheduled_updates() with the single atomic method. Co-Authored-By: Claude Sonnet 4.6 * fix(plugin_state): lock _error_info accesses and store defensive copies Three verified issues: - set_error_info wrote _error_info without holding _lock and stored the caller's dict by reference, allowing races and post-write mutation - set_state_with_error stored error_info by reference (lock was already held) - get_error_info read _error_info without _lock and returned the live reference, letting callers mutate the stored snapshot Implicit fourth fix: set_state also wrote _error_info without _lock; locking get_error_info while leaving that writer unguarded would have created a new race, so set_state is now wrapped in _lock too for consistency. Changes: - set_state: wrap entire body in self._lock (covers _states, _state_history, and _error_info writes atomically; ERROR-path _error_info value was already a fresh dict literal so no copy needed) - set_error_info: acquire self._lock + store dict(error_info) shallow copy - set_state_with_error: store dict(error_info) shallow copy (lock already held) - get_error_info: acquire self._lock + return dict(info) copy or None All stored values are flat dicts of strings/floats/bools, so shallow copy is sufficient — deepcopy is not needed. Co-Authored-By: Claude Sonnet 4.6 * fix(plugin_manager): apply recovery logic to update_all_plugins; extract helper update_all_plugins still set PluginState.ERROR on both failure paths, leaving it inconsistent with the run_scheduled_updates fix from the same PR. Extract _record_update_failure(plugin_id, exc=None) to hold all shared failure logic: capture actual failure time, build structured error_info, log the retry warning, stamp plugin_last_update, call set_state_with_error(ENABLED), and forward to health_tracker. Replace all four failure sites (two in run_scheduled_updates, two in update_all_plugins) with calls to this helper. Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Chuck Co-authored-by: Claude Sonnet 4.6 --- src/plugin_system/plugin_manager.py | 54 ++++++++--- src/plugin_system/plugin_state.py | 145 ++++++++++++++++++++-------- 2 files changed, 148 insertions(+), 51 deletions(-) diff --git a/src/plugin_system/plugin_manager.py b/src/plugin_system/plugin_manager.py index d6cc7139..c6baed69 100644 --- a/src/plugin_system/plugin_manager.py +++ b/src/plugin_system/plugin_manager.py @@ -677,6 +677,44 @@ class PluginManager: # Default: 60 seconds return 60.0 + def _record_update_failure( + self, + plugin_id: str, + exc: Optional[Exception] = None, + ) -> None: + """Apply the standard failure-recovery path for a plugin update. + + Stamps plugin_last_update with the actual failure time so the full + configured interval elapses before the next retry, then transitions + the plugin back to ENABLED (not ERROR) with structured error context + so automatic recovery happens on the next scheduled cycle. + + Args: + plugin_id: Plugin identifier + exc: The exception that caused the failure, if any. When None a + synthetic ExecutionFailure exception is constructed from the + timeout/executor-error path. + """ + failure_time = time.time() + if exc is not None: + err: Exception = exc + error_type = type(exc).__name__ + else: + err = Exception(f"Plugin {plugin_id} execution failed (timeout or executor error)") + error_type = 'ExecutionFailure' + + error_info = { + 'error': str(err), + 'error_type': error_type, + 'timestamp': failure_time, + 'recoverable': True, + } + self.logger.warning("Plugin %s update() failed; will retry after interval", plugin_id) + self.plugin_last_update[plugin_id] = failure_time + self.state_manager.set_state_with_error(plugin_id, PluginState.ENABLED, error_info, error=err) + if self.health_tracker: + self.health_tracker.record_failure(plugin_id, err) + def run_scheduled_updates(self, current_time: Optional[float] = None) -> None: """ Trigger plugin updates based on their defined update intervals. @@ -734,16 +772,10 @@ class PluginManager: if self.health_tracker: self.health_tracker.record_success(plugin_id) else: - # Execution failed (timeout or error) - self.state_manager.set_state(plugin_id, PluginState.ERROR) - if self.health_tracker: - self.health_tracker.record_failure(plugin_id, Exception("Plugin execution failed")) + self._record_update_failure(plugin_id) except Exception as exc: # pylint: disable=broad-except self.logger.exception("Error updating plugin %s: %s", plugin_id, exc) - self.state_manager.set_state(plugin_id, PluginState.ERROR, error=exc) - # Record failure - if self.health_tracker: - self.health_tracker.record_failure(plugin_id, exc) + self._record_update_failure(plugin_id, exc=exc) def update_all_plugins(self) -> None: """ @@ -769,14 +801,12 @@ class PluginManager: if success: self.plugin_last_update[plugin_id] = time.time() self.state_manager.record_update(plugin_id) - # Update state back to ENABLED self.state_manager.set_state(plugin_id, PluginState.ENABLED) else: - # Execution failed - self.state_manager.set_state(plugin_id, PluginState.ERROR) + self._record_update_failure(plugin_id) except Exception as exc: # pylint: disable=broad-except self.logger.exception("Error updating plugin %s: %s", plugin_id, exc) - self.state_manager.set_state(plugin_id, PluginState.ERROR, error=exc) + self._record_update_failure(plugin_id, exc=exc) def get_plugin_health_metrics(self) -> Dict[str, Any]: """ diff --git a/src/plugin_system/plugin_state.py b/src/plugin_system/plugin_state.py index f9332ff9..269bb423 100644 --- a/src/plugin_system/plugin_state.py +++ b/src/plugin_system/plugin_state.py @@ -5,6 +5,7 @@ Manages plugin state machine (loaded → enabled → running → error) with state transitions and queries. """ +import threading from enum import Enum from typing import Optional, Dict, Any from datetime import datetime @@ -34,6 +35,7 @@ class PluginStateManager: logger: Optional logger instance """ self.logger = logger or get_logger(__name__) + self._lock = threading.RLock() self._states: Dict[str, PluginState] = {} self._state_history: Dict[str, list] = {} self._error_info: Dict[str, Dict[str, Any]] = {} @@ -48,44 +50,44 @@ class PluginStateManager: ) -> None: """ Set plugin state and record transition. - + Args: plugin_id: Plugin identifier state: New state error: Optional error if transitioning to ERROR state """ - old_state = self._states.get(plugin_id, PluginState.UNLOADED) - self._states[plugin_id] = state - - # Record state transition - if plugin_id not in self._state_history: - self._state_history[plugin_id] = [] - - transition = { - 'timestamp': datetime.now(), - 'from': old_state.value, - 'to': state.value, - 'error': str(error) if error else None - } - self._state_history[plugin_id].append(transition) - - # Store error info if transitioning to ERROR state - if state == PluginState.ERROR and error: - self._error_info[plugin_id] = { - 'error': str(error), - 'error_type': type(error).__name__, - 'timestamp': datetime.now() + with self._lock: + old_state = self._states.get(plugin_id, PluginState.UNLOADED) + self._states[plugin_id] = state + + if plugin_id not in self._state_history: + self._state_history[plugin_id] = [] + + transition = { + 'timestamp': datetime.now(), + 'from': old_state.value, + 'to': state.value, + 'error': str(error) if error else None } - elif state != PluginState.ERROR: - # Clear error info when leaving ERROR state - self._error_info.pop(plugin_id, None) - - self.logger.debug( - "Plugin %s state transition: %s → %s", - plugin_id, - old_state.value, - state.value - ) + self._state_history[plugin_id].append(transition) + + # Store error info if transitioning to ERROR state + if state == PluginState.ERROR and error: + self._error_info[plugin_id] = { + 'error': str(error), + 'error_type': type(error).__name__, + 'timestamp': datetime.now() + } + elif state != PluginState.ERROR: + # Clear error info when leaving ERROR state + self._error_info.pop(plugin_id, None) + + self.logger.debug( + "Plugin %s state transition: %s → %s", + plugin_id, + old_state.value, + state.value + ) def get_state(self, plugin_id: str) -> PluginState: """ @@ -136,17 +138,82 @@ class PluginStateManager: """ return self._state_history.get(plugin_id, []) - def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]: + def set_error_info(self, plugin_id: str, error_info: Dict[str, Any]) -> None: """ - Get error information for a plugin in ERROR state. - + Persist structured error context without changing plugin state. + + Used for recoverable failures (e.g. update timeout) where the plugin + stays ENABLED but the error details should remain queryable. + Args: plugin_id: Plugin identifier - - Returns: - Error information dict or None + error_info: Arbitrary dict describing the error """ - return self._error_info.get(plugin_id) + with self._lock: + self._error_info[plugin_id] = dict(error_info) + + def set_state_with_error( + self, + plugin_id: str, + state: PluginState, + error_info: Dict[str, Any], + error: Optional[Exception] = None, + ) -> None: + """Set plugin state and persist error context atomically. + + Unlike calling set_state() then set_error_info() separately, this + method holds ``_lock`` for both writes so no reader can observe the + new state without the accompanying error context. + + Intentionally does not clear ``_error_info`` the way set_state() does + for non-ERROR transitions — this is the recoverable-failure path where + the error dict is the entire point. + + Args: + plugin_id: Plugin identifier + state: New state + error_info: Structured error dict to persist alongside the state + error: Optional exception recorded in the transition history + """ + with self._lock: + old_state = self._states.get(plugin_id, PluginState.UNLOADED) + self._states[plugin_id] = state + + if plugin_id not in self._state_history: + self._state_history[plugin_id] = [] + self._state_history[plugin_id].append({ + 'timestamp': datetime.now(), + 'from': old_state.value, + 'to': state.value, + 'error': str(error) if error else None, + }) + + self._error_info[plugin_id] = dict(error_info) + + self.logger.debug( + "Plugin %s state transition: %s → %s (recoverable error stored)", + plugin_id, + old_state.value, + state.value, + ) + + def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]: + """ + Get error information for a plugin. + + Returns the stored error dict whether the plugin is in ERROR state or + still ENABLED after a recoverable failure. Returns a shallow copy so + callers cannot mutate the stored snapshot. + + Args: + plugin_id: Plugin identifier + + Returns: + Copy of the error information dict, or None + """ + with self._lock: + info = self._error_info.get(plugin_id) + return dict(info) if info is not None else None def record_update(self, plugin_id: str) -> None: """Record that plugin update() was called."""