fix(plugin_manager): address PR review — failure timestamp and error context

- Use time.time() at the point of failure instead of reusing current_time
  (captured before execution), so the full retry interval always elapses
  after a timeout rather than one execution-duration shorter

- Add PluginStateManager.set_error_info() to persist structured error context
  without changing plugin state; call it in both failure branches so
  get_error_info() / get_state_info() surface recoverable errors alongside
  ERROR-state errors

- Add warning log on the success=False branch (was previously silent)

- Pass a descriptive Exception (not a generic "Plugin execution failed") to
  health_tracker.record_failure() in the timeout/executor-error path

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Chuck
2026-04-28 14:53:23 -04:00
parent d0969ad57a
commit 37566d93ac
2 changed files with 44 additions and 11 deletions

View File

@@ -734,18 +734,35 @@ class PluginManager:
if self.health_tracker:
self.health_tracker.record_success(plugin_id)
else:
# Execution failed (timeout or error) — update timestamp so the
# plugin waits one full interval before retrying, but keep state
# ENABLED so can_execute() returns True and recovery is automatic.
self.plugin_last_update[plugin_id] = current_time
# Execution failed (timeout or executor error) — stamp with the
# actual failure time (not current_time captured before execution)
# so the full interval elapses before the next retry.
failure_time = time.time()
err = Exception(f"Plugin {plugin_id} execution failed (timeout or executor error)")
error_info = {
'error': str(err),
'error_type': 'ExecutionFailure',
'timestamp': failure_time,
'recoverable': True,
}
self.logger.warning("Plugin %s update() failed; will retry after interval", plugin_id)
self.plugin_last_update[plugin_id] = failure_time
self.state_manager.set_state(plugin_id, PluginState.ENABLED)
self.state_manager.set_error_info(plugin_id, error_info)
if self.health_tracker:
self.health_tracker.record_failure(plugin_id, Exception("Plugin execution failed"))
self.health_tracker.record_failure(plugin_id, err)
except Exception as exc: # pylint: disable=broad-except
failure_time = time.time()
self.logger.exception("Error updating plugin %s: %s", plugin_id, exc)
# Same as the failure path above: stay ENABLED and wait one interval.
self.plugin_last_update[plugin_id] = current_time
error_info = {
'error': str(exc),
'error_type': type(exc).__name__,
'timestamp': failure_time,
'recoverable': True,
}
self.plugin_last_update[plugin_id] = failure_time
self.state_manager.set_state(plugin_id, PluginState.ENABLED)
self.state_manager.set_error_info(plugin_id, error_info)
if self.health_tracker:
self.health_tracker.record_failure(plugin_id, exc)

View File

@@ -136,9 +136,25 @@ class PluginStateManager:
"""
return self._state_history.get(plugin_id, [])
def set_error_info(self, plugin_id: str, error_info: Dict[str, Any]) -> None:
"""
Persist structured error context without changing plugin state.
Used for recoverable failures (e.g. update timeout) where the plugin
stays ENABLED but the error details should remain queryable.
Args:
plugin_id: Plugin identifier
error_info: Arbitrary dict describing the error
"""
self._error_info[plugin_id] = error_info
def get_error_info(self, plugin_id: str) -> Optional[Dict[str, Any]]:
"""
Get error information for a plugin in ERROR state.
Get error information for a plugin.
Returns the stored error dict whether the plugin is in ERROR state or
still ENABLED after a recoverable failure.
Args:
plugin_id: Plugin identifier