fix(plugins): stop reconciliation install loop, speed up plugin list, and prevent uninstall resurrection

Three interacting bugs reported by a user (Discord/ericepe) on a fresh install:

1. The state reconciler retried failed auto-repairs on every HTTP request,
   pegging the CPU and flooding the logs with "Plugin not found in registry"
   errors for the legacy ``github`` and ``youtube`` entries. Root cause:
   ``_run_startup_reconciliation`` reset ``_reconciliation_started`` to False
   on any unresolved inconsistency, so ``@app.before_request`` re-fired the
   entire pass on the next request. Fix: run reconciliation exactly once per
   process (see the guard sketch after this list); cache per-plugin
   unrecoverable failures inside the reconciler so even an explicit
   re-trigger stays cheap; add a registry pre-check that skips the expensive
   GitHub fetch when we already know the plugin is missing; expose
   ``force=True`` on ``/plugins/state/reconcile`` so users can retry after
   fixing the underlying issue.

2. Uninstalling a plugin via the UI succeeded, but the plugin reappeared.
   Root cause: a race between ``store_manager.uninstall_plugin`` (removes
   files) and ``cleanup_plugin_config`` (removes the config entry); if
   reconciliation fired in the gap, it saw "config entry with no files" and
   reinstalled. Fix: reorder uninstall to clean the config FIRST, drop a
   short-lived "recently uninstalled" tombstone on the store manager that
   the reconciler honors (the reordered flow is sketched after this list),
   and pass ``store_manager`` to the manual ``/plugins/state/reconcile``
   endpoint (it was previously omitted, which silently disabled auto-repair
   entirely).

3. ``GET /plugins/installed`` was very slow on a Pi 4 (the UI hung on
   "connecting to display" for minutes at ~98% CPU). Root causes: a
   ``discover_plugins()`` call plus a manifest re-read on every request, and
   four ``git`` subprocesses per plugin (``rev-parse``, ``--abbrev-ref``,
   ``config``, ``log``). Fix: mtime-gate ``discover_plugins()`` (sketched
   after this list) and drop the per-plugin manifest re-read in the
   endpoint; cache ``_get_local_git_info`` keyed on the ``.git/HEAD`` mtime
   so the subprocesses only run when the working copy actually moved; bump
   the registry cache TTL from 5 to 15 minutes and fall back to the stale
   cache on transient network failure.
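
For reference, a minimal sketch of the run-once guard from item 1. Only
``_run_startup_reconciliation`` and ``_reconciliation_started`` are names
from the existing code; the ``app``, ``reconciler``, and ``logger`` wiring
is assumed here:

    _reconciliation_started = False

    @app.before_request
    def _run_startup_reconciliation():
        global _reconciliation_started
        if _reconciliation_started:
            return
        # Set the flag up-front and never reset it, even when inconsistencies
        # remain unresolved; retries go through POST /plugins/state/reconcile.
        _reconciliation_started = True
        try:
            reconciler.reconcile_state()
        except Exception:
            logger.exception("Startup reconciliation failed")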
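
A sketch of the reordered uninstall path from item 2. The route and response
shape are assumptions; ``cleanup_plugin_config``, ``uninstall_plugin``, and
``mark_recently_uninstalled`` are the calls named above:

    @app.route('/plugins/<plugin_id>/uninstall', methods=['POST'])
    def uninstall_plugin_endpoint(plugin_id):
        # 1. Remove the config entry first so a reconciliation pass landing
        #    mid-uninstall never sees "config entry with no files on disk".
        cleanup_plugin_config(plugin_id)
        # 2. Drop the tombstone the reconciler honors for the next 5 minutes.
        store_manager.mark_recently_uninstalled(plugin_id)
        # 3. Only then remove the files.
        success = store_manager.uninstall_plugin(plugin_id)
        return jsonify({'success': success})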
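
And a sketch of the discovery gating from item 3. ``_discovery_cache`` and
``_scan_plugins_dir`` are placeholder names for illustration; only
``discover_plugins()`` itself comes from the codebase:

    def discover_plugins(self):
        # Re-run the expensive directory walk and manifest reads only when
        # the plugins directory itself changed since the last call.
        dir_mtime = self.plugins_dir.stat().st_mtime
        if self._discovery_cache and self._discovery_cache[0] == dir_mtime:
            return self._discovery_cache[1]
        plugins = self._scan_plugins_dir()
        self._discovery_cache = (dir_mtime, plugins)
        return plugins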

Tests: 16 reconciliation cases (including 5 new ones covering the
unrecoverable cache, force-reconcile path, transient-failure handling, and
recently-uninstalled tombstone) and 8 new store_manager cache tests
covering tombstone TTL, git-info mtime cache hit/miss, and the registry
stale-cache fallback. All 24 pass; the broader 288-test suite continues to
pass with no new failures.
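
As an illustration of the tombstone coverage, this is the shape of one of the
new tests (test name and fixture are assumed, not copied from the suite):

    def test_uninstall_tombstone_expires_after_ttl(store_manager):
        store_manager.mark_recently_uninstalled('example-plugin')
        assert store_manager.was_recently_uninstalled('example-plugin')
        # Age the tombstone past the TTL; it must no longer block auto-repair.
        store_manager._uninstall_tombstones['example-plugin'] -= (
            store_manager._uninstall_tombstone_ttl + 1
        )
        assert not store_manager.was_recently_uninstalled('example-plugin')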

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Chuck
2026-04-08 09:46:55 -04:00
parent 781224591f
commit b031b99130
6 changed files with 506 additions and 50 deletions


@@ -86,16 +86,38 @@ class StateReconciliation:
self.plugins_dir = Path(plugins_dir)
self.store_manager = store_manager
self.logger = get_logger(__name__)
# Plugin IDs that failed auto-repair and should NOT be retried this
# process lifetime. Prevents the infinite "attempt to reinstall missing
# plugin" loop when a config entry references a plugin that isn't in
# the registry (e.g. legacy 'github', 'youtube' entries). A process
# restart — or an explicit user-initiated reconcile with force=True —
# clears this so recovery is possible after the underlying issue is
# fixed.
self._unrecoverable_missing_on_disk: set = set()
def reconcile_state(self) -> ReconciliationResult:
def reconcile_state(self, force: bool = False) -> ReconciliationResult:
"""
Perform state reconciliation.
Compares state from all sources and fixes safe inconsistencies.
Args:
force: If True, clear the unrecoverable-plugin cache before
reconciling so previously-failed auto-repairs are retried.
Intended for user-initiated reconcile requests after the
underlying issue (e.g. registry update) has been fixed.
Returns:
ReconciliationResult with findings and fixes
"""
if force and self._unrecoverable_missing_on_disk:
self.logger.info(
"Force reconcile requested; clearing %d cached unrecoverable plugin(s)",
len(self._unrecoverable_missing_on_disk),
)
self._unrecoverable_missing_on_disk.clear()
self.logger.info("Starting state reconciliation")
inconsistencies = []
@@ -280,7 +302,26 @@ class StateReconciliation:
# Check: Plugin in config but not on disk
if config.get('exists_in_config') and not disk.get('exists_on_disk'):
can_repair = self.store_manager is not None
# Skip plugins that previously failed auto-repair in this process.
# Re-attempting wastes CPU (network + git clone each request) and
# spams the logs with the same "Plugin not found in registry"
# error. The entry is still surfaced as MANUAL_FIX_REQUIRED so the
# UI can show it, but no auto-repair will run.
previously_unrecoverable = plugin_id in self._unrecoverable_missing_on_disk
# Also refuse to re-install a plugin that the user just uninstalled
# through the UI — prevents a race where the reconciler fires
# between file removal and config cleanup and resurrects the
# plugin the user just deleted.
recently_uninstalled = (
self.store_manager is not None
and hasattr(self.store_manager, 'was_recently_uninstalled')
and self.store_manager.was_recently_uninstalled(plugin_id)
)
can_repair = (
self.store_manager is not None
and not previously_unrecoverable
and not recently_uninstalled
)
inconsistencies.append(Inconsistency(
plugin_id=plugin_id,
inconsistency_type=InconsistencyType.PLUGIN_MISSING_ON_DISK,
@@ -342,7 +383,13 @@ class StateReconciliation:
return False
def _auto_repair_missing_plugin(self, plugin_id: str) -> bool:
"""Attempt to reinstall a missing plugin from the store."""
"""Attempt to reinstall a missing plugin from the store.
On failure, records plugin_id in ``_unrecoverable_missing_on_disk`` so
subsequent reconciliation passes within this process do not retry and
spam the log / CPU. A process restart (or an explicit ``force=True``
reconcile) is required to clear the cache.
"""
if not self.store_manager:
return False
@@ -351,6 +398,35 @@ class StateReconciliation:
if plugin_id.startswith('ledmatrix-'):
candidates.append(plugin_id[len('ledmatrix-'):])
# Cheap pre-check: is any candidate actually present in the registry
# at all? If not, we know up-front this is unrecoverable and can skip
# the expensive install_plugin path (which does a forced GitHub fetch
# before failing).
registry_has_candidate = False
try:
registry = self.store_manager.fetch_registry()
registry_ids = {
p.get('id') for p in (registry.get('plugins', []) or []) if p.get('id')
}
registry_has_candidate = any(c in registry_ids for c in candidates)
except Exception as e:
# If we can't reach the registry, treat this as transient — don't
# mark unrecoverable, let the next pass try again.
self.logger.warning(
"[AutoRepair] Could not read registry to check %s: %s", plugin_id, e
)
return False
if not registry_has_candidate:
self.logger.warning(
"[AutoRepair] %s not present in registry; marking unrecoverable "
"(will not retry this session). Reinstall from the Plugin Store "
"or remove the stale config entry to clear this warning.",
plugin_id,
)
self._unrecoverable_missing_on_disk.add(plugin_id)
return False
for candidate_id in candidates:
try:
self.logger.info("[AutoRepair] Attempting to reinstall missing plugin: %s", candidate_id)
@@ -366,6 +442,11 @@ class StateReconciliation:
except Exception as e:
self.logger.error("[AutoRepair] Error reinstalling %s: %s", candidate_id, e, exc_info=True)
self.logger.warning("[AutoRepair] Could not reinstall %s from store", plugin_id)
self.logger.warning(
"[AutoRepair] Could not reinstall %s from store; marking unrecoverable "
"(will not retry this session).",
plugin_id,
)
self._unrecoverable_missing_on_disk.add(plugin_id)
return False
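
The endpoint side of the reconcile changes is not part of this hunk; a
hypothetical sketch of how ``force`` and ``store_manager`` are threaded
through (the route is real, the handler wiring and response shape are
assumed):

    @app.route('/plugins/state/reconcile', methods=['POST'])
    def reconcile_plugin_state():
        payload = request.get_json(silent=True) or {}
        force = bool(payload.get('force', False))
        # store_manager must be supplied, otherwise can_repair is always
        # False and auto-repair is silently disabled.
        reconciler = StateReconciliation(plugins_dir, store_manager=store_manager)
        result = reconciler.reconcile_state(force=force)
        # Response serialization elided; the real handler reports result's
        # findings and fixes back to the UI.
        return jsonify({'success': True})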


@@ -53,7 +53,12 @@ class PluginStoreManager:
self.registry_cache_time = None # Timestamp of when registry was cached
self.github_cache = {} # Cache for GitHub API responses
self.cache_timeout = 3600 # 1 hour cache timeout
self.registry_cache_timeout = 300 # 5 minutes for registry cache
# 15 minutes for registry cache. Long enough that the plugin list
# endpoint on a warm cache never hits the network, short enough that
# new plugins show up within a reasonable window. See also the
# stale-cache fallback in fetch_registry for transient network
# failures.
self.registry_cache_timeout = 900
self.commit_info_cache = {} # Cache for latest commit info: {key: (timestamp, data)}
self.commit_cache_timeout = 300 # 5 minutes
self.manifest_cache = {} # Cache for GitHub manifest fetches: {key: (timestamp, data)}
@@ -62,9 +67,38 @@ class PluginStoreManager:
self._token_validation_cache = {} # Cache for token validation results: {token: (is_valid, timestamp, error_message)}
self._token_validation_cache_timeout = 300 # 5 minutes cache for token validation
# Per-plugin tombstone timestamps for plugins that were uninstalled
# recently via the UI. Used by the state reconciler to avoid
# resurrecting a plugin the user just deleted when reconciliation
# races against the uninstall operation. Cleared after ``_uninstall_tombstone_ttl``.
self._uninstall_tombstones: Dict[str, float] = {}
self._uninstall_tombstone_ttl = 300 # 5 minutes
# Cache for _get_local_git_info: {plugin_path_str: (head_mtime, data)}
# Keyed on the mtime of .git/HEAD so we re-run git only when the
# working copy actually moved. Before this cache, every
# /plugins/installed request fired 4 git subprocesses per plugin,
# which pegged the CPU on a Pi4 with a dozen plugins.
self._git_info_cache: Dict[str, tuple] = {}
# Ensure plugins directory exists
self.plugins_dir.mkdir(exist_ok=True)
def mark_recently_uninstalled(self, plugin_id: str) -> None:
"""Record that ``plugin_id`` was just uninstalled by the user."""
self._uninstall_tombstones[plugin_id] = time.time()
def was_recently_uninstalled(self, plugin_id: str) -> bool:
"""Return True if ``plugin_id`` has an active uninstall tombstone."""
ts = self._uninstall_tombstones.get(plugin_id)
if ts is None:
return False
if time.time() - ts > self._uninstall_tombstone_ttl:
# Expired — clean up so the dict doesn't grow unbounded.
self._uninstall_tombstones.pop(plugin_id, None)
return False
return True
def _load_github_token(self) -> Optional[str]:
"""
Load GitHub API token from config_secrets.json if available.
@@ -469,9 +503,16 @@ class PluginStoreManager:
return self.registry_cache
except requests.RequestException as e:
self.logger.error(f"Error fetching registry: {e}")
# Prefer stale cache over an empty list so the plugin list UI
# keeps working on a flaky connection (e.g. Pi on WiFi).
if self.registry_cache:
self.logger.warning("Falling back to stale registry cache")
return self.registry_cache
return {"plugins": []}
except json.JSONDecodeError as e:
self.logger.error(f"Error parsing registry JSON: {e}")
if self.registry_cache:
return self.registry_cache
return {"plugins": []}
def search_plugins(self, query: str = "", category: str = "", tags: List[str] = None, fetch_commit_info: bool = True, include_saved_repos: bool = True, saved_repositories_manager = None) -> List[Dict]:
@@ -1561,11 +1602,29 @@ class PluginStoreManager:
return False
def _get_local_git_info(self, plugin_path: Path) -> Optional[Dict[str, str]]:
"""Return local git branch, commit hash, and commit date if the plugin is a git checkout."""
"""Return local git branch, commit hash, and commit date if the plugin is a git checkout.
Results are cached keyed on the mtime of ``.git/HEAD`` so repeated
calls (e.g. one per plugin on every ``/plugins/installed`` request)
skip the four ``git`` subprocesses when nothing has changed.
"""
git_dir = plugin_path / '.git'
if not git_dir.exists():
return None
# mtime-gated cache lookup.
head_file = git_dir / 'HEAD'
cache_key = str(plugin_path)
try:
head_mtime = head_file.stat().st_mtime
except OSError:
head_mtime = None
if head_mtime is not None:
cached = self._git_info_cache.get(cache_key)
if cached is not None and cached[0] == head_mtime:
return cached[1]
try:
sha_result = subprocess.run(
['git', '-C', str(plugin_path), 'rev-parse', 'HEAD'],
@@ -1623,6 +1682,8 @@ class PluginStoreManager:
result['date_iso'] = commit_date_iso
result['date'] = self._iso_to_date(commit_date_iso)
if head_mtime is not None:
self._git_info_cache[cache_key] = (head_mtime, result)
return result
except subprocess.CalledProcessError as err:
self.logger.debug(f"Failed to read git info for {plugin_path.name}: {err}")