diff --git a/src/plugin_system/state_reconciliation.py b/src/plugin_system/state_reconciliation.py index ca381cfd..af804ec9 100644 --- a/src/plugin_system/state_reconciliation.py +++ b/src/plugin_system/state_reconciliation.py @@ -8,7 +8,7 @@ Detects and fixes inconsistencies between: - State manager state """ -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, Set from dataclasses import dataclass from enum import Enum from pathlib import Path @@ -86,16 +86,38 @@ class StateReconciliation: self.plugins_dir = Path(plugins_dir) self.store_manager = store_manager self.logger = get_logger(__name__) + + # Plugin IDs that failed auto-repair and should NOT be retried this + # process lifetime. Prevents the infinite "attempt to reinstall missing + # plugin" loop when a config entry references a plugin that isn't in + # the registry (e.g. legacy 'github', 'youtube' entries). A process + # restart — or an explicit user-initiated reconcile with force=True — + # clears this so recovery is possible after the underlying issue is + # fixed. + self._unrecoverable_missing_on_disk: Set[str] = set() - def reconcile_state(self) -> ReconciliationResult: + def reconcile_state(self, force: bool = False) -> ReconciliationResult: """ Perform state reconciliation. - + Compares state from all sources and fixes safe inconsistencies. - + + Args: + force: If True, clear the unrecoverable-plugin cache before + reconciling so previously-failed auto-repairs are retried. + Intended for user-initiated reconcile requests after the + underlying issue (e.g. registry update) has been fixed. + Returns: ReconciliationResult with findings and fixes """ + if force and self._unrecoverable_missing_on_disk: + self.logger.info( + "Force reconcile requested; clearing %d cached unrecoverable plugin(s)", + len(self._unrecoverable_missing_on_disk), + ) + self._unrecoverable_missing_on_disk.clear() + self.logger.info("Starting state reconciliation") inconsistencies = [] @@ -280,7 +302,26 @@ class StateReconciliation: # Check: Plugin in config but not on disk if config.get('exists_in_config') and not disk.get('exists_on_disk'): - can_repair = self.store_manager is not None + # Skip plugins that previously failed auto-repair in this process. + # Re-attempting wastes CPU (network + git clone each request) and + # spams the logs with the same "Plugin not found in registry" + # error. The entry is still surfaced as MANUAL_FIX_REQUIRED so the + # UI can show it, but no auto-repair will run. + previously_unrecoverable = plugin_id in self._unrecoverable_missing_on_disk + # Also refuse to re-install a plugin that the user just uninstalled + # through the UI — prevents a race where the reconciler fires + # between file removal and config cleanup and resurrects the + # plugin the user just deleted. + recently_uninstalled = ( + self.store_manager is not None + and hasattr(self.store_manager, 'was_recently_uninstalled') + and self.store_manager.was_recently_uninstalled(plugin_id) + ) + can_repair = ( + self.store_manager is not None + and not previously_unrecoverable + and not recently_uninstalled + ) inconsistencies.append(Inconsistency( plugin_id=plugin_id, inconsistency_type=InconsistencyType.PLUGIN_MISSING_ON_DISK, @@ -342,7 +383,13 @@ class StateReconciliation: return False def _auto_repair_missing_plugin(self, plugin_id: str) -> bool: - """Attempt to reinstall a missing plugin from the store.""" + """Attempt to reinstall a missing plugin from the store. + + On failure, records plugin_id in ``_unrecoverable_missing_on_disk`` so + subsequent reconciliation passes within this process do not retry and + spam the log / CPU. A process restart (or an explicit ``force=True`` + reconcile) is required to clear the cache. + """ if not self.store_manager: return False @@ -351,6 +398,43 @@ class StateReconciliation: if plugin_id.startswith('ledmatrix-'): candidates.append(plugin_id[len('ledmatrix-'):]) + # Cheap pre-check: is any candidate actually present in the registry + # at all? If not, we know up-front this is unrecoverable and can skip + # the expensive install_plugin path (which does a forced GitHub fetch + # before failing). + # + # IMPORTANT: we must pass raise_on_failure=True here. The default + # fetch_registry() silently falls back to a stale cache or an empty + # dict on network failure, which would make it impossible to tell + # "plugin genuinely not in registry" from "I can't reach the + # registry right now" — in the second case we'd end up poisoning + # _unrecoverable_missing_on_disk with every config entry on a fresh + # boot with no cache. + registry_has_candidate = False + try: + registry = self.store_manager.fetch_registry(raise_on_failure=True) + registry_ids = { + p.get('id') for p in (registry.get('plugins', []) or []) if p.get('id') + } + registry_has_candidate = any(c in registry_ids for c in candidates) + except Exception as e: + # If we can't reach the registry, treat this as transient — don't + # mark unrecoverable, let the next pass try again. + self.logger.warning( + "[AutoRepair] Could not read registry to check %s: %s", plugin_id, e + ) + return False + + if not registry_has_candidate: + self.logger.warning( + "[AutoRepair] %s not present in registry; marking unrecoverable " + "(will not retry this session). Reinstall from the Plugin Store " + "or remove the stale config entry to clear this warning.", + plugin_id, + ) + self._unrecoverable_missing_on_disk.add(plugin_id) + return False + for candidate_id in candidates: try: self.logger.info("[AutoRepair] Attempting to reinstall missing plugin: %s", candidate_id) @@ -366,6 +450,11 @@ class StateReconciliation: except Exception as e: self.logger.error("[AutoRepair] Error reinstalling %s: %s", candidate_id, e, exc_info=True) - self.logger.warning("[AutoRepair] Could not reinstall %s from store", plugin_id) + self.logger.warning( + "[AutoRepair] Could not reinstall %s from store; marking unrecoverable " + "(will not retry this session).", + plugin_id, + ) + self._unrecoverable_missing_on_disk.add(plugin_id) return False diff --git a/src/plugin_system/store_manager.py b/src/plugin_system/store_manager.py index fba7dc36..d495463a 100644 --- a/src/plugin_system/store_manager.py +++ b/src/plugin_system/store_manager.py @@ -14,9 +14,10 @@ import zipfile import tempfile import requests import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from pathlib import Path -from typing import List, Dict, Optional, Any +from typing import List, Dict, Optional, Any, Tuple import logging from src.common.permission_utils import sudo_remove_directory @@ -52,19 +53,89 @@ class PluginStoreManager: self.registry_cache = None self.registry_cache_time = None # Timestamp of when registry was cached self.github_cache = {} # Cache for GitHub API responses - self.cache_timeout = 3600 # 1 hour cache timeout - self.registry_cache_timeout = 300 # 5 minutes for registry cache + self.cache_timeout = 3600 # 1 hour cache timeout (repo info: stars, default_branch) + # 15 minutes for registry cache. Long enough that the plugin list + # endpoint on a warm cache never hits the network, short enough that + # new plugins show up within a reasonable window. See also the + # stale-cache fallback in fetch_registry for transient network + # failures. + self.registry_cache_timeout = 900 self.commit_info_cache = {} # Cache for latest commit info: {key: (timestamp, data)} - self.commit_cache_timeout = 300 # 5 minutes (same as registry) + # 30 minutes for commit/manifest caches. Plugin Store users browse + # the catalog via /plugins/store/list which fetches commit info and + # manifest data per plugin. 5-min TTLs meant every fresh browse on + # a Pi4 paid for ~3 HTTP requests x N plugins (30-60s serial). 30 + # minutes keeps the cache warm across a realistic session while + # still picking up upstream updates within a reasonable window. + self.commit_cache_timeout = 1800 self.manifest_cache = {} # Cache for GitHub manifest fetches: {key: (timestamp, data)} - self.manifest_cache_timeout = 300 # 5 minutes + self.manifest_cache_timeout = 1800 self.github_token = self._load_github_token() self._token_validation_cache = {} # Cache for token validation results: {token: (is_valid, timestamp, error_message)} self._token_validation_cache_timeout = 300 # 5 minutes cache for token validation + # Per-plugin tombstone timestamps for plugins that were uninstalled + # recently via the UI. Used by the state reconciler to avoid + # resurrecting a plugin the user just deleted when reconciliation + # races against the uninstall operation. Cleared after ``_uninstall_tombstone_ttl``. + self._uninstall_tombstones: Dict[str, float] = {} + self._uninstall_tombstone_ttl = 300 # 5 minutes + + # Cache for _get_local_git_info: {plugin_path_str: (signature, data)} + # where ``signature`` is a tuple of (head_mtime, resolved_ref_mtime, + # head_contents) so a fast-forward update to the current branch + # (which touches .git/refs/heads/ but NOT .git/HEAD) still + # invalidates the cache. Before this cache, every + # /plugins/installed request fired 4 git subprocesses per plugin, + # which pegged the CPU on a Pi4 with a dozen plugins. The cached + # ``data`` dict is the same shape returned by ``_get_local_git_info`` + # itself (sha / short_sha / branch / optional remote_url, date_iso, + # date) — all string-keyed strings. + self._git_info_cache: Dict[str, Tuple[Tuple, Dict[str, str]]] = {} + + # How long to wait before re-attempting a failed GitHub metadata + # fetch after we've already served a stale cache hit. Without this, + # a single expired-TTL + network-error would cause every subsequent + # request to re-hit the network (and fail again) until the network + # actually came back — amplifying the failure and blocking request + # handlers. Bumping the cached-entry timestamp on failure serves + # the stale payload cheaply until the backoff expires. + self._failure_backoff_seconds = 60 + # Ensure plugins directory exists self.plugins_dir.mkdir(exist_ok=True) + def _record_cache_backoff(self, cache_dict: Dict, cache_key: str, + cache_timeout: int, payload: Any) -> None: + """Bump a cache entry's timestamp so subsequent lookups hit the + cache rather than re-failing over the network. + + Used by the stale-on-error fallbacks in the GitHub metadata fetch + paths. Without this, a cache entry whose TTL just expired would + cause every subsequent request to re-hit the network and fail + again until the network actually came back. We write a synthetic + timestamp ``(now + backoff - cache_timeout)`` so the cache-valid + check ``(now - ts) < cache_timeout`` succeeds for another + ``backoff`` seconds. + """ + synthetic_ts = time.time() + self._failure_backoff_seconds - cache_timeout + cache_dict[cache_key] = (synthetic_ts, payload) + + def mark_recently_uninstalled(self, plugin_id: str) -> None: + """Record that ``plugin_id`` was just uninstalled by the user.""" + self._uninstall_tombstones[plugin_id] = time.time() + + def was_recently_uninstalled(self, plugin_id: str) -> bool: + """Return True if ``plugin_id`` has an active uninstall tombstone.""" + ts = self._uninstall_tombstones.get(plugin_id) + if ts is None: + return False + if time.time() - ts > self._uninstall_tombstone_ttl: + # Expired — clean up so the dict doesn't grow unbounded. + self._uninstall_tombstones.pop(plugin_id, None) + return False + return True + def _load_github_token(self) -> Optional[str]: """ Load GitHub API token from config_secrets.json if available. @@ -308,7 +379,25 @@ class PluginStoreManager: if self.github_token: headers['Authorization'] = f'token {self.github_token}' - response = requests.get(api_url, headers=headers, timeout=10) + try: + response = requests.get(api_url, headers=headers, timeout=10) + except requests.RequestException as req_err: + # Network error: prefer a stale cache hit over an + # empty default so the UI keeps working on a flaky + # Pi WiFi link. Bump the cached entry's timestamp + # into a short backoff window so subsequent + # requests serve the stale payload cheaply instead + # of re-hitting the network on every request. + if cache_key in self.github_cache: + _, stale = self.github_cache[cache_key] + self._record_cache_backoff(self.github_cache, cache_key, self.cache_timeout, stale) + self.logger.warning( + "GitHub repo info fetch failed for %s (%s); serving stale cache.", + cache_key, req_err, + ) + return stale + raise + if response.status_code == 200: data = response.json() pushed_at = data.get('pushed_at', '') or data.get('updated_at', '') @@ -328,7 +417,20 @@ class PluginStoreManager: self.github_cache[cache_key] = (time.time(), repo_info) return repo_info elif response.status_code == 403: - # Rate limit or authentication issue + # Rate limit or authentication issue. If we have a + # previously-cached value, serve it rather than + # returning empty defaults — a stale star count is + # better than a reset to zero. Apply the same + # failure-backoff bump as the network-error path + # so we don't hammer the API with repeat requests + # while rate-limited. + if cache_key in self.github_cache: + _, stale = self.github_cache[cache_key] + self._record_cache_backoff(self.github_cache, cache_key, self.cache_timeout, stale) + self.logger.warning( + "GitHub API 403 for %s; serving stale cache.", cache_key, + ) + return stale if not self.github_token: self.logger.warning( f"GitHub API rate limit likely exceeded (403). " @@ -342,6 +444,10 @@ class PluginStoreManager: ) else: self.logger.warning(f"GitHub API request failed: {response.status_code} for {api_url}") + if cache_key in self.github_cache: + _, stale = self.github_cache[cache_key] + self._record_cache_backoff(self.github_cache, cache_key, self.cache_timeout, stale) + return stale return { 'stars': 0, @@ -442,23 +548,34 @@ class PluginStoreManager: self.logger.error(f"Error fetching registry from URL: {e}", exc_info=True) return None - def fetch_registry(self, force_refresh: bool = False) -> Dict: + def fetch_registry(self, force_refresh: bool = False, raise_on_failure: bool = False) -> Dict: """ Fetch the plugin registry from GitHub. - + Args: force_refresh: Force refresh even if cached - + raise_on_failure: If True, re-raise network / JSON errors instead + of silently falling back to stale cache / empty dict. UI + callers prefer the stale-fallback default so the plugin + list keeps working on flaky WiFi; the state reconciler + needs the explicit failure signal so it can distinguish + "plugin genuinely not in registry" from "I couldn't reach + the registry at all" and not mark everything unrecoverable. + Returns: Registry data with list of available plugins + + Raises: + requests.RequestException / json.JSONDecodeError when + ``raise_on_failure`` is True and the fetch fails. """ # Check if cache is still valid (within timeout) current_time = time.time() - if (self.registry_cache and self.registry_cache_time and - not force_refresh and + if (self.registry_cache and self.registry_cache_time and + not force_refresh and (current_time - self.registry_cache_time) < self.registry_cache_timeout): return self.registry_cache - + try: self.logger.info(f"Fetching plugin registry from {self.REGISTRY_URL}") response = self._http_get_with_retries(self.REGISTRY_URL, timeout=10) @@ -469,9 +586,30 @@ class PluginStoreManager: return self.registry_cache except requests.RequestException as e: self.logger.error(f"Error fetching registry: {e}") + if raise_on_failure: + raise + # Prefer stale cache over an empty list so the plugin list UI + # keeps working on a flaky connection (e.g. Pi on WiFi). Bump + # registry_cache_time into a short backoff window so the next + # request serves the stale payload cheaply instead of + # re-hitting the network on every request (matches the + # pattern used by github_cache / commit_info_cache). + if self.registry_cache: + self.logger.warning("Falling back to stale registry cache") + self.registry_cache_time = ( + time.time() + self._failure_backoff_seconds - self.registry_cache_timeout + ) + return self.registry_cache return {"plugins": []} except json.JSONDecodeError as e: self.logger.error(f"Error parsing registry JSON: {e}") + if raise_on_failure: + raise + if self.registry_cache: + self.registry_cache_time = ( + time.time() + self._failure_backoff_seconds - self.registry_cache_timeout + ) + return self.registry_cache return {"plugins": []} def search_plugins(self, query: str = "", category: str = "", tags: List[str] = None, fetch_commit_info: bool = True, include_saved_repos: bool = True, saved_repositories_manager = None) -> List[Dict]: @@ -517,68 +655,95 @@ class PluginStoreManager: except Exception as e: self.logger.warning(f"Failed to fetch plugins from saved repository {repo_url}: {e}") - results = [] + # First pass: apply cheap filters (category/tags/query) so we only + # fetch GitHub metadata for plugins that will actually be returned. + filtered: List[Dict] = [] for plugin in plugins: - # Category filter if category and plugin.get('category') != category: continue - - # Tags filter (match any tag) if tags and not any(tag in plugin.get('tags', []) for tag in tags): continue - - # Query search (case-insensitive) if query: query_lower = query.lower() searchable_text = ' '.join([ plugin.get('name', ''), plugin.get('description', ''), plugin.get('id', ''), - plugin.get('author', '') + plugin.get('author', ''), ]).lower() - if query_lower not in searchable_text: continue + filtered.append(plugin) - # Enhance plugin data with GitHub metadata + def _enrich(plugin: Dict) -> Dict: + """Enrich a single plugin with GitHub metadata. + + Called concurrently from a ThreadPoolExecutor. Each underlying + HTTP helper (``_get_github_repo_info`` / ``_get_latest_commit_info`` + / ``_fetch_manifest_from_github``) is thread-safe — they use + ``requests`` and write their own cache keys on Python dicts, + which is atomic under the GIL for single-key assignments. + """ enhanced_plugin = plugin.copy() - - # Get real GitHub stars repo_url = plugin.get('repo', '') - if repo_url: - github_info = self._get_github_repo_info(repo_url) - enhanced_plugin['stars'] = github_info.get('stars', plugin.get('stars', 0)) - enhanced_plugin['default_branch'] = github_info.get('default_branch', plugin.get('branch', 'main')) - enhanced_plugin['last_updated_iso'] = github_info.get('last_commit_iso') - enhanced_plugin['last_updated'] = github_info.get('last_commit_date') + if not repo_url: + return enhanced_plugin - if fetch_commit_info: - branch = plugin.get('branch') or github_info.get('default_branch', 'main') + github_info = self._get_github_repo_info(repo_url) + enhanced_plugin['stars'] = github_info.get('stars', plugin.get('stars', 0)) + enhanced_plugin['default_branch'] = github_info.get('default_branch', plugin.get('branch', 'main')) + enhanced_plugin['last_updated_iso'] = github_info.get('last_commit_iso') + enhanced_plugin['last_updated'] = github_info.get('last_commit_date') - commit_info = self._get_latest_commit_info(repo_url, branch) - if commit_info: - enhanced_plugin['last_commit'] = commit_info.get('short_sha') - enhanced_plugin['last_commit_sha'] = commit_info.get('sha') - enhanced_plugin['last_updated'] = commit_info.get('date') or enhanced_plugin.get('last_updated') - enhanced_plugin['last_updated_iso'] = commit_info.get('date_iso') or enhanced_plugin.get('last_updated_iso') - enhanced_plugin['last_commit_message'] = commit_info.get('message') - enhanced_plugin['last_commit_author'] = commit_info.get('author') - enhanced_plugin['branch'] = commit_info.get('branch', branch) - enhanced_plugin['last_commit_branch'] = commit_info.get('branch') + if fetch_commit_info: + branch = plugin.get('branch') or github_info.get('default_branch', 'main') - # Fetch manifest from GitHub for additional metadata (description, etc.) - plugin_subpath = plugin.get('plugin_path', '') - manifest_rel = f"{plugin_subpath}/manifest.json" if plugin_subpath else "manifest.json" - github_manifest = self._fetch_manifest_from_github(repo_url, branch, manifest_rel) - if github_manifest: - if 'last_updated' in github_manifest and not enhanced_plugin.get('last_updated'): - enhanced_plugin['last_updated'] = github_manifest['last_updated'] - if 'description' in github_manifest: - enhanced_plugin['description'] = github_manifest['description'] + commit_info = self._get_latest_commit_info(repo_url, branch) + if commit_info: + enhanced_plugin['last_commit'] = commit_info.get('short_sha') + enhanced_plugin['last_commit_sha'] = commit_info.get('sha') + enhanced_plugin['last_updated'] = commit_info.get('date') or enhanced_plugin.get('last_updated') + enhanced_plugin['last_updated_iso'] = commit_info.get('date_iso') or enhanced_plugin.get('last_updated_iso') + enhanced_plugin['last_commit_message'] = commit_info.get('message') + enhanced_plugin['last_commit_author'] = commit_info.get('author') + enhanced_plugin['branch'] = commit_info.get('branch', branch) + enhanced_plugin['last_commit_branch'] = commit_info.get('branch') - results.append(enhanced_plugin) + # Intentionally NO per-plugin manifest.json fetch here. + # The registry's plugins.json already carries ``description`` + # (it is generated from each plugin's manifest by + # ``update_registry.py``), and ``last_updated`` is filled in + # from the commit info above. An earlier implementation + # fetched manifest.json per plugin anyway, which meant one + # extra HTTPS round trip per result; on a Pi4 with a flaky + # WiFi link the tail retries of that one extra call + # (_http_get_with_retries does 3 attempts with exponential + # backoff) dominated wall time even after parallelization. - return results + return enhanced_plugin + + # Fan out the per-plugin GitHub enrichment. The previous + # implementation did this serially, which on a Pi4 with ~15 plugins + # and a fresh cache meant 30+ HTTP requests in strict sequence (the + # "connecting to display" hang reported by users). With a thread + # pool, latency is dominated by the slowest request rather than + # their sum. Workers capped at 10 to stay well under the + # unauthenticated GitHub rate limit burst and avoid overwhelming a + # Pi's WiFi link. For a small number of plugins the pool is + # essentially free. + if not filtered: + return [] + + # Not worth the pool overhead for tiny workloads. Parenthesized to + # make Python's default ``and`` > ``or`` precedence explicit: a + # single plugin, OR a small batch where we don't need commit info. + if (len(filtered) == 1) or ((not fetch_commit_info) and (len(filtered) < 4)): + return [_enrich(p) for p in filtered] + + max_workers = min(10, len(filtered)) + with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix='plugin-search') as executor: + # executor.map preserves input order, which the UI relies on. + return list(executor.map(_enrich, filtered)) def _fetch_manifest_from_github(self, repo_url: str, branch: str = "master", manifest_path: str = "manifest.json", force_refresh: bool = False) -> Optional[Dict]: """ @@ -676,7 +841,28 @@ class PluginStoreManager: last_error = None for branch_name in branches_to_try: api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch_name}" - response = requests.get(api_url, headers=headers, timeout=10) + try: + response = requests.get(api_url, headers=headers, timeout=10) + except requests.RequestException as req_err: + # Network failure: fall back to a stale cache hit if + # available so the plugin store UI keeps populating + # commit info on a flaky WiFi link. Bump the cached + # timestamp into the backoff window so we don't + # re-retry on every request. + if cache_key in self.commit_info_cache: + _, stale = self.commit_info_cache[cache_key] + if stale is not None: + self._record_cache_backoff( + self.commit_info_cache, cache_key, + self.commit_cache_timeout, stale, + ) + self.logger.warning( + "GitHub commit fetch failed for %s (%s); serving stale cache.", + cache_key, req_err, + ) + return stale + last_error = str(req_err) + continue if response.status_code == 200: commit_data = response.json() commit_sha_full = commit_data.get('sha', '') @@ -706,7 +892,23 @@ class PluginStoreManager: if last_error: self.logger.debug(f"Unable to fetch commit info for {repo_url}: {last_error}") - # Cache negative result to avoid repeated failing calls + # All branches returned a non-200 response (e.g. 404 on every + # candidate, or a transient 5xx). If we already had a good + # cached value, prefer serving that — overwriting it with + # None here would wipe out commit info the UI just showed + # on the previous request. Bump the timestamp into the + # backoff window so subsequent lookups hit the cache. + if cache_key in self.commit_info_cache: + _, prior = self.commit_info_cache[cache_key] + if prior is not None: + self._record_cache_backoff( + self.commit_info_cache, cache_key, + self.commit_cache_timeout, prior, + ) + return prior + + # No prior good value — cache the negative result so we don't + # hammer a plugin that genuinely has no reachable commits. self.commit_info_cache[cache_key] = (time.time(), None) except Exception as e: @@ -1560,12 +1762,93 @@ class PluginStoreManager: self.logger.error(f"Unexpected error installing dependencies for {plugin_path.name}: {e}", exc_info=True) return False + def _git_cache_signature(self, git_dir: Path) -> Optional[Tuple]: + """Build a cache signature that invalidates on the kind of updates + a plugin user actually cares about. + + Caching on ``.git/HEAD`` mtime alone is not enough: a ``git pull`` + that fast-forwards the current branch updates + ``.git/refs/heads/`` (or ``.git/packed-refs``) but leaves + HEAD's contents and mtime untouched. And the cached ``result`` + dict includes ``remote_url`` — a value read from ``.git/config`` — + so a config-only change (e.g. a monorepo-migration re-pointing + ``remote.origin.url``) must also invalidate the cache. + + Signature components: + - HEAD contents (catches detach / branch switch) + - HEAD mtime + - if HEAD points at a ref, that ref file's mtime (catches + fast-forward / reset on the current branch) + - packed-refs mtime as a coarse fallback for repos using packed refs + - .git/config contents + mtime (catches remote URL changes and + any other config-only edit that affects what the cached + ``remote_url`` field should contain) + + Returns ``None`` if HEAD cannot be read at all (caller will skip + the cache and take the slow path). + """ + head_file = git_dir / 'HEAD' + try: + head_mtime = head_file.stat().st_mtime + head_contents = head_file.read_text(encoding='utf-8', errors='replace').strip() + except OSError: + return None + + ref_mtime = None + if head_contents.startswith('ref: '): + ref_path = head_contents[len('ref: '):].strip() + # ``ref_path`` looks like ``refs/heads/main``. It lives either + # as a loose file under .git/ or inside .git/packed-refs. + loose_ref = git_dir / ref_path + try: + ref_mtime = loose_ref.stat().st_mtime + except OSError: + ref_mtime = None + + packed_refs_mtime = None + if ref_mtime is None: + try: + packed_refs_mtime = (git_dir / 'packed-refs').stat().st_mtime + except OSError: + packed_refs_mtime = None + + config_mtime = None + config_contents = None + config_file = git_dir / 'config' + try: + config_mtime = config_file.stat().st_mtime + config_contents = config_file.read_text(encoding='utf-8', errors='replace').strip() + except OSError: + config_mtime = None + config_contents = None + + return ( + head_contents, head_mtime, + ref_mtime, packed_refs_mtime, + config_contents, config_mtime, + ) + def _get_local_git_info(self, plugin_path: Path) -> Optional[Dict[str, str]]: - """Return local git branch, commit hash, and commit date if the plugin is a git checkout.""" + """Return local git branch, commit hash, and commit date if the plugin is a git checkout. + + Results are cached keyed on a signature that includes HEAD + contents plus the mtime of HEAD AND the resolved ref (or + packed-refs). Repeated calls skip the four ``git`` subprocesses + when nothing has changed, and a ``git pull`` that fast-forwards + the branch correctly invalidates the cache. + """ git_dir = plugin_path / '.git' if not git_dir.exists(): return None + cache_key = str(plugin_path) + signature = self._git_cache_signature(git_dir) + + if signature is not None: + cached = self._git_info_cache.get(cache_key) + if cached is not None and cached[0] == signature: + return cached[1] + try: sha_result = subprocess.run( ['git', '-C', str(plugin_path), 'rev-parse', 'HEAD'], @@ -1623,6 +1906,8 @@ class PluginStoreManager: result['date_iso'] = commit_date_iso result['date'] = self._iso_to_date(commit_date_iso) + if signature is not None: + self._git_info_cache[cache_key] = (signature, result) return result except subprocess.CalledProcessError as err: self.logger.debug(f"Failed to read git info for {plugin_path.name}: {err}") diff --git a/test/test_store_manager_caches.py b/test/test_store_manager_caches.py new file mode 100644 index 00000000..450d9fd5 --- /dev/null +++ b/test/test_store_manager_caches.py @@ -0,0 +1,747 @@ +""" +Tests for the caching and tombstone behaviors added to PluginStoreManager +to fix the plugin-list slowness and the uninstall-resurrection bugs. + +Coverage targets: +- ``mark_recently_uninstalled`` / ``was_recently_uninstalled`` lifecycle and + TTL expiry. +- ``_get_local_git_info`` mtime-gated cache: ``git`` subprocesses only run + when ``.git/HEAD`` mtime changes. +- ``fetch_registry`` stale-cache fallback on network failure. +""" + +import os +import time +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import patch, MagicMock + +from src.plugin_system.store_manager import PluginStoreManager + + +class TestUninstallTombstone(unittest.TestCase): + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + def test_unmarked_plugin_is_not_recent(self): + self.assertFalse(self.sm.was_recently_uninstalled("foo")) + + def test_marking_makes_it_recent(self): + self.sm.mark_recently_uninstalled("foo") + self.assertTrue(self.sm.was_recently_uninstalled("foo")) + + def test_tombstone_expires_after_ttl(self): + self.sm._uninstall_tombstone_ttl = 0.05 + self.sm.mark_recently_uninstalled("foo") + self.assertTrue(self.sm.was_recently_uninstalled("foo")) + time.sleep(0.1) + self.assertFalse(self.sm.was_recently_uninstalled("foo")) + # Expired entry should also be pruned from the dict. + self.assertNotIn("foo", self.sm._uninstall_tombstones) + + +class TestGitInfoCache(unittest.TestCase): + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.plugins_dir = Path(self._tmp.name) + self.sm = PluginStoreManager(plugins_dir=str(self.plugins_dir)) + + # Minimal fake git checkout: .git/HEAD needs to exist so the cache + # key (its mtime) is stable, but we mock subprocess so no actual git + # is required. + self.plugin_path = self.plugins_dir / "plg" + (self.plugin_path / ".git").mkdir(parents=True) + (self.plugin_path / ".git" / "HEAD").write_text("ref: refs/heads/main\n") + + def _fake_subprocess_run(self, *args, **kwargs): + # Return different dummy values depending on which git subcommand + # was invoked so the code paths that parse output all succeed. + cmd = args[0] + result = MagicMock() + result.returncode = 0 + if "rev-parse" in cmd and "HEAD" in cmd and "--abbrev-ref" not in cmd: + result.stdout = "abcdef1234567890\n" + elif "--abbrev-ref" in cmd: + result.stdout = "main\n" + elif "config" in cmd: + result.stdout = "https://example.com/repo.git\n" + elif "log" in cmd: + result.stdout = "2026-04-08T12:00:00+00:00\n" + else: + result.stdout = "" + return result + + def test_cache_hits_avoid_subprocess_calls(self): + with patch( + "src.plugin_system.store_manager.subprocess.run", + side_effect=self._fake_subprocess_run, + ) as mock_run: + first = self.sm._get_local_git_info(self.plugin_path) + self.assertIsNotNone(first) + self.assertEqual(first["short_sha"], "abcdef1") + calls_after_first = mock_run.call_count + self.assertEqual(calls_after_first, 4) + + # Second call with unchanged HEAD: zero new subprocess calls. + second = self.sm._get_local_git_info(self.plugin_path) + self.assertEqual(second, first) + self.assertEqual(mock_run.call_count, calls_after_first) + + def test_cache_invalidates_on_head_mtime_change(self): + with patch( + "src.plugin_system.store_manager.subprocess.run", + side_effect=self._fake_subprocess_run, + ) as mock_run: + self.sm._get_local_git_info(self.plugin_path) + calls_after_first = mock_run.call_count + + # Bump mtime on .git/HEAD to simulate a new commit being checked out. + head = self.plugin_path / ".git" / "HEAD" + new_time = head.stat().st_mtime + 10 + os.utime(head, (new_time, new_time)) + + self.sm._get_local_git_info(self.plugin_path) + self.assertEqual(mock_run.call_count, calls_after_first + 4) + + def test_no_git_directory_returns_none(self): + non_git = self.plugins_dir / "no_git" + non_git.mkdir() + self.assertIsNone(self.sm._get_local_git_info(non_git)) + + def test_cache_invalidates_on_git_config_change(self): + """A config-only change (e.g. ``git remote set-url``) must invalidate + the cache, because the cached ``result`` dict includes ``remote_url`` + which is read from ``.git/config``. Without config in the signature, + a stale remote URL would be served indefinitely. + """ + head_file = self.plugin_path / ".git" / "HEAD" + head_file.write_text("ref: refs/heads/main\n") + refs_heads = self.plugin_path / ".git" / "refs" / "heads" + refs_heads.mkdir(parents=True, exist_ok=True) + (refs_heads / "main").write_text("a" * 40 + "\n") + config_file = self.plugin_path / ".git" / "config" + config_file.write_text( + '[remote "origin"]\n\turl = https://old.example.com/repo.git\n' + ) + + remote_url = {"current": "https://old.example.com/repo.git"} + + def fake_subprocess_run(*args, **kwargs): + cmd = args[0] + result = MagicMock() + result.returncode = 0 + if "rev-parse" in cmd and "--abbrev-ref" not in cmd: + result.stdout = "a" * 40 + "\n" + elif "--abbrev-ref" in cmd: + result.stdout = "main\n" + elif "config" in cmd: + result.stdout = remote_url["current"] + "\n" + elif "log" in cmd: + result.stdout = "2026-04-08T12:00:00+00:00\n" + else: + result.stdout = "" + return result + + with patch( + "src.plugin_system.store_manager.subprocess.run", + side_effect=fake_subprocess_run, + ): + first = self.sm._get_local_git_info(self.plugin_path) + self.assertEqual(first["remote_url"], "https://old.example.com/repo.git") + + # Simulate ``git remote set-url origin https://new.example.com/repo.git``: + # ``.git/config`` contents AND mtime change. HEAD is untouched. + time.sleep(0.01) # ensure a detectable mtime delta + config_file.write_text( + '[remote "origin"]\n\turl = https://new.example.com/repo.git\n' + ) + new_time = config_file.stat().st_mtime + 10 + os.utime(config_file, (new_time, new_time)) + remote_url["current"] = "https://new.example.com/repo.git" + + second = self.sm._get_local_git_info(self.plugin_path) + self.assertEqual( + second["remote_url"], "https://new.example.com/repo.git", + "config-only change did not invalidate the cache — " + ".git/config mtime/contents must be part of the signature", + ) + + def test_cache_invalidates_on_fast_forward_of_current_branch(self): + """Regression: .git/HEAD mtime alone is not enough. + + ``git pull`` that fast-forwards the current branch touches + ``.git/refs/heads/`` (or packed-refs) but NOT HEAD. If + we cache on HEAD mtime alone, we serve a stale SHA indefinitely. + """ + # Build a realistic loose-ref layout. + refs_heads = self.plugin_path / ".git" / "refs" / "heads" + refs_heads.mkdir(parents=True) + branch_file = refs_heads / "main" + branch_file.write_text("a" * 40 + "\n") + # Overwrite HEAD to point at refs/heads/main. + (self.plugin_path / ".git" / "HEAD").write_text("ref: refs/heads/main\n") + + call_log = [] + + def fake_subprocess_run(*args, **kwargs): + call_log.append(args[0]) + result = MagicMock() + result.returncode = 0 + cmd = args[0] + if "rev-parse" in cmd and "--abbrev-ref" not in cmd: + result.stdout = branch_file.read_text().strip() + "\n" + elif "--abbrev-ref" in cmd: + result.stdout = "main\n" + elif "config" in cmd: + result.stdout = "https://example.com/repo.git\n" + elif "log" in cmd: + result.stdout = "2026-04-08T12:00:00+00:00\n" + else: + result.stdout = "" + return result + + with patch( + "src.plugin_system.store_manager.subprocess.run", + side_effect=fake_subprocess_run, + ): + first = self.sm._get_local_git_info(self.plugin_path) + calls_after_first = len(call_log) + self.assertIsNotNone(first) + self.assertTrue(first["sha"].startswith("a")) + + # Second call: unchanged. Cache hit → no new subprocess calls. + self.sm._get_local_git_info(self.plugin_path) + self.assertEqual(len(call_log), calls_after_first, + "cache should hit on unchanged state") + + # Simulate a fast-forward: the branch ref file gets a new SHA + # and a new mtime, but .git/HEAD is untouched. + branch_file.write_text("b" * 40 + "\n") + new_time = branch_file.stat().st_mtime + 10 + os.utime(branch_file, (new_time, new_time)) + + second = self.sm._get_local_git_info(self.plugin_path) + # Cache MUST have been invalidated — we should have re-run git. + self.assertGreater( + len(call_log), calls_after_first, + "cache should have invalidated on branch ref update", + ) + self.assertTrue(second["sha"].startswith("b")) + + +class TestSearchPluginsParallel(unittest.TestCase): + """Plugin Store browse path — the per-plugin GitHub enrichment used to + run serially, turning a browse of 15 plugins into 30–45 sequential HTTP + requests on a cold cache. This batch of tests locks in the parallel + fan-out and verifies output shape/ordering haven't regressed. + """ + + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + # Fake registry with 5 plugins. + self.registry = { + "plugins": [ + {"id": f"plg{i}", "name": f"Plugin {i}", + "repo": f"https://github.com/owner/plg{i}", "category": "util"} + for i in range(5) + ] + } + self.sm.registry_cache = self.registry + self.sm.registry_cache_time = time.time() + + self._enrich_calls = [] + + def fake_repo(repo_url): + self._enrich_calls.append(("repo", repo_url)) + return {"stars": 1, "default_branch": "main", + "last_commit_iso": "2026-04-08T00:00:00Z", + "last_commit_date": "2026-04-08"} + + def fake_commit(repo_url, branch): + self._enrich_calls.append(("commit", repo_url, branch)) + return {"short_sha": "abc1234", "sha": "abc1234" + "0" * 33, + "date_iso": "2026-04-08T00:00:00Z", "date": "2026-04-08", + "message": "m", "author": "a", "branch": branch} + + def fake_manifest(repo_url, branch, manifest_path): + self._enrich_calls.append(("manifest", repo_url, branch)) + return {"description": "desc"} + + self.sm._get_github_repo_info = fake_repo + self.sm._get_latest_commit_info = fake_commit + self.sm._fetch_manifest_from_github = fake_manifest + + def test_results_preserve_registry_order(self): + results = self.sm.search_plugins(include_saved_repos=False) + self.assertEqual([p["id"] for p in results], + [f"plg{i}" for i in range(5)]) + + def test_filters_applied_before_enrichment(self): + # Filter down to a single plugin via category — ensures we don't + # waste GitHub calls enriching plugins that won't be returned. + self.registry["plugins"][2]["category"] = "special" + self.sm.registry_cache = self.registry + self._enrich_calls.clear() + results = self.sm.search_plugins(category="special", include_saved_repos=False) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]["id"], "plg2") + # Only one plugin should have been enriched. + repo_calls = [c for c in self._enrich_calls if c[0] == "repo"] + self.assertEqual(len(repo_calls), 1) + + def test_enrichment_runs_concurrently(self): + """Verify the thread pool actually runs fetches in parallel. + + Deterministic check: each stub repo fetch holds a lock while it + increments a "currently running" counter, then sleeps briefly, + then decrements. If execution is serial, the peak counter can + never exceed 1. If the thread pool is engaged, we see at least + 2 concurrent workers. + + We deliberately do NOT assert on elapsed wall time — that check + was flaky on low-power / CI boxes where scheduler noise dwarfed + the 50ms-per-worker budget. ``peak["count"] >= 2`` is the signal + we actually care about. + """ + import threading + peak_lock = threading.Lock() + peak = {"count": 0, "current": 0} + + def slow_repo(repo_url): + with peak_lock: + peak["current"] += 1 + if peak["current"] > peak["count"]: + peak["count"] = peak["current"] + # Small sleep gives other workers a chance to enter the + # critical section before we leave it. 50ms is large enough + # to dominate any scheduling jitter without slowing the test + # suite meaningfully. + time.sleep(0.05) + with peak_lock: + peak["current"] -= 1 + return {"stars": 0, "default_branch": "main", + "last_commit_iso": "", "last_commit_date": ""} + + self.sm._get_github_repo_info = slow_repo + self.sm._get_latest_commit_info = lambda *a, **k: None + self.sm._fetch_manifest_from_github = lambda *a, **k: None + + results = self.sm.search_plugins(fetch_commit_info=False, include_saved_repos=False) + + self.assertEqual(len(results), 5) + self.assertGreaterEqual( + peak["count"], 2, + "no concurrent fetches observed — thread pool not engaging", + ) + + +class TestStaleOnErrorFallbacks(unittest.TestCase): + """When GitHub is unreachable, previously-cached values should still be + returned rather than zero/None. Important on Pi's WiFi links. + """ + + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + def test_repo_info_stale_on_network_error(self): + cache_key = "owner/repo" + good = {"stars": 42, "default_branch": "main", + "last_commit_iso": "", "last_commit_date": "", + "forks": 0, "open_issues": 0, "updated_at_iso": "", + "language": "", "license": ""} + # Seed the cache with a known-good value, then force expiry. + self.sm.github_cache[cache_key] = (time.time() - 10_000, good) + self.sm.cache_timeout = 1 # force re-fetch + + import requests as real_requests + with patch("src.plugin_system.store_manager.requests.get", + side_effect=real_requests.ConnectionError("boom")): + result = self.sm._get_github_repo_info("https://github.com/owner/repo") + self.assertEqual(result["stars"], 42) + + def test_repo_info_stale_bumps_timestamp_into_backoff(self): + """Regression: after serving stale, next lookup must hit cache. + + Without the failure-backoff timestamp bump, a repeat request + would see the cache as still expired and re-hit the network, + amplifying the original failure. The fix is to update the + cached entry's timestamp so ``(now - ts) < cache_timeout`` holds + for the backoff window. + """ + cache_key = "owner/repo" + good = {"stars": 99, "default_branch": "main", + "last_commit_iso": "", "last_commit_date": "", + "forks": 0, "open_issues": 0, "updated_at_iso": "", + "language": "", "license": ""} + self.sm.github_cache[cache_key] = (time.time() - 10_000, good) + self.sm.cache_timeout = 1 + self.sm._failure_backoff_seconds = 60 + + import requests as real_requests + call_count = {"n": 0} + + def counting_get(*args, **kwargs): + call_count["n"] += 1 + raise real_requests.ConnectionError("boom") + + with patch("src.plugin_system.store_manager.requests.get", side_effect=counting_get): + first = self.sm._get_github_repo_info("https://github.com/owner/repo") + self.assertEqual(first["stars"], 99) + self.assertEqual(call_count["n"], 1) + + # Second call must hit the bumped cache and NOT make another request. + second = self.sm._get_github_repo_info("https://github.com/owner/repo") + self.assertEqual(second["stars"], 99) + self.assertEqual( + call_count["n"], 1, + "stale-cache fallback must bump the timestamp to avoid " + "re-retrying on every request during the backoff window", + ) + + def test_repo_info_stale_on_403_also_backs_off(self): + """Same backoff requirement for 403 rate-limit responses.""" + cache_key = "owner/repo" + good = {"stars": 7, "default_branch": "main", + "last_commit_iso": "", "last_commit_date": "", + "forks": 0, "open_issues": 0, "updated_at_iso": "", + "language": "", "license": ""} + self.sm.github_cache[cache_key] = (time.time() - 10_000, good) + self.sm.cache_timeout = 1 + + rate_limited = MagicMock() + rate_limited.status_code = 403 + rate_limited.text = "rate limited" + call_count = {"n": 0} + + def counting_get(*args, **kwargs): + call_count["n"] += 1 + return rate_limited + + with patch("src.plugin_system.store_manager.requests.get", side_effect=counting_get): + self.sm._get_github_repo_info("https://github.com/owner/repo") + self.assertEqual(call_count["n"], 1) + self.sm._get_github_repo_info("https://github.com/owner/repo") + self.assertEqual( + call_count["n"], 1, + "403 stale fallback must also bump the timestamp", + ) + + def test_commit_info_stale_on_network_error(self): + cache_key = "owner/repo:main" + good = {"branch": "main", "sha": "a" * 40, "short_sha": "aaaaaaa", + "date_iso": "2026-04-08T00:00:00Z", "date": "2026-04-08", + "author": "x", "message": "y"} + self.sm.commit_info_cache[cache_key] = (time.time() - 10_000, good) + self.sm.commit_cache_timeout = 1 # force re-fetch + + import requests as real_requests + with patch("src.plugin_system.store_manager.requests.get", + side_effect=real_requests.ConnectionError("boom")): + result = self.sm._get_latest_commit_info( + "https://github.com/owner/repo", branch="main" + ) + self.assertIsNotNone(result) + self.assertEqual(result["short_sha"], "aaaaaaa") + + def test_commit_info_preserves_good_cache_on_all_branches_404(self): + """Regression: all-branches-404 used to overwrite good cache with None. + + The previous implementation unconditionally wrote + ``self.commit_info_cache[cache_key] = (time.time(), None)`` after + the branch loop, which meant a single transient failure (e.g. an + odd 5xx or an ls-refs hiccup) wiped out the commit info we had + just served to the UI the previous minute. + """ + cache_key = "owner/repo:main" + good = {"branch": "main", "sha": "a" * 40, "short_sha": "aaaaaaa", + "date_iso": "2026-04-08T00:00:00Z", "date": "2026-04-08", + "author": "x", "message": "y"} + self.sm.commit_info_cache[cache_key] = (time.time() - 10_000, good) + self.sm.commit_cache_timeout = 1 + + # Each branches_to_try attempt returns a 404. No network error + # exception — just a non-200 response. This is the code path + # that used to overwrite the cache with None. + not_found = MagicMock() + not_found.status_code = 404 + not_found.text = "Not Found" + with patch("src.plugin_system.store_manager.requests.get", return_value=not_found): + result = self.sm._get_latest_commit_info( + "https://github.com/owner/repo", branch="main" + ) + + self.assertIsNotNone(result, "good cache was wiped out by transient 404s") + self.assertEqual(result["short_sha"], "aaaaaaa") + # The cache entry must still be the good value, not None. + self.assertIsNotNone(self.sm.commit_info_cache[cache_key][1]) + + +class TestInstallUpdateUninstallInvariants(unittest.TestCase): + """Regression guard: the caching and tombstone work added in this PR + must not break the install / update / uninstall code paths. + + Specifically: + - ``install_plugin`` bypasses commit/manifest caches via force_refresh, + so the 5→30 min TTL bump cannot cause users to install a stale commit. + - ``update_plugin`` does the same. + - The uninstall tombstone is only honored by the state reconciler, not + by explicit ``install_plugin`` calls — so a user can uninstall and + immediately reinstall from the store UI without the tombstone getting + in the way. + - ``was_recently_uninstalled`` is not touched by ``install_plugin``. + """ + + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + def test_get_plugin_info_with_force_refresh_forwards_to_commit_fetch(self): + """install_plugin's code path must reach the network bypass.""" + self.sm.registry_cache = { + "plugins": [{"id": "foo", "repo": "https://github.com/o/r"}] + } + self.sm.registry_cache_time = time.time() + + repo_calls = [] + commit_calls = [] + manifest_calls = [] + + def fake_repo(url): + repo_calls.append(url) + return {"default_branch": "main", "stars": 0, + "last_commit_iso": "", "last_commit_date": ""} + + def fake_commit(url, branch, force_refresh=False): + commit_calls.append((url, branch, force_refresh)) + return {"short_sha": "deadbee", "sha": "d" * 40, + "message": "m", "author": "a", "branch": branch, + "date": "2026-04-08", "date_iso": "2026-04-08T00:00:00Z"} + + def fake_manifest(url, branch, manifest_path, force_refresh=False): + manifest_calls.append((url, branch, manifest_path, force_refresh)) + return None + + self.sm._get_github_repo_info = fake_repo + self.sm._get_latest_commit_info = fake_commit + self.sm._fetch_manifest_from_github = fake_manifest + + info = self.sm.get_plugin_info("foo", fetch_latest_from_github=True, force_refresh=True) + + self.assertIsNotNone(info) + self.assertEqual(info["last_commit_sha"], "d" * 40) + # force_refresh must have propagated through to the fetch helpers. + self.assertTrue(commit_calls, "commit fetch was not called") + self.assertTrue(commit_calls[0][2], "force_refresh=True did not reach _get_latest_commit_info") + self.assertTrue(manifest_calls, "manifest fetch was not called") + self.assertTrue(manifest_calls[0][3], "force_refresh=True did not reach _fetch_manifest_from_github") + + def test_install_plugin_is_not_blocked_by_tombstone(self): + """A tombstone must only gate the reconciler, not explicit installs. + + Uses a complete, valid manifest stub and a no-op dependency + installer so ``install_plugin`` runs all the way through to a + True return. Anything less (e.g. swallowing exceptions) would + hide real regressions in the install path. + """ + import json as _json + self.sm.registry_cache = { + "plugins": [{"id": "bar", "repo": "https://github.com/o/bar", + "plugin_path": ""}] + } + self.sm.registry_cache_time = time.time() + + # Mark it recently uninstalled (simulates a user who just clicked + # uninstall and then immediately clicked install again). + self.sm.mark_recently_uninstalled("bar") + self.assertTrue(self.sm.was_recently_uninstalled("bar")) + + # Stub the heavy bits so install_plugin can run without network. + self.sm._get_github_repo_info = lambda url: { + "default_branch": "main", "stars": 0, + "last_commit_iso": "", "last_commit_date": "" + } + self.sm._get_latest_commit_info = lambda *a, **k: { + "short_sha": "abc1234", "sha": "a" * 40, "branch": "main", + "message": "m", "author": "a", + "date": "2026-04-08", "date_iso": "2026-04-08T00:00:00Z", + } + self.sm._fetch_manifest_from_github = lambda *a, **k: None + # Skip dependency install entirely (real install calls pip). + self.sm._install_dependencies = lambda *a, **k: True + + def fake_install_via_git(repo_url, plugin_path, branches): + # Write a COMPLETE valid manifest so install_plugin's + # post-download validation succeeds. Required fields come + # from install_plugin itself: id, name, class_name, display_modes. + plugin_path.mkdir(parents=True, exist_ok=True) + manifest = { + "id": "bar", + "name": "Bar Plugin", + "version": "1.0.0", + "class_name": "BarPlugin", + "entry_point": "manager.py", + "display_modes": ["bar_mode"], + } + (plugin_path / "manifest.json").write_text(_json.dumps(manifest)) + return branches[0] + + self.sm._install_via_git = fake_install_via_git + + # No exception-swallowing: if install_plugin fails for ANY reason + # unrelated to the tombstone, the test fails loudly. + result = self.sm.install_plugin("bar") + + self.assertTrue( + result, + "install_plugin returned False — the tombstone should not gate " + "explicit installs and all other stubs should allow success.", + ) + # Tombstone survives install (harmless — nothing reads it for installed plugins). + self.assertTrue(self.sm.was_recently_uninstalled("bar")) + + +class TestRegistryStaleCacheFallback(unittest.TestCase): + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + def test_network_failure_returns_stale_cache(self): + # Prime the cache with a known-good registry. + self.sm.registry_cache = {"plugins": [{"id": "cached"}]} + self.sm.registry_cache_time = time.time() - 10_000 # very old + self.sm.registry_cache_timeout = 1 # force re-fetch attempt + + import requests as real_requests + with patch.object( + self.sm, + "_http_get_with_retries", + side_effect=real_requests.RequestException("boom"), + ): + result = self.sm.fetch_registry() + + self.assertEqual(result, {"plugins": [{"id": "cached"}]}) + + def test_network_failure_with_no_cache_returns_empty(self): + self.sm.registry_cache = None + import requests as real_requests + with patch.object( + self.sm, + "_http_get_with_retries", + side_effect=real_requests.RequestException("boom"), + ): + result = self.sm.fetch_registry() + self.assertEqual(result, {"plugins": []}) + + def test_stale_fallback_bumps_timestamp_into_backoff(self): + """Regression: after the stale-cache fallback fires, the next + fetch_registry call must NOT re-hit the network. Without the + timestamp bump, a flaky connection causes every request to pay + the network timeout before falling back to stale. + """ + self.sm.registry_cache = {"plugins": [{"id": "cached"}]} + self.sm.registry_cache_time = time.time() - 10_000 # expired + self.sm.registry_cache_timeout = 1 + self.sm._failure_backoff_seconds = 60 + + import requests as real_requests + call_count = {"n": 0} + + def counting_get(*args, **kwargs): + call_count["n"] += 1 + raise real_requests.ConnectionError("boom") + + with patch.object(self.sm, "_http_get_with_retries", side_effect=counting_get): + first = self.sm.fetch_registry() + self.assertEqual(first, {"plugins": [{"id": "cached"}]}) + self.assertEqual(call_count["n"], 1) + + second = self.sm.fetch_registry() + self.assertEqual(second, {"plugins": [{"id": "cached"}]}) + self.assertEqual( + call_count["n"], 1, + "stale registry fallback must bump registry_cache_time so " + "subsequent requests hit the cache instead of re-retrying", + ) + + +class TestFetchRegistryRaiseOnFailure(unittest.TestCase): + """``fetch_registry(raise_on_failure=True)`` must propagate errors + instead of silently falling back to the stale cache / empty dict. + + Regression guard: the state reconciler relies on this to distinguish + "plugin genuinely not in registry" from "I can't reach the registry + right now". Without it, a fresh boot with flaky WiFi would poison + ``_unrecoverable_missing_on_disk`` with every config entry. + """ + + def setUp(self): + self._tmp = TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.sm = PluginStoreManager(plugins_dir=self._tmp.name) + + def test_request_exception_propagates_when_flag_set(self): + import requests as real_requests + self.sm.registry_cache = None # no stale cache + with patch.object( + self.sm, + "_http_get_with_retries", + side_effect=real_requests.RequestException("boom"), + ): + with self.assertRaises(real_requests.RequestException): + self.sm.fetch_registry(raise_on_failure=True) + + def test_request_exception_propagates_even_with_stale_cache(self): + """Explicit caller opt-in beats the stale-cache convenience.""" + import requests as real_requests + self.sm.registry_cache = {"plugins": [{"id": "stale"}]} + self.sm.registry_cache_time = time.time() - 10_000 + self.sm.registry_cache_timeout = 1 + with patch.object( + self.sm, + "_http_get_with_retries", + side_effect=real_requests.RequestException("boom"), + ): + with self.assertRaises(real_requests.RequestException): + self.sm.fetch_registry(raise_on_failure=True) + + def test_json_decode_error_propagates_when_flag_set(self): + import json as _json + self.sm.registry_cache = None + bad_response = MagicMock() + bad_response.status_code = 200 + bad_response.raise_for_status = MagicMock() + bad_response.json = MagicMock( + side_effect=_json.JSONDecodeError("bad", "", 0) + ) + with patch.object(self.sm, "_http_get_with_retries", return_value=bad_response): + with self.assertRaises(_json.JSONDecodeError): + self.sm.fetch_registry(raise_on_failure=True) + + def test_default_behavior_unchanged_by_new_parameter(self): + """UI callers that don't pass the flag still get stale-cache fallback.""" + import requests as real_requests + self.sm.registry_cache = {"plugins": [{"id": "cached"}]} + self.sm.registry_cache_time = time.time() - 10_000 + self.sm.registry_cache_timeout = 1 + with patch.object( + self.sm, + "_http_get_with_retries", + side_effect=real_requests.RequestException("boom"), + ): + result = self.sm.fetch_registry() # default raise_on_failure=False + self.assertEqual(result, {"plugins": [{"id": "cached"}]}) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_uninstall_and_reconcile_endpoint.py b/test/test_uninstall_and_reconcile_endpoint.py new file mode 100644 index 00000000..34d2021b --- /dev/null +++ b/test/test_uninstall_and_reconcile_endpoint.py @@ -0,0 +1,395 @@ +"""Regression tests for the transactional uninstall helper and the +``/plugins/state/reconcile`` endpoint's payload handling. + +Bug 1: the original uninstall flow caught +``cleanup_plugin_config`` exceptions and only logged a warning before +proceeding to file deletion. A failure there would leave the plugin +files on disk with no config entry (orphan). The fix is a +``_do_transactional_uninstall`` helper that (a) aborts before touching +the filesystem if cleanup fails, and (b) restores the config+secrets +snapshot if file removal fails after cleanup succeeded. + +Bug 2: the reconcile endpoint did ``payload.get('force', False)`` after +``request.get_json(silent=True) or {}``, which raises AttributeError if +the client sent a non-object JSON body (e.g. a bare string or array). +Additionally, ``bool("false")`` is ``True``, so string-encoded booleans +were mis-handled. The fix is an ``isinstance(payload, dict)`` guard plus +routing the value through ``_coerce_to_bool``. +""" + +import json +import sys +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +from flask import Flask + + +_API_V3_MOCKED_ATTRS = ( + 'config_manager', 'plugin_manager', 'plugin_store_manager', + 'plugin_state_manager', 'saved_repositories_manager', 'schema_manager', + 'operation_queue', 'operation_history', 'cache_manager', +) + + +def _make_client(): + """Minimal Flask app + mocked deps that exercises the api_v3 blueprint. + + Returns ``(client, module, cleanup_fn)``. Callers (test ``setUp`` + methods) must register ``cleanup_fn`` with ``self.addCleanup(...)`` + so the original api_v3 singleton attributes are restored at the end + of the test — otherwise the MagicMocks leak into later tests that + import api_v3 expecting fresh state. + """ + from web_interface.blueprints import api_v3 as api_v3_module + from web_interface.blueprints.api_v3 import api_v3 + + # Snapshot the originals so we can restore them. + _SENTINEL = object() + originals = { + name: getattr(api_v3, name, _SENTINEL) for name in _API_V3_MOCKED_ATTRS + } + + # Mocks for all the bits the reconcile / uninstall endpoints touch. + api_v3.config_manager = MagicMock() + api_v3.config_manager.get_raw_file_content.return_value = {} + api_v3.config_manager.secrets_path = "/tmp/nonexistent_secrets.json" + api_v3.plugin_manager = MagicMock() + api_v3.plugin_manager.plugins = {} + api_v3.plugin_manager.plugins_dir = "/tmp" + api_v3.plugin_store_manager = MagicMock() + api_v3.plugin_state_manager = MagicMock() + api_v3.plugin_state_manager.get_all_states.return_value = {} + api_v3.saved_repositories_manager = MagicMock() + api_v3.schema_manager = MagicMock() + api_v3.operation_queue = None # force the direct (non-queue) path + api_v3.operation_history = MagicMock() + api_v3.cache_manager = MagicMock() + + def _cleanup(): + for name, original in originals.items(): + if original is _SENTINEL: + # Attribute didn't exist before — remove it to match. + if hasattr(api_v3, name): + try: + delattr(api_v3, name) + except AttributeError: + pass + else: + setattr(api_v3, name, original) + + app = Flask(__name__) + app.config['TESTING'] = True + app.config['SECRET_KEY'] = 'test' + app.register_blueprint(api_v3, url_prefix='/api/v3') + return app.test_client(), api_v3_module, _cleanup + + +class TestTransactionalUninstall(unittest.TestCase): + """Exercises ``_do_transactional_uninstall`` directly. + + Using the direct (non-queue) code path via the Flask client gives us + the full uninstall endpoint behavior end-to-end, including the + rollback on mid-flight failures. + """ + + def setUp(self): + self.client, self.mod, _cleanup = _make_client() + self.addCleanup(_cleanup) + self.api_v3 = self.mod.api_v3 + + def test_cleanup_failure_aborts_before_file_removal(self): + """If cleanup_plugin_config raises, uninstall_plugin must NOT run.""" + self.api_v3.config_manager.cleanup_plugin_config.side_effect = RuntimeError("disk full") + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 500) + # File removal must NOT have been attempted — otherwise we'd have + # deleted the plugin after failing to clean its config, leaving + # the reconciler to potentially resurrect it later. + self.api_v3.plugin_store_manager.uninstall_plugin.assert_not_called() + + def test_file_removal_failure_restores_snapshot(self): + """If uninstall_plugin returns False after cleanup, snapshot must be restored.""" + # Start with the plugin in main config and in secrets. + stored_main = {'thing': {'enabled': True, 'custom': 'stuff'}} + stored_secrets = {'thing': {'api_key': 'secret'}} + + # get_raw_file_content is called twice during snapshot (main + + # secrets) and then again during restore. We track writes through + # save_raw_file_content so we can assert the restore happened. + def raw_get(file_type): + if file_type == 'main': + return dict(stored_main) + if file_type == 'secrets': + return dict(stored_secrets) + return {} + + self.api_v3.config_manager.get_raw_file_content.side_effect = raw_get + self.api_v3.config_manager.secrets_path = __file__ # any existing file + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_store_manager.uninstall_plugin.return_value = False + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 500) + # After the file removal returned False, the helper must have + # written the snapshot back. Inspect save_raw_file_content calls. + calls = self.api_v3.config_manager.save_raw_file_content.call_args_list + file_types_written = [c.args[0] for c in calls] + self.assertIn('main', file_types_written, + f"main config was not restored after uninstall failure; calls={calls}") + # Find the main restore call and confirm our snapshot entry is present. + for c in calls: + if c.args[0] == 'main': + written = c.args[1] + self.assertIn('thing', written, + "main config was written without the restored snapshot entry") + self.assertEqual(written['thing'], stored_main['thing']) + break + + def test_file_removal_raising_also_restores_snapshot(self): + """Same restore path, but triggered by an exception instead of False.""" + stored_main = {'thing': {'enabled': False}} + + def raw_get(file_type): + if file_type == 'main': + return dict(stored_main) + return {} + + self.api_v3.config_manager.get_raw_file_content.side_effect = raw_get + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_store_manager.uninstall_plugin.side_effect = OSError("rm failed") + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 500) + calls = self.api_v3.config_manager.save_raw_file_content.call_args_list + self.assertTrue( + any(c.args[0] == 'main' for c in calls), + "main config was not restored after uninstall raised", + ) + + def test_happy_path_succeeds(self): + """Sanity: the transactional rework did not break the happy path.""" + self.api_v3.config_manager.get_raw_file_content.return_value = {} + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_store_manager.uninstall_plugin.return_value = True + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 200) + self.api_v3.plugin_store_manager.uninstall_plugin.assert_called_once_with('thing') + + def test_file_removal_failure_reloads_previously_loaded_plugin(self): + """Regression: rollback must restore BOTH config AND runtime state. + + If the plugin was loaded at runtime before the uninstall + request, and file removal fails after unload has already + succeeded, the rollback must call ``load_plugin`` so the user + doesn't end up in a state where the files exist and the config + exists but the plugin is no longer loaded. + """ + # Plugin is currently loaded. + self.api_v3.plugin_manager.plugins = {'thing': MagicMock()} + self.api_v3.config_manager.get_raw_file_content.return_value = { + 'thing': {'enabled': True} + } + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_manager.unload_plugin.return_value = None + self.api_v3.plugin_store_manager.uninstall_plugin.return_value = False + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 500) + # Unload did happen (it's part of the uninstall sequence)... + self.api_v3.plugin_manager.unload_plugin.assert_called_once_with('thing') + # ...and because file removal failed, the rollback must have + # called load_plugin to restore runtime state. + self.api_v3.plugin_manager.load_plugin.assert_called_once_with('thing') + + def test_snapshot_survives_config_read_error(self): + """Regression: if get_raw_file_content raises an expected error + (OSError / ConfigError) during snapshot, the uninstall should + still proceed — we just won't have a rollback snapshot. Narrow + exception list must still cover the realistic failure modes. + """ + from src.exceptions import ConfigError + self.api_v3.config_manager.get_raw_file_content.side_effect = ConfigError( + "file missing", config_path="/tmp/missing" + ) + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_store_manager.uninstall_plugin.return_value = True + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + # Uninstall should still succeed — snapshot failure is logged + # but doesn't block the uninstall. + self.assertEqual(response.status_code, 200) + self.api_v3.plugin_store_manager.uninstall_plugin.assert_called_once_with('thing') + + def test_snapshot_does_not_swallow_programmer_errors(self): + """Regression: unexpected exceptions (TypeError, AttributeError) + must propagate out of the snapshot helper so bugs surface + during development instead of being silently logged and + ignored. Narrowing from ``except Exception`` to + ``(OSError, ConfigError)`` is what makes this work. + """ + # Raise an exception that is NOT in the narrow catch list. + self.api_v3.config_manager.get_raw_file_content.side_effect = TypeError( + "unexpected kwarg" + ) + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + # The TypeError should propagate up to the endpoint's outer + # try/except and produce a 500, NOT be silently swallowed like + # the previous ``except Exception`` did. + self.assertEqual(response.status_code, 500) + # uninstall_plugin must NOT have been called — the snapshot + # exception bubbled up before we got that far. + self.api_v3.plugin_store_manager.uninstall_plugin.assert_not_called() + + def test_unload_failure_restores_config_and_does_not_call_uninstall(self): + """If unload_plugin itself raises, config must be restored and + uninstall_plugin must NOT be called.""" + self.api_v3.plugin_manager.plugins = {'thing': MagicMock()} + self.api_v3.config_manager.get_raw_file_content.return_value = { + 'thing': {'enabled': True} + } + self.api_v3.config_manager.cleanup_plugin_config.return_value = None + self.api_v3.plugin_manager.unload_plugin.side_effect = RuntimeError("unload boom") + + response = self.client.post( + '/api/v3/plugins/uninstall', + data=json.dumps({'plugin_id': 'thing'}), + content_type='application/json', + ) + + self.assertEqual(response.status_code, 500) + self.api_v3.plugin_store_manager.uninstall_plugin.assert_not_called() + # Config should have been restored. + calls = self.api_v3.config_manager.save_raw_file_content.call_args_list + self.assertTrue( + any(c.args[0] == 'main' for c in calls), + "main config was not restored after unload_plugin raised", + ) + # load_plugin must NOT have been called — unload didn't succeed, + # so runtime state is still what it was. + self.api_v3.plugin_manager.load_plugin.assert_not_called() + + +class TestReconcileEndpointPayload(unittest.TestCase): + """``/plugins/state/reconcile`` must handle weird JSON payloads without + crashing, and must accept string booleans for ``force``. + """ + + def setUp(self): + self.client, self.mod, _cleanup = _make_client() + self.addCleanup(_cleanup) + self.api_v3 = self.mod.api_v3 + # Stub the reconciler so we only test the payload plumbing, not + # the full reconciliation. We patch StateReconciliation at the + # module level where the endpoint imports it lazily. + self._reconciler_instance = MagicMock() + self._reconciler_instance.reconcile_state.return_value = MagicMock( + inconsistencies_found=[], + inconsistencies_fixed=[], + inconsistencies_manual=[], + message="ok", + ) + # Patch the StateReconciliation class where it's imported inside + # the reconcile endpoint. + self._patcher = patch( + 'src.plugin_system.state_reconciliation.StateReconciliation', + return_value=self._reconciler_instance, + ) + self._patcher.start() + self.addCleanup(self._patcher.stop) + + def _post(self, body, content_type='application/json'): + return self.client.post( + '/api/v3/plugins/state/reconcile', + data=body, + content_type=content_type, + ) + + def test_non_object_json_body_does_not_crash(self): + """A bare string JSON body must not raise AttributeError.""" + response = self._post('"just a string"') + self.assertEqual(response.status_code, 200) + # force must default to False. + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_array_json_body_does_not_crash(self): + response = self._post('[1, 2, 3]') + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_null_json_body_does_not_crash(self): + response = self._post('null') + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_missing_force_key_defaults_to_false(self): + response = self._post('{}') + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_force_true_boolean(self): + response = self._post(json.dumps({'force': True})) + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=True) + + def test_force_false_boolean(self): + response = self._post(json.dumps({'force': False})) + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_force_string_false_coerced_correctly(self): + """``bool("false")`` is ``True`` — _coerce_to_bool must fix that.""" + response = self._post(json.dumps({'force': 'false'})) + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=False) + + def test_force_string_true_coerced_correctly(self): + response = self._post(json.dumps({'force': 'true'})) + self.assertEqual(response.status_code, 200) + self._reconciler_instance.reconcile_state.assert_called_once_with(force=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/web_interface/test_state_reconciliation.py b/test/web_interface/test_state_reconciliation.py index 538c252f..1fa0de22 100644 --- a/test/web_interface/test_state_reconciliation.py +++ b/test/web_interface/test_state_reconciliation.py @@ -342,6 +342,167 @@ class TestStateReconciliation(unittest.TestCase): self.assertEqual(state, {}) +class TestStateReconciliationUnrecoverable(unittest.TestCase): + """Tests for the unrecoverable-plugin cache and force reconcile. + + Regression coverage for the infinite reinstall loop where a config + entry referenced a plugin not present in the registry (e.g. legacy + 'github' / 'youtube' entries). The reconciler used to retry the + install on every HTTP request; it now caches the failure for the + process lifetime and only retries on an explicit ``force=True`` + reconcile call. + """ + + def setUp(self): + self.temp_dir = Path(tempfile.mkdtemp()) + self.plugins_dir = self.temp_dir / "plugins" + self.plugins_dir.mkdir() + + self.state_manager = Mock(spec=PluginStateManager) + self.state_manager.get_all_states.return_value = {} + self.config_manager = Mock() + self.config_manager.load_config.return_value = { + "ghost": {"enabled": True} + } + self.plugin_manager = Mock() + self.plugin_manager.plugin_manifests = {} + self.plugin_manager.plugins = {} + + # Store manager with an empty registry — install_plugin always fails + self.store_manager = Mock() + self.store_manager.fetch_registry.return_value = {"plugins": []} + self.store_manager.install_plugin.return_value = False + self.store_manager.was_recently_uninstalled.return_value = False + + self.reconciler = StateReconciliation( + state_manager=self.state_manager, + config_manager=self.config_manager, + plugin_manager=self.plugin_manager, + plugins_dir=self.plugins_dir, + store_manager=self.store_manager, + ) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_not_in_registry_marks_unrecoverable_without_install(self): + """If the plugin isn't in the registry at all, skip install_plugin.""" + result = self.reconciler.reconcile_state() + + # One inconsistency, unfixable, no install attempt made. + self.assertEqual(len(result.inconsistencies_found), 1) + self.assertEqual(len(result.inconsistencies_fixed), 0) + self.store_manager.install_plugin.assert_not_called() + self.assertIn("ghost", self.reconciler._unrecoverable_missing_on_disk) + + def test_subsequent_reconcile_does_not_retry(self): + """Second reconcile pass must not touch install_plugin or fetch_registry again.""" + self.reconciler.reconcile_state() + self.store_manager.fetch_registry.reset_mock() + self.store_manager.install_plugin.reset_mock() + + result = self.reconciler.reconcile_state() + + # Still one inconsistency, still no install attempt, no new registry fetch + self.assertEqual(len(result.inconsistencies_found), 1) + inc = result.inconsistencies_found[0] + self.assertFalse(inc.can_auto_fix) + self.assertEqual(inc.fix_action, FixAction.MANUAL_FIX_REQUIRED) + self.store_manager.install_plugin.assert_not_called() + self.store_manager.fetch_registry.assert_not_called() + + def test_force_reconcile_clears_unrecoverable_cache(self): + """force=True must re-attempt previously-failed plugins.""" + self.reconciler.reconcile_state() + self.assertIn("ghost", self.reconciler._unrecoverable_missing_on_disk) + + # Now pretend the registry gained the plugin so the pre-check passes + # and install_plugin is actually invoked. + self.store_manager.fetch_registry.return_value = { + "plugins": [{"id": "ghost"}] + } + self.store_manager.install_plugin.return_value = True + self.store_manager.install_plugin.reset_mock() + + # Config still references ghost; disk still missing it — the + # reconciler should re-attempt install now that force=True cleared + # the cache. Use assert_called_once_with so a future regression + # that accidentally triggers a second install attempt on force=True + # is caught. + result = self.reconciler.reconcile_state(force=True) + + self.store_manager.install_plugin.assert_called_once_with("ghost") + + def test_registry_unreachable_does_not_mark_unrecoverable(self): + """Transient registry failures should not poison the cache.""" + self.store_manager.fetch_registry.side_effect = Exception("network down") + + result = self.reconciler.reconcile_state() + + self.assertEqual(len(result.inconsistencies_found), 1) + self.assertNotIn("ghost", self.reconciler._unrecoverable_missing_on_disk) + self.store_manager.install_plugin.assert_not_called() + + def test_recently_uninstalled_skips_auto_repair(self): + """A freshly-uninstalled plugin must not be resurrected by the reconciler.""" + self.store_manager.was_recently_uninstalled.return_value = True + self.store_manager.fetch_registry.return_value = { + "plugins": [{"id": "ghost"}] + } + + result = self.reconciler.reconcile_state() + + self.assertEqual(len(result.inconsistencies_found), 1) + inc = result.inconsistencies_found[0] + self.assertFalse(inc.can_auto_fix) + self.assertEqual(inc.fix_action, FixAction.MANUAL_FIX_REQUIRED) + self.store_manager.install_plugin.assert_not_called() + + def test_real_store_manager_empty_registry_on_network_failure(self): + """Regression: using the REAL PluginStoreManager (not a Mock), verify + the reconciler does NOT poison the unrecoverable cache when + ``fetch_registry`` fails with no stale cache available. + + Previously, the default stale-cache fallback in ``fetch_registry`` + silently returned ``{"plugins": []}`` on network failure with no + cache. The reconciler's ``_auto_repair_missing_plugin`` saw "no + candidates in registry" and marked everything unrecoverable — a + regression that would bite every user doing a fresh boot on flaky + WiFi. The fix is ``fetch_registry(raise_on_failure=True)`` in + ``_auto_repair_missing_plugin`` so the reconciler can tell a real + registry miss from a network error. + """ + from src.plugin_system.store_manager import PluginStoreManager + import requests as real_requests + + real_store = PluginStoreManager(plugins_dir=str(self.plugins_dir)) + real_store.registry_cache = None # fresh boot, no cache + real_store.registry_cache_time = None + + # Stub the underlying HTTP so no real network call is made but the + # real fetch_registry code path runs. + real_store._http_get_with_retries = Mock( + side_effect=real_requests.ConnectionError("wifi down") + ) + + reconciler = StateReconciliation( + state_manager=self.state_manager, + config_manager=self.config_manager, + plugin_manager=self.plugin_manager, + plugins_dir=self.plugins_dir, + store_manager=real_store, + ) + + result = reconciler.reconcile_state() + + # One inconsistency (ghost is in config, not on disk), but + # because the registry lookup failed transiently, we must NOT + # have marked it unrecoverable — a later reconcile (after the + # network comes back) can still auto-repair. + self.assertEqual(len(result.inconsistencies_found), 1) + self.assertNotIn("ghost", reconciler._unrecoverable_missing_on_disk) + + if __name__ == '__main__': unittest.main() diff --git a/web_interface/app.py b/web_interface/app.py index 66684f86..42568ce0 100644 --- a/web_interface/app.py +++ b/web_interface/app.py @@ -667,8 +667,20 @@ import threading as _threading _reconciliation_lock = _threading.Lock() def _run_startup_reconciliation() -> None: - """Run state reconciliation in background to auto-repair missing plugins.""" - global _reconciliation_done, _reconciliation_started + """Run state reconciliation in background to auto-repair missing plugins. + + Reconciliation runs exactly once per process lifetime, regardless of + whether every inconsistency could be auto-fixed. Previously, a failed + auto-repair (e.g. a config entry referencing a plugin that no longer + exists in the registry) would reset ``_reconciliation_started`` to False, + causing the ``@app.before_request`` hook to re-trigger reconciliation on + every single HTTP request — an infinite install-retry loop that pegged + the CPU and flooded the log. Unresolved issues are now left in place for + the user to address via the UI; the reconciler itself also caches + per-plugin unrecoverable failures internally so repeated reconcile calls + stay cheap. + """ + global _reconciliation_done from src.logging_config import get_logger _logger = get_logger('reconciliation') @@ -684,18 +696,22 @@ def _run_startup_reconciliation() -> None: result = reconciler.reconcile_state() if result.inconsistencies_found: _logger.info("[Reconciliation] %s", result.message) - if result.reconciliation_successful: - if result.inconsistencies_fixed: - plugin_manager.discover_plugins() - _reconciliation_done = True - else: - _logger.warning("[Reconciliation] Finished with unresolved issues, will retry") - with _reconciliation_lock: - _reconciliation_started = False + if result.inconsistencies_fixed: + plugin_manager.discover_plugins() + if not result.reconciliation_successful: + _logger.warning( + "[Reconciliation] Finished with %d unresolved issue(s); " + "will not retry automatically. Use the Plugin Store or the " + "manual 'Reconcile' action to resolve.", + len(result.inconsistencies_manual), + ) except Exception as e: _logger.error("[Reconciliation] Error: %s", e, exc_info=True) - with _reconciliation_lock: - _reconciliation_started = False + finally: + # Always mark done — we do not want an unhandled exception (or an + # unresolved inconsistency) to cause the @before_request hook to + # retrigger reconciliation on every subsequent request. + _reconciliation_done = True # Initialize health monitor and run reconciliation on first request @app.before_request diff --git a/web_interface/blueprints/api_v3.py b/web_interface/blueprints/api_v3.py index 7fe9c7b3..9a752076 100644 --- a/web_interface/blueprints/api_v3.py +++ b/web_interface/blueprints/api_v3.py @@ -1714,9 +1714,23 @@ def get_installed_plugins(): import json from pathlib import Path - # Re-discover plugins to ensure we have the latest list - # This handles cases where plugins are added/removed after app startup - api_v3.plugin_manager.discover_plugins() + # Re-discover plugins only if the plugins directory has actually + # changed since our last scan, or if the caller explicitly asked + # for a refresh. The previous unconditional ``discover_plugins()`` + # call (plus a per-plugin manifest re-read) made this endpoint + # O(plugins) in disk I/O on every page refresh, which on an SD-card + # Pi4 with ~15 plugins was pegging the CPU and blocking the UI + # "connecting to display" spinner for minutes. + force_refresh = request.args.get('refresh', '').lower() in ('1', 'true', 'yes') + plugins_dir_path = Path(api_v3.plugin_manager.plugins_dir) + try: + current_mtime = plugins_dir_path.stat().st_mtime if plugins_dir_path.exists() else 0 + except OSError: + current_mtime = 0 + last_mtime = getattr(api_v3, '_installed_plugins_dir_mtime', None) + if force_refresh or last_mtime != current_mtime: + api_v3.plugin_manager.discover_plugins() + api_v3._installed_plugins_dir_mtime = current_mtime # Get all installed plugin info from the plugin manager all_plugin_info = api_v3.plugin_manager.get_all_plugin_info() @@ -1729,17 +1743,10 @@ def get_installed_plugins(): for plugin_info in all_plugin_info: plugin_id = plugin_info.get('id') - # Re-read manifest from disk to ensure we have the latest metadata - manifest_path = Path(api_v3.plugin_manager.plugins_dir) / plugin_id / "manifest.json" - if manifest_path.exists(): - try: - with open(manifest_path, 'r', encoding='utf-8') as f: - fresh_manifest = json.load(f) - # Update plugin_info with fresh manifest data - plugin_info.update(fresh_manifest) - except Exception as e: - # If we can't read the fresh manifest, use the cached one - logger.warning("[PluginStore] Could not read fresh manifest for %s: %s", plugin_id, e) + # Note: we intentionally do NOT re-read manifest.json here. + # discover_plugins() above already reparses manifests on change; + # re-reading on every request added ~1 syscall+json.loads per + # plugin per request for no benefit. # Get enabled status from config (source of truth) # Read from config file first, fall back to plugin instance if config doesn't have the key @@ -2369,14 +2376,30 @@ def reconcile_plugin_state(): from src.plugin_system.state_reconciliation import StateReconciliation + # Pass the store manager so auto-repair of missing-on-disk plugins + # can actually run. Previously this endpoint silently degraded to + # MANUAL_FIX_REQUIRED because store_manager was omitted. reconciler = StateReconciliation( state_manager=api_v3.plugin_state_manager, config_manager=api_v3.config_manager, plugin_manager=api_v3.plugin_manager, - plugins_dir=Path(api_v3.plugin_manager.plugins_dir) + plugins_dir=Path(api_v3.plugin_manager.plugins_dir), + store_manager=api_v3.plugin_store_manager, ) - result = reconciler.reconcile_state() + # Allow the caller to force a retry of previously-unrecoverable + # plugins (e.g. after the registry has been updated or a typo fixed). + # Non-object JSON bodies (e.g. a bare string or array) must fall + # through to the default False instead of raising AttributeError, + # and string booleans like "false" must coerce correctly — hence + # the isinstance guard plus _coerce_to_bool. + force = False + if request.is_json: + payload = request.get_json(silent=True) + if isinstance(payload, dict): + force = _coerce_to_bool(payload.get('force', False)) + + result = reconciler.reconcile_state(force=force) return success_response( data={ @@ -2799,6 +2822,181 @@ def update_plugin(): status_code=500 ) +def _snapshot_plugin_config(plugin_id: str): + """Capture the plugin's current config and secrets entries for rollback. + + Returns a tuple ``(main_entry, secrets_entry)`` where each element is + the plugin's dict from the respective file, or ``None`` if the plugin + was not present there. Used by the transactional uninstall path so we + can restore state if file removal fails after config cleanup has + already succeeded. + """ + main_entry = None + secrets_entry = None + # Narrow exception list: filesystem errors (FileNotFoundError is a + # subclass of OSError, IOError is an alias for OSError in Python 3) + # and ConfigError, which is what ``get_raw_file_content`` wraps all + # load failures in. Programmer errors (TypeError, AttributeError, + # etc.) are intentionally NOT caught — they should surface loudly. + try: + main_config = api_v3.config_manager.get_raw_file_content('main') + if plugin_id in main_config: + import copy as _copy + main_entry = _copy.deepcopy(main_config[plugin_id]) + except (OSError, ConfigError) as e: + logger.warning("[PluginUninstall] Could not snapshot main config for %s: %s", plugin_id, e) + try: + import os as _os + if _os.path.exists(api_v3.config_manager.secrets_path): + secrets_config = api_v3.config_manager.get_raw_file_content('secrets') + if plugin_id in secrets_config: + import copy as _copy + secrets_entry = _copy.deepcopy(secrets_config[plugin_id]) + except (OSError, ConfigError) as e: + logger.warning("[PluginUninstall] Could not snapshot secrets for %s: %s", plugin_id, e) + return (main_entry, secrets_entry) + + +def _restore_plugin_config(plugin_id: str, snapshot) -> None: + """Best-effort restoration of a snapshot taken by ``_snapshot_plugin_config``. + + Called on the unhappy path when ``cleanup_plugin_config`` already + succeeded but the subsequent file removal failed. If the restore + itself fails, we log loudly — the caller still sees the original + uninstall error and the user can reconcile manually. + """ + main_entry, secrets_entry = snapshot + if main_entry is not None: + try: + main_config = api_v3.config_manager.get_raw_file_content('main') + main_config[plugin_id] = main_entry + api_v3.config_manager.save_raw_file_content('main', main_config) + logger.warning("[PluginUninstall] Restored main config entry for %s after uninstall failure", plugin_id) + except Exception as e: + logger.error( + "[PluginUninstall] FAILED to restore main config entry for %s after uninstall failure: %s", + plugin_id, e, exc_info=True, + ) + if secrets_entry is not None: + try: + import os as _os + if _os.path.exists(api_v3.config_manager.secrets_path): + secrets_config = api_v3.config_manager.get_raw_file_content('secrets') + else: + secrets_config = {} + secrets_config[plugin_id] = secrets_entry + api_v3.config_manager.save_raw_file_content('secrets', secrets_config) + logger.warning("[PluginUninstall] Restored secrets entry for %s after uninstall failure", plugin_id) + except Exception as e: + logger.error( + "[PluginUninstall] FAILED to restore secrets entry for %s after uninstall failure: %s", + plugin_id, e, exc_info=True, + ) + + +def _do_transactional_uninstall(plugin_id: str, preserve_config: bool) -> None: + """Run the full uninstall as a best-effort transaction. + + Order: + 1. Mark tombstone (so any reconciler racing with us cannot resurrect + the plugin mid-flight). + 2. Snapshot existing config + secrets entries (for rollback). + 3. Run ``cleanup_plugin_config``. If this raises, re-raise — files + have NOT been touched, so aborting here leaves a fully consistent + state: plugin is still installed and still in config. + 4. Unload the plugin from the running plugin manager. + 5. Call ``store_manager.uninstall_plugin``. If it returns False or + raises, RESTORE the snapshot (so config matches disk) and then + propagate the failure. + 6. Invalidate schema cache and remove from the state manager only + after the file removal succeeds. + + Raises on any failure so the caller can return an error to the user. + """ + if hasattr(api_v3.plugin_store_manager, 'mark_recently_uninstalled'): + api_v3.plugin_store_manager.mark_recently_uninstalled(plugin_id) + + snapshot = _snapshot_plugin_config(plugin_id) if not preserve_config else (None, None) + + # Step 1: config cleanup. If this fails, bail out early — the plugin + # files on disk are still intact and the caller will get a clear + # error. + if not preserve_config: + try: + api_v3.config_manager.cleanup_plugin_config(plugin_id, remove_secrets=True) + except Exception as cleanup_err: + logger.error( + "[PluginUninstall] Config cleanup failed for %s; aborting uninstall (files untouched): %s", + plugin_id, cleanup_err, exc_info=True, + ) + raise + + # Remember whether the plugin was loaded *before* we touched runtime + # state — we need this so we can reload it on rollback if file + # removal fails after we've already unloaded it. + was_loaded = bool( + api_v3.plugin_manager and plugin_id in api_v3.plugin_manager.plugins + ) + + def _rollback(reason_err): + """Undo both the config cleanup AND the unload.""" + if not preserve_config: + _restore_plugin_config(plugin_id, snapshot) + if was_loaded and api_v3.plugin_manager: + try: + api_v3.plugin_manager.load_plugin(plugin_id) + except Exception as reload_err: + logger.error( + "[PluginUninstall] FAILED to reload %s after uninstall rollback: %s", + plugin_id, reload_err, exc_info=True, + ) + + # Step 2: unload if loaded. Also part of the rollback boundary — if + # unload itself raises, restore config and surface the error. + if was_loaded: + try: + api_v3.plugin_manager.unload_plugin(plugin_id) + except Exception as unload_err: + logger.error( + "[PluginUninstall] unload_plugin raised for %s; restoring config snapshot: %s", + plugin_id, unload_err, exc_info=True, + ) + if not preserve_config: + _restore_plugin_config(plugin_id, snapshot) + # Plugin was never successfully unloaded, so no reload is + # needed here — runtime state is still what it was before. + raise + + # Step 3: remove files. If this fails, roll back the config cleanup + # AND reload the plugin so the user doesn't end up with an orphaned + # install (files on disk + no config entry + plugin no longer + # loaded at runtime). + try: + success = api_v3.plugin_store_manager.uninstall_plugin(plugin_id) + except Exception as uninstall_err: + logger.error( + "[PluginUninstall] uninstall_plugin raised for %s; rolling back: %s", + plugin_id, uninstall_err, exc_info=True, + ) + _rollback(uninstall_err) + raise + + if not success: + logger.error( + "[PluginUninstall] uninstall_plugin returned False for %s; rolling back", + plugin_id, + ) + _rollback(None) + raise RuntimeError(f"Failed to uninstall plugin {plugin_id}") + + # Past this point the filesystem and config are both in the + # "uninstalled" state. Clean up the cheap in-memory bookkeeping. + if api_v3.schema_manager: + api_v3.schema_manager.invalidate_cache(plugin_id) + if api_v3.plugin_state_manager: + api_v3.plugin_state_manager.remove_plugin_state(plugin_id) + + @api_v3.route('/plugins/uninstall', methods=['POST']) def uninstall_plugin(): """Uninstall plugin""" @@ -2821,49 +3019,28 @@ def uninstall_plugin(): # Use operation queue if available if api_v3.operation_queue: def uninstall_callback(operation): - """Callback to execute plugin uninstallation.""" - # Unload the plugin first if it's loaded - if api_v3.plugin_manager and plugin_id in api_v3.plugin_manager.plugins: - api_v3.plugin_manager.unload_plugin(plugin_id) - - # Uninstall the plugin - success = api_v3.plugin_store_manager.uninstall_plugin(plugin_id) - - if not success: - error_msg = f'Failed to uninstall plugin {plugin_id}' + """Callback to execute plugin uninstallation transactionally.""" + try: + _do_transactional_uninstall(plugin_id, preserve_config) + except Exception as err: + error_msg = f'Failed to uninstall plugin {plugin_id}: {err}' if api_v3.operation_history: api_v3.operation_history.record_operation( "uninstall", plugin_id=plugin_id, status="failed", - error=error_msg + error=error_msg, ) - raise Exception(error_msg) + # Re-raise so the operation_queue marks this op as failed. + raise - # Invalidate schema cache - if api_v3.schema_manager: - api_v3.schema_manager.invalidate_cache(plugin_id) - - # Clean up plugin configuration if not preserving - if not preserve_config: - try: - api_v3.config_manager.cleanup_plugin_config(plugin_id, remove_secrets=True) - except Exception as cleanup_err: - logger.warning("[PluginUninstall] Failed to cleanup config for %s: %s", plugin_id, cleanup_err) - - # Remove from state manager - if api_v3.plugin_state_manager: - api_v3.plugin_state_manager.remove_plugin_state(plugin_id) - - # Record in history if api_v3.operation_history: api_v3.operation_history.record_operation( "uninstall", plugin_id=plugin_id, status="success", - details={"preserve_config": preserve_config} + details={"preserve_config": preserve_config}, ) - return {'success': True, 'message': f'Plugin {plugin_id} uninstalled successfully'} # Enqueue operation @@ -2878,55 +3055,32 @@ def uninstall_plugin(): message=f'Plugin {plugin_id} uninstallation queued' ) else: - # Fallback to direct uninstall - # Unload the plugin first if it's loaded - if api_v3.plugin_manager and plugin_id in api_v3.plugin_manager.plugins: - api_v3.plugin_manager.unload_plugin(plugin_id) - - # Uninstall the plugin - success = api_v3.plugin_store_manager.uninstall_plugin(plugin_id) - - if success: - # Invalidate schema cache - if api_v3.schema_manager: - api_v3.schema_manager.invalidate_cache(plugin_id) - - # Clean up plugin configuration if not preserving - if not preserve_config: - try: - api_v3.config_manager.cleanup_plugin_config(plugin_id, remove_secrets=True) - except Exception as cleanup_err: - logger.warning("[PluginUninstall] Failed to cleanup config for %s: %s", plugin_id, cleanup_err) - - # Remove from state manager - if api_v3.plugin_state_manager: - api_v3.plugin_state_manager.remove_plugin_state(plugin_id) - - # Record in history - if api_v3.operation_history: - api_v3.operation_history.record_operation( - "uninstall", - plugin_id=plugin_id, - status="success", - details={"preserve_config": preserve_config} - ) - - return success_response(message=f'Plugin {plugin_id} uninstalled successfully') - else: + # Fallback to direct uninstall — same transactional helper. + try: + _do_transactional_uninstall(plugin_id, preserve_config) + except Exception as err: if api_v3.operation_history: api_v3.operation_history.record_operation( "uninstall", plugin_id=plugin_id, status="failed", - error=f'Failed to uninstall plugin {plugin_id}' + error=f'Failed to uninstall plugin {plugin_id}: {err}', ) - return error_response( ErrorCode.PLUGIN_UNINSTALL_FAILED, - f'Failed to uninstall plugin {plugin_id}', - status_code=500 + f'Failed to uninstall plugin {plugin_id}: {err}', + status_code=500, ) + if api_v3.operation_history: + api_v3.operation_history.record_operation( + "uninstall", + plugin_id=plugin_id, + status="success", + details={"preserve_config": preserve_config}, + ) + return success_response(message=f'Plugin {plugin_id} uninstalled successfully') + except Exception as e: logger.exception("[PluginUninstall] Unhandled exception") from src.web_interface.errors import WebInterfaceError