perf(plugins): parallelize Plugin Store browse and extend metadata cache TTLs

Follow-up to the previous commit addressing the Plugin Store browse path
specifically. Most users install plugins via the store (ZIP extraction,
no .git directory) so the git-info mtime cache from the previous commit
was a no-op for them; their pain was coming from /plugins/store/list.

Root cause. search_plugins() enriched each returned plugin with three
serial GitHub fetches: _get_github_repo_info (repo API), _get_latest_commit_info
(commits API), _fetch_manifest_from_github (raw.githubusercontent.com).
Fifteen plugins × three requests × serial HTTP = 30–45 sequential round
trips on every cold browse. On a Pi4 over WiFi that translated directly
into the "connecting to display" hang users reported. The commit and
manifest caches had a 5-minute TTL, so even a brief absence re-paid the
full cost.

Changes.

- ``search_plugins``: fan out per-plugin enrichment through a
  ``ThreadPoolExecutor`` (max 10 workers, stays well under unauthenticated
  GitHub rate limits). Apply category/tag/query filters before enrichment
  so we never waste requests on plugins that will be filtered out.
  ``executor.map`` preserves input order, which the UI depends on.
- ``commit_cache_timeout`` and ``manifest_cache_timeout``: 5 min → 30 min.
  Keeps the cache warm across a realistic session while still picking up
  upstream updates in a reasonable window.
- ``_get_github_repo_info`` and ``_get_latest_commit_info``: stale-on-error
  fallback. On a network failure or a 403 we now prefer a previously-
  cached value over the zero-default, matching the pattern already in
  ``fetch_registry``. Flaky Pi WiFi no longer causes star counts to flip
  to 0 and commit info to disappear.

Tests (5 new in test_store_manager_caches.py).

- ``test_results_preserve_registry_order`` — the parallel map must still
  return plugins in input order.
- ``test_filters_applied_before_enrichment`` — category/tag/query filters
  run first so we don't waste HTTP calls.
- ``test_enrichment_runs_concurrently`` — peak-concurrency check plus a
  wall-time bound that would fail if the code regressed to serial.
- ``test_repo_info_stale_on_network_error`` — repo info falls back to
  stale cache on RequestException.
- ``test_commit_info_stale_on_network_error`` — commit info falls back to
  stale cache on RequestException.

All 29 tests (16 reconciliation, 13 store_manager caches) pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Chuck
2026-04-08 10:57:12 -04:00
parent b031b99130
commit c03eb8dbcc
2 changed files with 263 additions and 46 deletions

View File

@@ -14,6 +14,7 @@ import zipfile
import tempfile import tempfile
import requests import requests
import time import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Dict, Optional, Any from typing import List, Dict, Optional, Any
@@ -52,7 +53,7 @@ class PluginStoreManager:
self.registry_cache = None self.registry_cache = None
self.registry_cache_time = None # Timestamp of when registry was cached self.registry_cache_time = None # Timestamp of when registry was cached
self.github_cache = {} # Cache for GitHub API responses self.github_cache = {} # Cache for GitHub API responses
self.cache_timeout = 3600 # 1 hour cache timeout self.cache_timeout = 3600 # 1 hour cache timeout (repo info: stars, default_branch)
# 15 minutes for registry cache. Long enough that the plugin list # 15 minutes for registry cache. Long enough that the plugin list
# endpoint on a warm cache never hits the network, short enough that # endpoint on a warm cache never hits the network, short enough that
# new plugins show up within a reasonable window. See also the # new plugins show up within a reasonable window. See also the
@@ -60,9 +61,15 @@ class PluginStoreManager:
# failures. # failures.
self.registry_cache_timeout = 900 self.registry_cache_timeout = 900
self.commit_info_cache = {} # Cache for latest commit info: {key: (timestamp, data)} self.commit_info_cache = {} # Cache for latest commit info: {key: (timestamp, data)}
self.commit_cache_timeout = 300 # 5 minutes (same as registry) # 30 minutes for commit/manifest caches. Plugin Store users browse
# the catalog via /plugins/store/list which fetches commit info and
# manifest data per plugin. 5-min TTLs meant every fresh browse on
# a Pi4 paid for ~3 HTTP requests × N plugins (30-60s serial). 30
# minutes keeps the cache warm across a realistic session while
# still picking up upstream updates within a reasonable window.
self.commit_cache_timeout = 1800
self.manifest_cache = {} # Cache for GitHub manifest fetches: {key: (timestamp, data)} self.manifest_cache = {} # Cache for GitHub manifest fetches: {key: (timestamp, data)}
self.manifest_cache_timeout = 300 # 5 minutes self.manifest_cache_timeout = 1800
self.github_token = self._load_github_token() self.github_token = self._load_github_token()
self._token_validation_cache = {} # Cache for token validation results: {token: (is_valid, timestamp, error_message)} self._token_validation_cache = {} # Cache for token validation results: {token: (is_valid, timestamp, error_message)}
self._token_validation_cache_timeout = 300 # 5 minutes cache for token validation self._token_validation_cache_timeout = 300 # 5 minutes cache for token validation
@@ -342,7 +349,21 @@ class PluginStoreManager:
if self.github_token: if self.github_token:
headers['Authorization'] = f'token {self.github_token}' headers['Authorization'] = f'token {self.github_token}'
response = requests.get(api_url, headers=headers, timeout=10) try:
response = requests.get(api_url, headers=headers, timeout=10)
except requests.RequestException as req_err:
# Network error: prefer a stale cache hit over an
# empty default so the UI keeps working on a flaky
# Pi WiFi link.
if cache_key in self.github_cache:
_, stale = self.github_cache[cache_key]
self.logger.warning(
"GitHub repo info fetch failed for %s (%s); serving stale cache.",
cache_key, req_err,
)
return stale
raise
if response.status_code == 200: if response.status_code == 200:
data = response.json() data = response.json()
pushed_at = data.get('pushed_at', '') or data.get('updated_at', '') pushed_at = data.get('pushed_at', '') or data.get('updated_at', '')
@@ -362,7 +383,16 @@ class PluginStoreManager:
self.github_cache[cache_key] = (time.time(), repo_info) self.github_cache[cache_key] = (time.time(), repo_info)
return repo_info return repo_info
elif response.status_code == 403: elif response.status_code == 403:
# Rate limit or authentication issue # Rate limit or authentication issue. If we have a
# previously-cached value, serve it rather than
# returning empty defaults — a stale star count is
# better than a reset to zero.
if cache_key in self.github_cache:
_, stale = self.github_cache[cache_key]
self.logger.warning(
"GitHub API 403 for %s; serving stale cache.", cache_key,
)
return stale
if not self.github_token: if not self.github_token:
self.logger.warning( self.logger.warning(
f"GitHub API rate limit likely exceeded (403). " f"GitHub API rate limit likely exceeded (403). "
@@ -376,6 +406,9 @@ class PluginStoreManager:
) )
else: else:
self.logger.warning(f"GitHub API request failed: {response.status_code} for {api_url}") self.logger.warning(f"GitHub API request failed: {response.status_code} for {api_url}")
if cache_key in self.github_cache:
_, stale = self.github_cache[cache_key]
return stale
return { return {
'stars': 0, 'stars': 0,
@@ -558,68 +591,91 @@ class PluginStoreManager:
except Exception as e: except Exception as e:
self.logger.warning(f"Failed to fetch plugins from saved repository {repo_url}: {e}") self.logger.warning(f"Failed to fetch plugins from saved repository {repo_url}: {e}")
results = [] # First pass: apply cheap filters (category/tags/query) so we only
# fetch GitHub metadata for plugins that will actually be returned.
filtered: List[Dict] = []
for plugin in plugins: for plugin in plugins:
# Category filter
if category and plugin.get('category') != category: if category and plugin.get('category') != category:
continue continue
# Tags filter (match any tag)
if tags and not any(tag in plugin.get('tags', []) for tag in tags): if tags and not any(tag in plugin.get('tags', []) for tag in tags):
continue continue
# Query search (case-insensitive)
if query: if query:
query_lower = query.lower() query_lower = query.lower()
searchable_text = ' '.join([ searchable_text = ' '.join([
plugin.get('name', ''), plugin.get('name', ''),
plugin.get('description', ''), plugin.get('description', ''),
plugin.get('id', ''), plugin.get('id', ''),
plugin.get('author', '') plugin.get('author', ''),
]).lower() ]).lower()
if query_lower not in searchable_text: if query_lower not in searchable_text:
continue continue
filtered.append(plugin)
# Enhance plugin data with GitHub metadata def _enrich(plugin: Dict) -> Dict:
"""Enrich a single plugin with GitHub metadata.
Called concurrently from a ThreadPoolExecutor. Each underlying
HTTP helper (``_get_github_repo_info`` / ``_get_latest_commit_info``
/ ``_fetch_manifest_from_github``) is thread-safe — they use
``requests`` and write their own cache keys on Python dicts,
which is atomic under the GIL for single-key assignments.
"""
enhanced_plugin = plugin.copy() enhanced_plugin = plugin.copy()
# Get real GitHub stars
repo_url = plugin.get('repo', '') repo_url = plugin.get('repo', '')
if repo_url: if not repo_url:
github_info = self._get_github_repo_info(repo_url) return enhanced_plugin
enhanced_plugin['stars'] = github_info.get('stars', plugin.get('stars', 0))
enhanced_plugin['default_branch'] = github_info.get('default_branch', plugin.get('branch', 'main'))
enhanced_plugin['last_updated_iso'] = github_info.get('last_commit_iso')
enhanced_plugin['last_updated'] = github_info.get('last_commit_date')
if fetch_commit_info: github_info = self._get_github_repo_info(repo_url)
branch = plugin.get('branch') or github_info.get('default_branch', 'main') enhanced_plugin['stars'] = github_info.get('stars', plugin.get('stars', 0))
enhanced_plugin['default_branch'] = github_info.get('default_branch', plugin.get('branch', 'main'))
enhanced_plugin['last_updated_iso'] = github_info.get('last_commit_iso')
enhanced_plugin['last_updated'] = github_info.get('last_commit_date')
commit_info = self._get_latest_commit_info(repo_url, branch) if fetch_commit_info:
if commit_info: branch = plugin.get('branch') or github_info.get('default_branch', 'main')
enhanced_plugin['last_commit'] = commit_info.get('short_sha')
enhanced_plugin['last_commit_sha'] = commit_info.get('sha')
enhanced_plugin['last_updated'] = commit_info.get('date') or enhanced_plugin.get('last_updated')
enhanced_plugin['last_updated_iso'] = commit_info.get('date_iso') or enhanced_plugin.get('last_updated_iso')
enhanced_plugin['last_commit_message'] = commit_info.get('message')
enhanced_plugin['last_commit_author'] = commit_info.get('author')
enhanced_plugin['branch'] = commit_info.get('branch', branch)
enhanced_plugin['last_commit_branch'] = commit_info.get('branch')
# Fetch manifest from GitHub for additional metadata (description, etc.) commit_info = self._get_latest_commit_info(repo_url, branch)
plugin_subpath = plugin.get('plugin_path', '') if commit_info:
manifest_rel = f"{plugin_subpath}/manifest.json" if plugin_subpath else "manifest.json" enhanced_plugin['last_commit'] = commit_info.get('short_sha')
github_manifest = self._fetch_manifest_from_github(repo_url, branch, manifest_rel) enhanced_plugin['last_commit_sha'] = commit_info.get('sha')
if github_manifest: enhanced_plugin['last_updated'] = commit_info.get('date') or enhanced_plugin.get('last_updated')
if 'last_updated' in github_manifest and not enhanced_plugin.get('last_updated'): enhanced_plugin['last_updated_iso'] = commit_info.get('date_iso') or enhanced_plugin.get('last_updated_iso')
enhanced_plugin['last_updated'] = github_manifest['last_updated'] enhanced_plugin['last_commit_message'] = commit_info.get('message')
if 'description' in github_manifest: enhanced_plugin['last_commit_author'] = commit_info.get('author')
enhanced_plugin['description'] = github_manifest['description'] enhanced_plugin['branch'] = commit_info.get('branch', branch)
enhanced_plugin['last_commit_branch'] = commit_info.get('branch')
results.append(enhanced_plugin) plugin_subpath = plugin.get('plugin_path', '')
manifest_rel = f"{plugin_subpath}/manifest.json" if plugin_subpath else "manifest.json"
github_manifest = self._fetch_manifest_from_github(repo_url, branch, manifest_rel)
if github_manifest:
if 'last_updated' in github_manifest and not enhanced_plugin.get('last_updated'):
enhanced_plugin['last_updated'] = github_manifest['last_updated']
if 'description' in github_manifest:
enhanced_plugin['description'] = github_manifest['description']
return results return enhanced_plugin
# Fan out the per-plugin GitHub enrichment. The previous
# implementation did this serially, which on a Pi4 with ~15 plugins
# and a fresh cache meant 30+ HTTP requests in strict sequence (the
# "connecting to display" hang reported by users). With a thread
# pool, latency is dominated by the slowest request rather than
# their sum. Workers capped at 10 to stay well under the
# unauthenticated GitHub rate limit burst and avoid overwhelming a
# Pi's WiFi link. For a small number of plugins the pool is
# essentially free.
if not filtered:
return []
if len(filtered) == 1 or not fetch_commit_info and len(filtered) < 4:
# Not worth the pool overhead for tiny workloads.
return [_enrich(p) for p in filtered]
max_workers = min(10, len(filtered))
with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix='plugin-search') as executor:
# executor.map preserves input order, which the UI relies on.
return list(executor.map(_enrich, filtered))
def _fetch_manifest_from_github(self, repo_url: str, branch: str = "master", manifest_path: str = "manifest.json", force_refresh: bool = False) -> Optional[Dict]: def _fetch_manifest_from_github(self, repo_url: str, branch: str = "master", manifest_path: str = "manifest.json", force_refresh: bool = False) -> Optional[Dict]:
""" """
@@ -717,7 +773,22 @@ class PluginStoreManager:
last_error = None last_error = None
for branch_name in branches_to_try: for branch_name in branches_to_try:
api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch_name}" api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch_name}"
response = requests.get(api_url, headers=headers, timeout=10) try:
response = requests.get(api_url, headers=headers, timeout=10)
except requests.RequestException as req_err:
# Network failure: fall back to a stale cache hit if
# available so the plugin store UI keeps populating
# commit info on a flaky WiFi link.
if cache_key in self.commit_info_cache:
_, stale = self.commit_info_cache[cache_key]
if stale is not None:
self.logger.warning(
"GitHub commit fetch failed for %s (%s); serving stale cache.",
cache_key, req_err,
)
return stale
last_error = str(req_err)
continue
if response.status_code == 200: if response.status_code == 200:
commit_data = response.json() commit_data = response.json()
commit_sha_full = commit_data.get('sha', '') commit_sha_full = commit_data.get('sha', '')

View File

@@ -113,6 +113,152 @@ class TestGitInfoCache(unittest.TestCase):
self.assertIsNone(self.sm._get_local_git_info(non_git)) self.assertIsNone(self.sm._get_local_git_info(non_git))
class TestSearchPluginsParallel(unittest.TestCase):
"""Plugin Store browse path — the per-plugin GitHub enrichment used to
run serially, turning a browse of 15 plugins into 30–45 sequential HTTP
requests on a cold cache. This batch of tests locks in the parallel
fan-out and verifies output shape/ordering haven't regressed.
"""
def setUp(self):
self._tmp = TemporaryDirectory()
self.addCleanup(self._tmp.cleanup)
self.sm = PluginStoreManager(plugins_dir=self._tmp.name)
# Fake registry with 5 plugins.
self.registry = {
"plugins": [
{"id": f"plg{i}", "name": f"Plugin {i}",
"repo": f"https://github.com/owner/plg{i}", "category": "util"}
for i in range(5)
]
}
self.sm.registry_cache = self.registry
self.sm.registry_cache_time = time.time()
self._enrich_calls = []
def fake_repo(repo_url):
self._enrich_calls.append(("repo", repo_url))
return {"stars": 1, "default_branch": "main",
"last_commit_iso": "2026-04-08T00:00:00Z",
"last_commit_date": "2026-04-08"}
def fake_commit(repo_url, branch):
self._enrich_calls.append(("commit", repo_url, branch))
return {"short_sha": "abc1234", "sha": "abc1234" + "0" * 33,
"date_iso": "2026-04-08T00:00:00Z", "date": "2026-04-08",
"message": "m", "author": "a", "branch": branch}
def fake_manifest(repo_url, branch, manifest_path):
self._enrich_calls.append(("manifest", repo_url, branch))
return {"description": "desc"}
self.sm._get_github_repo_info = fake_repo
self.sm._get_latest_commit_info = fake_commit
self.sm._fetch_manifest_from_github = fake_manifest
def test_results_preserve_registry_order(self):
results = self.sm.search_plugins(include_saved_repos=False)
self.assertEqual([p["id"] for p in results],
[f"plg{i}" for i in range(5)])
def test_filters_applied_before_enrichment(self):
# Filter down to a single plugin via category — ensures we don't
# waste GitHub calls enriching plugins that won't be returned.
self.registry["plugins"][2]["category"] = "special"
self.sm.registry_cache = self.registry
self._enrich_calls.clear()
results = self.sm.search_plugins(category="special", include_saved_repos=False)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]["id"], "plg2")
# Only one plugin should have been enriched.
repo_calls = [c for c in self._enrich_calls if c[0] == "repo"]
self.assertEqual(len(repo_calls), 1)
def test_enrichment_runs_concurrently(self):
"""Verify the thread pool actually runs fetches in parallel.
We make each fake fetch sleep briefly and check wall time — if the
code had regressed to serial execution, 5 plugins × ~50ms sleep
would be ≥ 250ms. Parallel should be well under that.
"""
import threading
concurrent_workers = []
peak_lock = threading.Lock()
peak = {"count": 0, "current": 0}
def slow_repo(repo_url):
with peak_lock:
peak["current"] += 1
if peak["current"] > peak["count"]:
peak["count"] = peak["current"]
time.sleep(0.05)
with peak_lock:
peak["current"] -= 1
return {"stars": 0, "default_branch": "main",
"last_commit_iso": "", "last_commit_date": ""}
self.sm._get_github_repo_info = slow_repo
self.sm._get_latest_commit_info = lambda *a, **k: None
self.sm._fetch_manifest_from_github = lambda *a, **k: None
t0 = time.time()
results = self.sm.search_plugins(fetch_commit_info=False, include_saved_repos=False)
elapsed = time.time() - t0
self.assertEqual(len(results), 5)
# Serial would be ~250ms; parallel should be well under 200ms.
self.assertLess(elapsed, 0.2,
f"search_plugins took {elapsed:.3f}s — appears to have regressed to serial")
self.assertGreaterEqual(peak["count"], 2,
"no concurrent fetches observed — thread pool not engaging")
class TestStaleOnErrorFallbacks(unittest.TestCase):
"""When GitHub is unreachable, previously-cached values should still be
returned rather than zero/None. Important on Pi's WiFi links.
"""
def setUp(self):
self._tmp = TemporaryDirectory()
self.addCleanup(self._tmp.cleanup)
self.sm = PluginStoreManager(plugins_dir=self._tmp.name)
def test_repo_info_stale_on_network_error(self):
cache_key = "owner/repo"
good = {"stars": 42, "default_branch": "main",
"last_commit_iso": "", "last_commit_date": "",
"forks": 0, "open_issues": 0, "updated_at_iso": "",
"language": "", "license": ""}
# Seed the cache with a known-good value, then force expiry.
self.sm.github_cache[cache_key] = (time.time() - 10_000, good)
self.sm.cache_timeout = 1 # force re-fetch
import requests as real_requests
with patch("src.plugin_system.store_manager.requests.get",
side_effect=real_requests.ConnectionError("boom")):
result = self.sm._get_github_repo_info("https://github.com/owner/repo")
self.assertEqual(result["stars"], 42)
def test_commit_info_stale_on_network_error(self):
cache_key = "owner/repo:main"
good = {"branch": "main", "sha": "a" * 40, "short_sha": "aaaaaaa",
"date_iso": "2026-04-08T00:00:00Z", "date": "2026-04-08",
"author": "x", "message": "y"}
self.sm.commit_info_cache[cache_key] = (time.time() - 10_000, good)
self.sm.commit_cache_timeout = 1 # force re-fetch
import requests as real_requests
with patch("src.plugin_system.store_manager.requests.get",
side_effect=real_requests.ConnectionError("boom")):
result = self.sm._get_latest_commit_info(
"https://github.com/owner/repo", branch="main"
)
self.assertIsNotNone(result)
self.assertEqual(result["short_sha"], "aaaaaaa")
class TestRegistryStaleCacheFallback(unittest.TestCase): class TestRegistryStaleCacheFallback(unittest.TestCase):
def setUp(self): def setUp(self):
self._tmp = TemporaryDirectory() self._tmp = TemporaryDirectory()