mirror of
https://github.com/ChuckBuilds/LEDMatrix.git
synced 2026-04-10 13:02:59 +00:00
feat: add error detection, monitoring, and code quality improvements (#223)
* feat: add error detection, monitoring, and code quality improvements This comprehensive update addresses automatic error detection, code quality, and plugin development experience: ## Error Detection & Monitoring - Add ErrorAggregator service for centralized error tracking - Add pattern detection for recurring errors (5+ in 60 min) - Add error dashboard API endpoints (/api/v3/errors/*) - Integrate error recording into plugin executor ## Code Quality - Remove 10 silent `except: pass` blocks in sports.py and football.py - Remove hardcoded debug log paths - Add pre-commit hooks to prevent future bare except clauses ## Validation & Type Safety - Add warnings when plugins lack config_schema.json - Add config key collision detection for plugins - Improve type coercion logging in BasePlugin ## Testing - Add test_config_validation_edge_cases.py - Add test_plugin_loading_failures.py - Add test_error_aggregator.py ## Documentation - Add PLUGIN_ERROR_HANDLING.md guide - Add CONFIG_DEBUGGING.md guide Note: GitHub Actions CI workflow is available in the plan but requires workflow scope to push. Add .github/workflows/ci.yml manually. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: address code review issues - Fix GitHub issues URL in CONFIG_DEBUGGING.md - Use RLock in error_aggregator.py to prevent deadlock in export_to_file - Distinguish missing vs invalid schema files in plugin_manager.py - Add assertions to test_null_value_for_required_field test - Remove unused initial_count variable in test_plugin_load_error_recorded - Add validation for max_age_hours in clear_old_errors API endpoint Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Chuck <chuck@example.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -387,43 +387,8 @@ class FootballLive(Football, SportsLive):
|
||||
main_img = main_img.convert('RGB') # Convert for display
|
||||
|
||||
# Display the final image
|
||||
# #region agent log
|
||||
import json
|
||||
import time
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "C",
|
||||
"location": "football.py:390",
|
||||
"message": "About to update display",
|
||||
"data": {
|
||||
"force_clear": force_clear,
|
||||
"game": game.get('away_abbr', '') + "@" + game.get('home_abbr', '')
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
self.display_manager.image.paste(main_img, (0, 0))
|
||||
self.display_manager.update_display() # Update display here for live
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "C",
|
||||
"location": "football.py:392",
|
||||
"message": "After update display",
|
||||
"data": {
|
||||
"force_clear": force_clear
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error displaying live Football game: {e}", exc_info=True) # Changed log prefix
|
||||
|
||||
@@ -207,25 +207,6 @@ class SportsCore(ABC):
|
||||
|
||||
def display(self, force_clear: bool = False) -> bool:
|
||||
"""Common display method for all NCAA FB managers""" # Updated docstring
|
||||
# #region agent log
|
||||
import json
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "D",
|
||||
"location": "sports.py:208",
|
||||
"message": "Display called",
|
||||
"data": {
|
||||
"force_clear": force_clear,
|
||||
"has_current_game": self.current_game is not None,
|
||||
"current_game": self.current_game['away_abbr'] + "@" + self.current_game['home_abbr'] if self.current_game else None
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
if not self.is_enabled: # Check if module is enabled
|
||||
return False
|
||||
|
||||
@@ -248,40 +229,7 @@ class SportsCore(ABC):
|
||||
return False
|
||||
|
||||
try:
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "D",
|
||||
"location": "sports.py:232",
|
||||
"message": "About to draw scorebug",
|
||||
"data": {
|
||||
"force_clear": force_clear,
|
||||
"game": self.current_game['away_abbr'] + "@" + self.current_game['home_abbr'] if self.current_game else None
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
self._draw_scorebug_layout(self.current_game, force_clear)
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "D",
|
||||
"location": "sports.py:235",
|
||||
"message": "After draw scorebug",
|
||||
"data": {
|
||||
"force_clear": force_clear
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
# display_manager.update_display() should be called within subclass draw methods
|
||||
# or after calling display() in the main loop. Let's keep it out of the base display.
|
||||
return True
|
||||
@@ -1443,48 +1391,9 @@ class SportsLive(SportsCore):
|
||||
self.live_games = sorted(new_live_games, key=lambda g: g.get('start_time_utc') or datetime.now(timezone.utc)) # Sort by start time
|
||||
# Reset index if current game is gone or list is new
|
||||
if not self.current_game or self.current_game['id'] not in new_game_ids:
|
||||
# #region agent log
|
||||
import json
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "B",
|
||||
"location": "sports.py:1393",
|
||||
"message": "Games loaded - resetting index and last_game_switch",
|
||||
"data": {
|
||||
"current_game_before": self.current_game['id'] if self.current_game else None,
|
||||
"live_games_count": len(self.live_games),
|
||||
"last_game_switch_before": self.last_game_switch,
|
||||
"current_time": current_time,
|
||||
"time_since_init": current_time - self.last_game_switch if self.last_game_switch > 0 else None
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
self.current_game_index = 0
|
||||
self.current_game = self.live_games[0] if self.live_games else None
|
||||
self.last_game_switch = current_time
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "B",
|
||||
"location": "sports.py:1396",
|
||||
"message": "Games loaded - after setting last_game_switch",
|
||||
"data": {
|
||||
"current_game_after": self.current_game['id'] if self.current_game else None,
|
||||
"last_game_switch_after": self.last_game_switch,
|
||||
"first_game": self.current_game['away_abbr'] + "@" + self.current_game['home_abbr'] if self.current_game else None
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
else:
|
||||
# Find current game's new index if it still exists
|
||||
try:
|
||||
@@ -1530,70 +1439,9 @@ class SportsLive(SportsCore):
|
||||
# Handle game switching (outside test mode check)
|
||||
# Fix: Don't check for switching if last_game_switch is still 0 (games haven't been loaded yet)
|
||||
# This prevents immediate switching when the system has been running for a while before games load
|
||||
# #region agent log
|
||||
import json
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "A",
|
||||
"location": "sports.py:1432",
|
||||
"message": "Game switch check - before condition",
|
||||
"data": {
|
||||
"test_mode": self.test_mode,
|
||||
"live_games_count": len(self.live_games),
|
||||
"current_time": current_time,
|
||||
"last_game_switch": self.last_game_switch,
|
||||
"time_since_switch": current_time - self.last_game_switch,
|
||||
"game_display_duration": self.game_display_duration,
|
||||
"current_game_index": self.current_game_index,
|
||||
"will_switch": not self.test_mode and len(self.live_games) > 1 and self.last_game_switch > 0 and (current_time - self.last_game_switch) >= self.game_display_duration
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
if not self.test_mode and len(self.live_games) > 1 and self.last_game_switch > 0 and (current_time - self.last_game_switch) >= self.game_display_duration:
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "A",
|
||||
"location": "sports.py:1433",
|
||||
"message": "Game switch triggered",
|
||||
"data": {
|
||||
"old_index": self.current_game_index,
|
||||
"old_game": self.current_game['away_abbr'] + "@" + self.current_game['home_abbr'] if self.current_game else None,
|
||||
"time_since_switch": current_time - self.last_game_switch,
|
||||
"last_game_switch_before": self.last_game_switch
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
self.current_game_index = (self.current_game_index + 1) % len(self.live_games)
|
||||
self.current_game = self.live_games[self.current_game_index]
|
||||
self.last_game_switch = current_time
|
||||
# #region agent log
|
||||
try:
|
||||
with open('/home/chuck/Github/LEDMatrix/.cursor/debug.log', 'a') as f:
|
||||
f.write(json.dumps({
|
||||
"sessionId": "debug-session",
|
||||
"runId": "run1",
|
||||
"hypothesisId": "A",
|
||||
"location": "sports.py:1436",
|
||||
"message": "Game switch completed",
|
||||
"data": {
|
||||
"new_index": self.current_game_index,
|
||||
"new_game": self.current_game['away_abbr'] + "@" + self.current_game['home_abbr'] if self.current_game else None,
|
||||
"last_game_switch_after": self.last_game_switch
|
||||
},
|
||||
"timestamp": int(time.time() * 1000)
|
||||
}) + "\n")
|
||||
except: pass
|
||||
# #endregion
|
||||
self.logger.info(f"Switched live view to: {self.current_game['away_abbr']}@{self.current_game['home_abbr']}") # Changed log prefix
|
||||
# Force display update via flag or direct call if needed, but usually let main loop handle
|
||||
|
||||
418
src/error_aggregator.py
Normal file
418
src/error_aggregator.py
Normal file
@@ -0,0 +1,418 @@
|
||||
"""
|
||||
Error Aggregation Service
|
||||
|
||||
Provides centralized error tracking, pattern detection, and reporting
|
||||
for the LEDMatrix system. Enables automatic bug detection by tracking
|
||||
error frequency, patterns, and context.
|
||||
|
||||
This is a local-only implementation with no external dependencies.
|
||||
Errors are stored in memory with optional JSON export.
|
||||
"""
|
||||
|
||||
import threading
|
||||
import traceback
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
import logging
|
||||
|
||||
from src.exceptions import LEDMatrixError
|
||||
|
||||
|
||||
@dataclass
class ErrorRecord:
    """A single error occurrence plus the context it was captured with."""

    error_type: str                     # exception class name, e.g. "KeyError"
    message: str                        # str(error) at record time
    timestamp: datetime                 # wall-clock time the error was recorded
    context: Dict[str, Any] = field(default_factory=dict)  # extra structured details
    plugin_id: Optional[str] = None     # plugin that raised, if known
    operation: Optional[str] = None     # operation name, e.g. "update", "display"
    stack_trace: Optional[str] = None   # formatted traceback text, if captured

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary view of this record."""
        # dict(...) keyword form keeps the same key order as a literal would.
        return dict(
            error_type=self.error_type,
            message=self.message,
            timestamp=self.timestamp.isoformat(),
            context=self.context,
            plugin_id=self.plugin_id,
            operation=self.operation,
            stack_trace=self.stack_trace,
        )
|
||||
|
||||
|
||||
@dataclass
class ErrorPattern:
    """Summary of a recurring error type detected within the time window."""

    error_type: str
    count: int
    first_seen: datetime
    last_seen: datetime
    affected_plugins: List[str] = field(default_factory=list)
    sample_messages: List[str] = field(default_factory=list)
    severity: str = "warning"  # one of: "warning", "error", "critical"

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary view of this pattern."""
        # De-duplicate plugin IDs; set() means the order is not guaranteed.
        unique_plugins = list(set(self.affected_plugins))
        return dict(
            error_type=self.error_type,
            count=self.count,
            first_seen=self.first_seen.isoformat(),
            last_seen=self.last_seen.isoformat(),
            affected_plugins=unique_plugins,
            sample_messages=self.sample_messages[:3],  # cap payload at 3 samples
            severity=self.severity,
        )
|
||||
|
||||
|
||||
class ErrorAggregator:
|
||||
"""
|
||||
Aggregates and analyzes errors across the system.
|
||||
|
||||
Features:
|
||||
- Error counting by type, plugin, and time window
|
||||
- Pattern detection (recurring errors)
|
||||
- Error rate alerting via callbacks
|
||||
- Export for analytics/reporting
|
||||
|
||||
Thread-safe for concurrent access.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_records: int = 1000,
|
||||
pattern_threshold: int = 5,
|
||||
pattern_window_minutes: int = 60,
|
||||
export_path: Optional[Path] = None
|
||||
):
|
||||
"""
|
||||
Initialize the error aggregator.
|
||||
|
||||
Args:
|
||||
max_records: Maximum number of error records to keep in memory
|
||||
pattern_threshold: Number of occurrences to detect a pattern
|
||||
pattern_window_minutes: Time window for pattern detection
|
||||
export_path: Optional path for JSON export (auto-export on pattern detection)
|
||||
"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.max_records = max_records
|
||||
self.pattern_threshold = pattern_threshold
|
||||
self.pattern_window = timedelta(minutes=pattern_window_minutes)
|
||||
self.export_path = export_path
|
||||
|
||||
self._records: List[ErrorRecord] = []
|
||||
self._error_counts: Dict[str, int] = defaultdict(int)
|
||||
self._plugin_error_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
|
||||
self._patterns: Dict[str, ErrorPattern] = {}
|
||||
self._pattern_callbacks: List[Callable[[ErrorPattern], None]] = []
|
||||
self._lock = threading.RLock() # RLock allows nested acquisition for export_to_file
|
||||
|
||||
# Track session start for relative timing
|
||||
self._session_start = datetime.now()
|
||||
|
||||
def record_error(
|
||||
self,
|
||||
error: Exception,
|
||||
context: Optional[Dict[str, Any]] = None,
|
||||
plugin_id: Optional[str] = None,
|
||||
operation: Optional[str] = None
|
||||
) -> ErrorRecord:
|
||||
"""
|
||||
Record an error occurrence.
|
||||
|
||||
Args:
|
||||
error: The exception that occurred
|
||||
context: Optional context dictionary with additional details
|
||||
plugin_id: Optional plugin ID that caused the error
|
||||
operation: Optional operation name (e.g., "update", "display")
|
||||
|
||||
Returns:
|
||||
The created ErrorRecord
|
||||
"""
|
||||
with self._lock:
|
||||
error_type = type(error).__name__
|
||||
|
||||
# Extract additional context from LEDMatrixError subclasses
|
||||
error_context = context or {}
|
||||
if isinstance(error, LEDMatrixError) and error.context:
|
||||
error_context.update(error.context)
|
||||
|
||||
record = ErrorRecord(
|
||||
error_type=error_type,
|
||||
message=str(error),
|
||||
timestamp=datetime.now(),
|
||||
context=error_context,
|
||||
plugin_id=plugin_id,
|
||||
operation=operation,
|
||||
stack_trace=traceback.format_exc()
|
||||
)
|
||||
|
||||
# Add record (with size limit)
|
||||
self._records.append(record)
|
||||
if len(self._records) > self.max_records:
|
||||
self._records.pop(0)
|
||||
|
||||
# Update counts
|
||||
self._error_counts[error_type] += 1
|
||||
if plugin_id:
|
||||
self._plugin_error_counts[plugin_id][error_type] += 1
|
||||
|
||||
# Check for patterns
|
||||
self._detect_pattern(record)
|
||||
|
||||
# Log the error
|
||||
self.logger.debug(
|
||||
f"Error recorded: {error_type} - {str(error)[:100]}",
|
||||
extra={"plugin_id": plugin_id, "operation": operation}
|
||||
)
|
||||
|
||||
return record
|
||||
|
||||
def _detect_pattern(self, record: ErrorRecord) -> None:
|
||||
"""Detect recurring error patterns."""
|
||||
cutoff = datetime.now() - self.pattern_window
|
||||
recent_same_type = [
|
||||
r for r in self._records
|
||||
if r.error_type == record.error_type and r.timestamp > cutoff
|
||||
]
|
||||
|
||||
if len(recent_same_type) >= self.pattern_threshold:
|
||||
pattern_key = record.error_type
|
||||
is_new_pattern = pattern_key not in self._patterns
|
||||
|
||||
# Determine severity based on count
|
||||
count = len(recent_same_type)
|
||||
if count > self.pattern_threshold * 3:
|
||||
severity = "critical"
|
||||
elif count > self.pattern_threshold * 2:
|
||||
severity = "error"
|
||||
else:
|
||||
severity = "warning"
|
||||
|
||||
# Collect affected plugins
|
||||
affected_plugins = [r.plugin_id for r in recent_same_type if r.plugin_id]
|
||||
|
||||
# Collect sample messages
|
||||
sample_messages = list(set(r.message for r in recent_same_type[:5]))
|
||||
|
||||
if is_new_pattern:
|
||||
pattern = ErrorPattern(
|
||||
error_type=record.error_type,
|
||||
count=count,
|
||||
first_seen=recent_same_type[0].timestamp,
|
||||
last_seen=record.timestamp,
|
||||
affected_plugins=affected_plugins,
|
||||
sample_messages=sample_messages,
|
||||
severity=severity
|
||||
)
|
||||
self._patterns[pattern_key] = pattern
|
||||
|
||||
self.logger.warning(
|
||||
f"Error pattern detected: {record.error_type} occurred "
|
||||
f"{count} times in last {self.pattern_window}. "
|
||||
f"Affected plugins: {set(affected_plugins) or 'unknown'}"
|
||||
)
|
||||
|
||||
# Notify callbacks
|
||||
for callback in self._pattern_callbacks:
|
||||
try:
|
||||
callback(pattern)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Pattern callback failed: {e}")
|
||||
|
||||
# Auto-export if path configured
|
||||
if self.export_path:
|
||||
self._auto_export()
|
||||
else:
|
||||
# Update existing pattern
|
||||
self._patterns[pattern_key].count = count
|
||||
self._patterns[pattern_key].last_seen = record.timestamp
|
||||
self._patterns[pattern_key].severity = severity
|
||||
self._patterns[pattern_key].affected_plugins.extend(affected_plugins)
|
||||
|
||||
def on_pattern_detected(self, callback: Callable[[ErrorPattern], None]) -> None:
|
||||
"""
|
||||
Register a callback to be called when a new error pattern is detected.
|
||||
|
||||
Args:
|
||||
callback: Function that takes an ErrorPattern as argument
|
||||
"""
|
||||
self._pattern_callbacks.append(callback)
|
||||
|
||||
def get_error_summary(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get summary of all errors for reporting.
|
||||
|
||||
Returns:
|
||||
Dictionary with error statistics and recent errors
|
||||
"""
|
||||
with self._lock:
|
||||
# Calculate error rate (errors per hour)
|
||||
session_duration = (datetime.now() - self._session_start).total_seconds() / 3600
|
||||
error_rate = len(self._records) / max(session_duration, 0.01)
|
||||
|
||||
return {
|
||||
"session_start": self._session_start.isoformat(),
|
||||
"total_errors": len(self._records),
|
||||
"error_rate_per_hour": round(error_rate, 2),
|
||||
"error_counts_by_type": dict(self._error_counts),
|
||||
"plugin_error_counts": {
|
||||
k: dict(v) for k, v in self._plugin_error_counts.items()
|
||||
},
|
||||
"active_patterns": {
|
||||
k: v.to_dict() for k, v in self._patterns.items()
|
||||
},
|
||||
"recent_errors": [
|
||||
r.to_dict() for r in self._records[-20:]
|
||||
]
|
||||
}
|
||||
|
||||
def get_plugin_health(self, plugin_id: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Get health status for a specific plugin.
|
||||
|
||||
Args:
|
||||
plugin_id: Plugin ID to check
|
||||
|
||||
Returns:
|
||||
Dictionary with plugin error statistics
|
||||
"""
|
||||
with self._lock:
|
||||
plugin_errors = self._plugin_error_counts.get(plugin_id, {})
|
||||
recent_plugin_errors = [
|
||||
r for r in self._records[-100:]
|
||||
if r.plugin_id == plugin_id
|
||||
]
|
||||
|
||||
# Determine health status
|
||||
recent_count = len(recent_plugin_errors)
|
||||
if recent_count == 0:
|
||||
status = "healthy"
|
||||
elif recent_count < 5:
|
||||
status = "degraded"
|
||||
else:
|
||||
status = "unhealthy"
|
||||
|
||||
return {
|
||||
"plugin_id": plugin_id,
|
||||
"status": status,
|
||||
"total_errors": sum(plugin_errors.values()),
|
||||
"error_types": dict(plugin_errors),
|
||||
"recent_error_count": recent_count,
|
||||
"last_error": recent_plugin_errors[-1].to_dict() if recent_plugin_errors else None
|
||||
}
|
||||
|
||||
def clear_old_records(self, max_age_hours: int = 24) -> int:
|
||||
"""
|
||||
Clear records older than specified age.
|
||||
|
||||
Args:
|
||||
max_age_hours: Maximum age in hours
|
||||
|
||||
Returns:
|
||||
Number of records cleared
|
||||
"""
|
||||
with self._lock:
|
||||
cutoff = datetime.now() - timedelta(hours=max_age_hours)
|
||||
original_count = len(self._records)
|
||||
self._records = [r for r in self._records if r.timestamp > cutoff]
|
||||
cleared = original_count - len(self._records)
|
||||
|
||||
if cleared > 0:
|
||||
self.logger.info(f"Cleared {cleared} old error records")
|
||||
|
||||
return cleared
|
||||
|
||||
def export_to_file(self, filepath: Path) -> None:
|
||||
"""
|
||||
Export error data to JSON file.
|
||||
|
||||
Args:
|
||||
filepath: Path to export file
|
||||
"""
|
||||
with self._lock:
|
||||
data = {
|
||||
"exported_at": datetime.now().isoformat(),
|
||||
"summary": self.get_error_summary(),
|
||||
"all_records": [r.to_dict() for r in self._records]
|
||||
}
|
||||
filepath.parent.mkdir(parents=True, exist_ok=True)
|
||||
filepath.write_text(json.dumps(data, indent=2))
|
||||
self.logger.info(f"Exported error data to {filepath}")
|
||||
|
||||
def _auto_export(self) -> None:
|
||||
"""Auto-export on pattern detection (if export_path configured)."""
|
||||
if self.export_path:
|
||||
try:
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filepath = self.export_path / f"errors_{timestamp}.json"
|
||||
self.export_to_file(filepath)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Auto-export failed: {e}")
|
||||
|
||||
|
||||
# Global singleton instance; lazily created under _aggregator_lock.
_error_aggregator: Optional[ErrorAggregator] = None
_aggregator_lock = threading.Lock()


def get_error_aggregator(
    max_records: int = 1000,
    pattern_threshold: int = 5,
    pattern_window_minutes: int = 60,
    export_path: Optional[Path] = None
) -> ErrorAggregator:
    """
    Get or create the global error aggregator instance.

    The keyword arguments only take effect on the very first call, when the
    singleton is actually constructed; every later call ignores them and
    returns the existing instance.

    Args:
        max_records: Maximum records to keep (only used on first call)
        pattern_threshold: Pattern detection threshold (only used on first call)
        pattern_window_minutes: Pattern detection window (only used on first call)
        export_path: Export path for auto-export (only used on first call)

    Returns:
        The global ErrorAggregator instance
    """
    global _error_aggregator

    # Lock guards the check-then-create so two threads cannot both build
    # an aggregator during first use.
    with _aggregator_lock:
        if _error_aggregator is None:
            _error_aggregator = ErrorAggregator(
                max_records=max_records,
                pattern_threshold=pattern_threshold,
                pattern_window_minutes=pattern_window_minutes,
                export_path=export_path,
            )
        return _error_aggregator
|
||||
|
||||
|
||||
def record_error(
    error: Exception,
    context: Optional[Dict[str, Any]] = None,
    plugin_id: Optional[str] = None,
    operation: Optional[str] = None
) -> ErrorRecord:
    """
    Convenience function to record an error to the global aggregator.

    Args:
        error: The exception that occurred
        context: Optional context dictionary
        plugin_id: Optional plugin ID
        operation: Optional operation name

    Returns:
        The created ErrorRecord
    """
    # Delegate straight to the singleton; all bookkeeping lives there.
    aggregator = get_error_aggregator()
    return aggregator.record_error(
        error=error,
        context=context,
        plugin_id=plugin_id,
        operation=operation,
    )
|
||||
@@ -133,11 +133,11 @@ class BasePlugin(ABC):
|
||||
def get_display_duration(self) -> float:
|
||||
"""
|
||||
Get the display duration for this plugin instance.
|
||||
|
||||
|
||||
Automatically detects duration from:
|
||||
1. self.display_duration instance variable (if exists)
|
||||
2. self.config.get("display_duration", 15.0) (fallback)
|
||||
|
||||
|
||||
Can be overridden by plugins to provide dynamic durations based
|
||||
on content (e.g., longer duration for more complex displays).
|
||||
|
||||
@@ -155,27 +155,78 @@ class BasePlugin(ABC):
|
||||
elif isinstance(duration, (int, float)):
|
||||
if duration > 0:
|
||||
return float(duration)
|
||||
else:
|
||||
self.logger.debug(
|
||||
"display_duration instance variable is non-positive (%s), using config fallback",
|
||||
duration
|
||||
)
|
||||
# Try converting string representations of numbers
|
||||
elif isinstance(duration, str):
|
||||
try:
|
||||
duration_float = float(duration)
|
||||
if duration_float > 0:
|
||||
return duration_float
|
||||
else:
|
||||
self.logger.debug(
|
||||
"display_duration string value is non-positive (%s), using config fallback",
|
||||
duration
|
||||
)
|
||||
except (ValueError, TypeError):
|
||||
pass # Fall through to config
|
||||
except (TypeError, ValueError, AttributeError):
|
||||
pass # Fall through to config
|
||||
self.logger.warning(
|
||||
"display_duration instance variable has invalid string value '%s', using config fallback",
|
||||
duration
|
||||
)
|
||||
else:
|
||||
self.logger.warning(
|
||||
"display_duration instance variable has unexpected type %s (value: %s), using config fallback",
|
||||
type(duration).__name__, duration
|
||||
)
|
||||
except (TypeError, ValueError, AttributeError) as e:
|
||||
self.logger.warning(
|
||||
"Error reading display_duration instance variable: %s, using config fallback",
|
||||
e
|
||||
)
|
||||
|
||||
# Fall back to config
|
||||
config_duration = self.config.get("display_duration", 15.0)
|
||||
try:
|
||||
# Ensure config value is also a valid float
|
||||
if isinstance(config_duration, (int, float)):
|
||||
return float(config_duration) if config_duration > 0 else 15.0
|
||||
if config_duration > 0:
|
||||
return float(config_duration)
|
||||
else:
|
||||
self.logger.debug(
|
||||
"Config display_duration is non-positive (%s), using default 15.0",
|
||||
config_duration
|
||||
)
|
||||
return 15.0
|
||||
elif isinstance(config_duration, str):
|
||||
return float(config_duration) if float(config_duration) > 0 else 15.0
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
try:
|
||||
duration_float = float(config_duration)
|
||||
if duration_float > 0:
|
||||
return duration_float
|
||||
else:
|
||||
self.logger.debug(
|
||||
"Config display_duration string is non-positive (%s), using default 15.0",
|
||||
config_duration
|
||||
)
|
||||
return 15.0
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
"Config display_duration has invalid string value '%s', using default 15.0",
|
||||
config_duration
|
||||
)
|
||||
return 15.0
|
||||
else:
|
||||
self.logger.warning(
|
||||
"Config display_duration has unexpected type %s (value: %s), using default 15.0",
|
||||
type(config_duration).__name__, config_duration
|
||||
)
|
||||
except (ValueError, TypeError) as e:
|
||||
self.logger.warning(
|
||||
"Error processing config display_duration: %s, using default 15.0",
|
||||
e
|
||||
)
|
||||
|
||||
return 15.0
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ import logging
|
||||
|
||||
from src.exceptions import PluginError
|
||||
from src.logging_config import get_logger
|
||||
from src.error_aggregator import record_error
|
||||
|
||||
|
||||
class TimeoutError(Exception):
|
||||
@@ -80,12 +81,15 @@ class PluginExecutor:
|
||||
if not result_container['completed']:
|
||||
error_msg = f"{plugin_context} operation timed out after {timeout}s"
|
||||
self.logger.error(error_msg)
|
||||
raise TimeoutError(error_msg)
|
||||
|
||||
timeout_error = TimeoutError(error_msg)
|
||||
record_error(timeout_error, plugin_id=plugin_id, operation="timeout")
|
||||
raise timeout_error
|
||||
|
||||
if result_container['exception']:
|
||||
error = result_container['exception']
|
||||
error_msg = f"{plugin_context} operation failed: {error}"
|
||||
self.logger.error(error_msg, exc_info=True)
|
||||
record_error(error, plugin_id=plugin_id, operation="execute")
|
||||
raise PluginError(error_msg, plugin_id=plugin_id) from error
|
||||
|
||||
return result_container['value']
|
||||
@@ -128,7 +132,7 @@ class PluginExecutor:
|
||||
self.logger.error("Plugin %s update() timed out", plugin_id)
|
||||
return False
|
||||
except PluginError:
|
||||
# Already logged in execute_with_timeout
|
||||
# Already logged and recorded in execute_with_timeout
|
||||
return False
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
@@ -137,6 +141,7 @@ class PluginExecutor:
|
||||
e,
|
||||
exc_info=True
|
||||
)
|
||||
record_error(e, plugin_id=plugin_id, operation="update")
|
||||
return False
|
||||
|
||||
def execute_display(
|
||||
@@ -203,7 +208,7 @@ class PluginExecutor:
|
||||
self.logger.error("Plugin %s display() timed out", plugin_id)
|
||||
return False
|
||||
except PluginError:
|
||||
# Already logged in execute_with_timeout
|
||||
# Already logged and recorded in execute_with_timeout
|
||||
return False
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
@@ -212,6 +217,7 @@ class PluginExecutor:
|
||||
e,
|
||||
exc_info=True
|
||||
)
|
||||
record_error(e, plugin_id=plugin_id, operation="display")
|
||||
return False
|
||||
|
||||
def execute_safe(
|
||||
|
||||
@@ -136,13 +136,24 @@ class PluginManager:
|
||||
def discover_plugins(self) -> List[str]:
    """
    Discover all plugins in the plugins directory.

    Also checks for potential config key collisions and logs warnings.

    Returns:
        List of plugin IDs
    """
    self.logger.info("Discovering plugins in %s", self.plugins_dir)
    plugin_ids = self._scan_directory_for_plugins(self.plugins_dir)
    self.logger.info("Discovered %d plugin(s)", len(plugin_ids))

    # Surface (but never fail on) plugin-ID collisions with reserved config
    # keys or with other plugins.
    for collision in self.schema_manager.detect_config_key_collisions(plugin_ids):
        self.logger.warning(
            "Config collision detected: %s",
            collision.get('message', str(collision))
        )

    return plugin_ids
|
||||
|
||||
def _get_dependency_marker_path(self, plugin_id: str) -> Path:
|
||||
@@ -288,6 +299,24 @@ class PluginManager:
|
||||
else:
|
||||
config = {}
|
||||
|
||||
# Check if plugin has a config schema
|
||||
schema_path = self.schema_manager.get_schema_path(plugin_id)
|
||||
if schema_path is None:
|
||||
# Schema file doesn't exist
|
||||
self.logger.warning(
|
||||
f"Plugin '{plugin_id}' has no config_schema.json - configuration will not be validated. "
|
||||
f"Consider adding a schema file for better error detection and user experience."
|
||||
)
|
||||
else:
|
||||
# Schema file exists, try to load it
|
||||
schema = self.schema_manager.load_schema(plugin_id)
|
||||
if schema is None:
|
||||
# Schema exists but couldn't be loaded (likely invalid JSON or schema)
|
||||
self.logger.warning(
|
||||
f"Plugin '{plugin_id}' has a config_schema.json but it could not be loaded. "
|
||||
f"The schema may be invalid. Please verify the schema file at: {schema_path}"
|
||||
)
|
||||
|
||||
# Merge config with schema defaults to ensure all defaults are applied
|
||||
try:
|
||||
defaults = self.schema_manager.generate_default_config(plugin_id, use_cache=True)
|
||||
|
||||
@@ -445,3 +445,62 @@ class SchemaManager:
|
||||
replace_none_with_defaults(merged, defaults)
|
||||
return merged
|
||||
|
||||
def detect_config_key_collisions(
    self,
    plugin_ids: List[str]
) -> List[Dict[str, Any]]:
    """
    Detect config key collisions between plugins.

    Checks for:
    1. Plugin IDs that collide with reserved system config keys
    2. Plugin IDs that might cause confusion or conflicts (case-insensitive
       duplicates among the given IDs)

    Args:
        plugin_ids: List of plugin identifiers to check

    Returns:
        List of collision warnings, each containing:
        - type: 'reserved_key_collision' or 'case_collision'
        - plugin_id: The plugin ID involved
        - message: Human-readable warning message
        ('case_collision' entries additionally carry 'conflicting_id'.)
    """
    collisions: List[Dict[str, Any]] = []

    # Reserved top-level config keys that plugins should not use as IDs
    reserved_keys = {
        'display', 'schedule', 'timezone', 'plugin_system',
        'display_modes', 'system', 'hardware', 'debug',
        'log_level', 'emulator', 'web_interface'
    }
    # Fix: hoist the lowercased reserved set out of the loop — the previous
    # code rebuilt {k.lower() for k in reserved_keys} once per plugin_id.
    reserved_lower = {k.lower() for k in reserved_keys}

    # Maps lowercase ID -> first plugin ID seen with that spelling,
    # for case-insensitive collision detection.
    lowercase_ids: Dict[str, str] = {}

    for plugin_id in plugin_ids:
        lower_id = plugin_id.lower()

        # Check reserved key collision
        if lower_id in reserved_lower:
            collisions.append({
                "type": "reserved_key_collision",
                "plugin_id": plugin_id,
                "message": f"Plugin ID '{plugin_id}' conflicts with reserved config key. "
                           f"This may cause configuration issues."
            })

        # Check for case-insensitive collisions between plugins; an exact
        # duplicate ID is not reported, only a differently-cased one.
        if lower_id in lowercase_ids:
            existing_id = lowercase_ids[lower_id]
            if existing_id != plugin_id:
                collisions.append({
                    "type": "case_collision",
                    "plugin_id": plugin_id,
                    "conflicting_id": existing_id,
                    "message": f"Plugin ID '{plugin_id}' may conflict with '{existing_id}' "
                               f"on case-insensitive file systems."
                })
        else:
            lowercase_ids[lower_id] = plugin_id

    return collisions
|
||||
|
||||
|
||||
Reference in New Issue
Block a user