feat(cache): Add intelligent disk cache cleanup with retention policies (#199)

* feat(cache): Add intelligent disk cache cleanup with retention policies

- Add cleanup_expired_files() method to DiskCache class
- Implement retention policies based on cache data types:
  * Odds data: 2 days (lines move frequently)
  * Live/recent/leaderboard: 7 days (weekly updates)
  * News/stocks: 14 days
  * Upcoming/schedules/team_info/logos: 60 days (stable data)
- Add cleanup_disk_cache() orchestration in CacheManager
- Start background cleanup thread running every 24 hours
- Run cleanup on application startup
- Add disk cleanup metrics tracking
- Comprehensive logging with cleanup statistics

This prevents disk cache from accumulating indefinitely while preserving
important season data longer than volatile live game data.

* refactor(cache): improve disk cache cleanup implementation

- Implement force parameter throttle mechanism in cleanup_disk_cache
- Fix TOCTOU race condition in disk cache cleanup (getsize/remove)
- Reduce lock contention by processing files outside lock where possible
- Add CacheStrategyProtocol for better type safety (replaces Any)
- Move time import to module level in cache_metrics
- Defer initial cleanup to background thread for non-blocking startup
- Add graceful shutdown mechanism with threading.Event for cleanup thread
- Add stop_cleanup_thread() method for controlled thread termination

* fix(cache): improve disk cache cleanup initialization and error handling

- Only start cleanup thread when disk caching is enabled (cache_dir is set)
- Remove unused retention policy keys (leaderboard, live_scores, logos)
- Handle FileNotFoundError as benign race condition in cleanup
- Preserve existing OSError handling for actual file system errors

---------

Co-authored-by: Chuck <chuck@example.com>
This commit is contained in: Chuck (2026-01-19 15:57:19 -05:00), committed by GitHub
parent bc23b7c75c
commit 2381ead03f
3 changed files with 331 additions and 42 deletions

View File

@@ -5,6 +5,7 @@ Tracks cache performance metrics including hit rates, miss rates, and fetch time
"""
import threading
import time
import logging
from typing import Dict, Any, Optional
@@ -28,7 +29,12 @@ class CacheMetrics:
'background_hits': 0,
'background_misses': 0,
'total_fetch_time': 0.0,
'fetch_count': 0
'fetch_count': 0,
# Disk cleanup metrics
'last_disk_cleanup': 0.0,
'total_files_cleaned': 0,
'total_space_freed_mb': 0.0,
'last_cleanup_duration_sec': 0.0
}
def record_hit(self, cache_type: str = 'regular') -> None:
@@ -69,6 +75,21 @@ class CacheMetrics:
self._metrics['total_fetch_time'] += duration
self._metrics['fetch_count'] += 1
def record_disk_cleanup(self, files_cleaned: int, space_freed_mb: float, duration_sec: float) -> None:
    """
    Record the outcome of one disk cleanup pass in the metrics store.

    Args:
        files_cleaned: Number of files deleted during the pass.
        space_freed_mb: Megabytes reclaimed by the pass.
        duration_sec: Wall-clock duration of the pass in seconds.
    """
    with self._lock:
        metrics = self._metrics
        metrics['last_disk_cleanup'] = time.time()
        metrics['total_files_cleaned'] = metrics['total_files_cleaned'] + files_cleaned
        metrics['total_space_freed_mb'] = metrics['total_space_freed_mb'] + space_freed_mb
        metrics['last_cleanup_duration_sec'] = duration_sec
def get_metrics(self) -> Dict[str, Any]:
"""
Get current cache performance metrics.
@@ -93,7 +114,12 @@ class CacheMetrics:
'api_calls_saved': self._metrics['api_calls_saved'],
'average_fetch_time': avg_fetch_time,
'total_fetch_time': self._metrics['total_fetch_time'],
'fetch_count': self._metrics['fetch_count']
'fetch_count': self._metrics['fetch_count'],
# Disk cleanup metrics
'last_disk_cleanup': self._metrics['last_disk_cleanup'],
'total_files_cleaned': self._metrics['total_files_cleaned'],
'total_space_freed_mb': self._metrics['total_space_freed_mb'],
'last_cleanup_duration_sec': self._metrics['last_cleanup_duration_sec']
}
def log_metrics(self) -> None:

View File

@@ -10,12 +10,28 @@ import time
import tempfile
import logging
import threading
from typing import Dict, Any, Optional
from typing import Dict, Any, Optional, Protocol
from datetime import datetime
from src.exceptions import CacheError
class CacheStrategyProtocol(Protocol):
    """Structural type for strategy objects that map cache keys to data types."""

    def get_data_type_from_key(self, key: str) -> str:
        """
        Map a cache key to the name of its data type.

        Args:
            key: Cache key to categorize.

        Returns:
            The data-type string used for retention/strategy lookups.
        """
        ...
class DateTimeEncoder(json.JSONEncoder):
"""JSON encoder that handles datetime objects."""
def default(self, obj: Any) -> Any:
@@ -269,4 +285,116 @@ class DiskCache:
def get_cache_dir(self) -> Optional[str]:
    """Return the configured cache directory path, or None when disk caching is disabled."""
    directory = self.cache_dir
    return directory
def cleanup_expired_files(self, cache_strategy: 'CacheStrategyProtocol', retention_policies: Dict[str, int]) -> Dict[str, Any]:
    """
    Clean up expired cache files based on per-data-type retention policies.

    Each ``*.json`` file in the cache directory is categorized via
    ``cache_strategy.get_data_type_from_key`` and deleted when its mtime age
    exceeds the retention period (in days) for that data type.

    Args:
        cache_strategy: Object implementing CacheStrategyProtocol, used to map
            cache keys to data-type names.
        retention_policies: Mapping of data type -> retention period in days.
            The 'default' entry (falling back to 30) covers unknown types.

    Returns:
        Dictionary with cleanup statistics:
            - files_scanned: Total files checked
            - files_deleted: Files removed
            - space_freed_bytes: Bytes freed
            - errors: Number of errors encountered
    """
    stats = {
        'files_scanned': 0,
        'files_deleted': 0,
        'space_freed_bytes': 0,
        'errors': 0
    }

    if not self.cache_dir or not os.path.exists(self.cache_dir):
        self.logger.warning("Cache directory not available for cleanup")
        return stats

    current_time = time.time()

    # Take a snapshot of candidate files while briefly holding the lock so
    # concurrent get/set operations are not blocked for the whole cleanup.
    # (Previously a second, unreachable OSError handler duplicated this one.)
    try:
        with self._lock:
            filenames = [f for f in os.listdir(self.cache_dir) if f.endswith('.json')]
    except OSError as list_error:
        self.logger.error("Error listing cache directory %s: %s", self.cache_dir, list_error, exc_info=True)
        stats['errors'] += 1
        return stats

    # Process files outside the lock to avoid blocking get/set operations.
    for filename in filenames:
        stats['files_scanned'] += 1
        file_path = os.path.join(self.cache_dir, filename)
        try:
            # stat outside the lock; a vanished file is a benign race handled below.
            file_age_days = (current_time - os.path.getmtime(file_path)) / 86400
            cache_key = filename[:-5]  # strip the '.json' suffix
            data_type = cache_strategy.get_data_type_from_key(cache_key)
            retention_days = retention_policies.get(data_type, retention_policies.get('default', 30))

            # Guard clause: file is still within its retention window.
            if file_age_days <= retention_days:
                continue

            # Hold the lock only for the size+remove pair so the two stay
            # atomic with respect to other threads of this process. No
            # exists() pre-check: the FileNotFoundError handler already
            # covers the cross-process race (pure EAFP, no TOCTOU window).
            try:
                with self._lock:
                    file_size = os.path.getsize(file_path)
                    os.remove(file_path)
                stats['files_deleted'] += 1
                stats['space_freed_bytes'] += file_size
                self.logger.debug(
                    "Deleted expired cache file: %s (age: %.1f days, type: %s, retention: %d days)",
                    filename, file_age_days, data_type, retention_days
                )
            except FileNotFoundError:
                # Deleted by another process first — benign race, skip.
                continue
            except OSError as e:
                # Other file system errors: log but don't fail the cleanup.
                stats['errors'] += 1
                self.logger.warning("Error deleting cache file %s: %s", filename, e)
                continue
        except FileNotFoundError:
            # File disappeared between listing and processing — benign race.
            continue
        except OSError as e:
            stats['errors'] += 1
            self.logger.warning("Error processing cache file %s: %s", filename, e)
            continue
        except Exception as e:
            stats['errors'] += 1
            self.logger.error("Unexpected error processing cache file %s: %s", filename, e, exc_info=True)
            continue

    return stats