"""File caching utilities for data sources.

Provides disk-based caching for downloaded files with configurable TTL.
Used by NISRA, PSNI, and other data source modules to avoid repeated
downloads of the same resources.

Cache Location:
    Files are cached in ``~/.cache/bolster/<namespace>/`` with filenames
    based on URL hashes. Each data source uses its own namespace.

Example:
    >>> from bolster.utils.cache import CachedDownloader, hash_url
    >>> hash_url("https://example.com/data.csv")
    '2a01ab0de708440185cbb6473893860c'
    >>> downloader = CachedDownloader("my_source")
    >>> downloader.namespace
    'my_source'
"""
import hashlib
import logging
from datetime import datetime
from pathlib import Path
from .web import session as web_session
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Base cache directory. Each cache namespace gets its own subdirectory
# below ``~/.cache/bolster``.
CACHE_BASE = Path.home() / ".cache" / "bolster"
class CacheError(Exception):
    """Base exception for cache operations.

    Parent of the more specific cache exceptions defined in this module,
    such as ``DownloadError``.
    """
class DownloadError(CacheError):
    """Raised when a file download fails.

    Subclasses ``CacheError`` so callers can catch either the specific
    download failure or the general cache error.
    """
def hash_url(url: str) -> str:
    """Generate a cache-safe filename from a URL using MD5 hash.

    The digest is only used as a stable, filesystem-friendly key; it is not
    a security measure.

    Args:
        url: The URL to hash

    Returns:
        32-character hexadecimal MD5 hash string

    Example:
        >>> hash_url("https://example.com/data.csv")
        '2a01ab0de708440185cbb6473893860c'
    """
    # ``usedforsecurity=False`` keeps this working on FIPS-restricted Python
    # builds, where plain ``hashlib.md5()`` is refused. The digest itself is
    # unchanged, so existing cache filenames stay valid.
    return hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
class CachedDownloader:
    """Disk-based file cache with TTL support.

    Provides download-with-cache functionality for data source modules.
    Each instance uses a namespace subdirectory of ``CACHE_BASE`` for
    isolation.

    Cached files are named ``<hash_url(url)><suffix>``, where the suffix is
    taken from the URL (``.bin`` when it has none). Freshness is judged from
    the file's modification time.

    Args:
        namespace: Subdirectory name for this cache (e.g., "nisra", "psni")
        timeout: Request timeout in seconds (default: 60)

    Example:
        >>> downloader = CachedDownloader("psni", timeout=60)
        >>> downloader.namespace
        'psni'
        >>> downloader.timeout
        60
        >>> downloader.cache_dir.parts[-2:]
        ('bolster', 'psni')
    """

    def __init__(self, namespace: str, timeout: int = 60):
        """Initialize CachedDownloader with namespace and timeout.

        Creates the namespace directory (and any missing parents) if it does
        not exist yet.

        Args:
            namespace: Cache namespace for organizing files
            timeout: Timeout for HTTP requests in seconds
        """
        # Namespace label; also reported in the summary log line of ``clear``.
        self.namespace = namespace
        # Must be stored: ``download`` passes ``self.timeout`` to the HTTP
        # session.
        self.timeout = timeout
        # Directory holding every cached file of this namespace.
        self.cache_dir = CACHE_BASE / namespace
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _cache_path(self, url: str) -> Path:
        """Return the on-disk location used to cache ``url``."""
        # NOTE(review): the suffix comes from the raw URL string, so a query
        # string after the last dot becomes part of the filename. Kept as-is
        # so that existing cache entries remain valid.
        suffix = Path(url).suffix or ".bin"
        return self.cache_dir / f"{hash_url(url)}{suffix}"

    def get_cached_file(self, url: str, cache_ttl_hours: int = 24) -> Path | None:
        """Return cached file if it exists and is fresh, else None.

        Args:
            url: URL of the file (used to generate cache filename)
            cache_ttl_hours: Maximum age in hours before cache is stale

        Returns:
            Path to cached file if valid and fresh, None otherwise
        """
        cache_path = self._cache_path(url)
        try:
            # A single stat() call avoids the race between a separate
            # existence check and the subsequent read of the mtime.
            modified = cache_path.stat().st_mtime
        except OSError:
            # Missing or unreadable entry: treat it as a cache miss.
            return None

        # Compare POSIX timestamps instead of naive local datetimes so that a
        # DST change between writing and reading cannot skew the age.
        age_seconds = datetime.now().timestamp() - modified
        if age_seconds < cache_ttl_hours * 3600:
            logger.info("Using cached file: %s", cache_path)
            return cache_path
        return None

    def download(
        self,
        url: str,
        cache_ttl_hours: int = 24,
        force_refresh: bool = False,
        headers: dict | None = None,
    ) -> Path:
        """Download a file with caching support.

        Downloads a file from the given URL and caches it locally. If a valid
        cached version exists, returns that instead.

        Args:
            url: URL to download
            cache_ttl_hours: Cache validity in hours (default: 24)
            force_refresh: If True, bypass cache and re-download
            headers: Optional extra HTTP headers to include in the request
                (e.g. ``{"Referer": "...", "User-Agent": "..."}``)

        Returns:
            Path to the downloaded (or cached) file

        Raises:
            DownloadError: If download fails due to network or HTTP errors,
                or if the response cannot be written to the cache
        """
        # Check cache first
        if not force_refresh:
            cached = self.get_cached_file(url, cache_ttl_hours)
            if cached is not None:
                return cached

        cache_path = self._cache_path(url)
        # Write to a sibling file and rename it into place afterwards, so an
        # interrupted write never leaves a truncated file that would later be
        # served as a fresh cache hit.
        partial_path = cache_path.with_name(cache_path.name + ".part")
        try:
            logger.info("Downloading %s", url)
            # Use shared session with retry logic for resilient downloads
            response = web_session.get(url, timeout=self.timeout, headers=headers)
            response.raise_for_status()
            payload = response.content
            partial_path.write_bytes(payload)
            partial_path.replace(cache_path)
        except Exception as e:
            try:
                partial_path.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the original failure is reported
            raise DownloadError(f"Failed to download {url}: {e}") from e

        size_mb = len(payload) / 1024 / 1024
        logger.info("Saved to %s (%.1f MB)", cache_path, size_mb)
        return cache_path

    def clear(self, pattern: str | None = None) -> int:
        """Clear cached files.

        Only regular files directly inside the namespace directory are
        removed; subdirectories are left alone.

        Args:
            pattern: Optional glob pattern (e.g., ``*.csv``). If None, clears all.

        Returns:
            Number of files deleted
        """
        # Materialise the listing before deleting so that entries are not
        # removed while the directory is still being scanned.
        candidates = list(self.cache_dir.glob(pattern or "*"))
        deleted = 0
        for entry in candidates:
            if not entry.is_file():
                continue
            try:
                entry.unlink()
            except FileNotFoundError:
                # Already removed since the listing was taken; not counted.
                continue
            deleted += 1
            logger.info("Deleted %s", entry)
        logger.info("Cleared %d cached files from %s", deleted, self.namespace)
        return deleted