Source code for bolster.data_sources.dva

"""DVA (Driver & Vehicle Agency) Monthly Tests Statistics Module.

This module provides access to Northern Ireland's Driver & Vehicle Agency monthly
test statistics, including vehicle tests, driver tests, and theory tests.

Data is published monthly by the Department for Infrastructure (DfI) Northern Ireland.

Data Coverage:
    - Vehicle Tests (Full & Retests): April 2014 - Present
    - Driver Tests: April 2014 - Present
    - Theory Tests: April 2014 - Present
    - Test breakdowns by category and test centre

Data Source: Department for Infrastructure Northern Ireland provides Driver & Vehicle Agency
statistics through their publications portal at https://www.infrastructure-ni.gov.uk/publications?f%5B0%5D=type%3Astatisticalreports.
The DVA publishes monthly test statistics covering vehicle tests, driver tests, and theory tests
conducted across Northern Ireland, providing comprehensive data on driving and vehicle testing performance.

Update Frequency: Monthly publications are released covering the previous month's test statistics.
DVA data is published by the Department for Infrastructure Analytics Branch approximately 4-6 weeks
after the reference month ends, providing consistent monthly updates on driving test performance
and vehicle testing statistics across Northern Ireland.

Publication Details:
    - Published by: Department for Infrastructure (DfI) - Analytics Branch
    - Data Source: DVA Business & Regulatory Statistics

Example:
    >>> from bolster.data_sources import dva
    >>> # Get latest vehicle test statistics
    >>> df = dva.get_latest_vehicle_tests()
    >>> 'tests_conducted' in df.columns
    True

    >>> # Get latest driver test statistics
    >>> df = dva.get_latest_driver_tests()
    >>> len(df) > 0
    True

    >>> # Get latest theory test statistics
    >>> df = dva.get_latest_theory_tests()
    >>> len(df) > 0
    True

    >>> # Get all test types combined
    >>> data = dva.get_latest_all_tests()
    >>> sorted(data.keys())
    ['driver', 'theory', 'vehicle']
"""

import contextlib
import logging
import re
from datetime import datetime
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session


[docs]
logger = logging.getLogger(__name__)


# Cache directory

[docs]
CACHE_DIR = Path.home() / ".cache" / "bolster" / "dva"

CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Base URL for DVA statistics publications

[docs]
DVA_PUBLICATIONS_URL = "https://www.infrastructure-ni.gov.uk/publications/type/statistics"


[docs]
DVA_SEARCH_TERM = "driver-and-vehicle-agency-monthly-tests-conducted"




[docs]
class DVADataError(Exception):
    """Base exception for DVA data errors."""

    pass




[docs]
class DVADataNotFoundError(DVADataError):
    """Data file not available."""

    pass



def _hash_url(url: str) -> str:
    """Generate a safe filename from a URL."""
    import hashlib

    return hashlib.md5(url.encode()).hexdigest()


def _download_file(url: str, cache_ttl_hours: int = 24, force_refresh: bool = False) -> Path:
    """Download a file with caching support.

    Args:
        url: URL to download
        cache_ttl_hours: Cache validity in hours (default: 24)
        force_refresh: Force re-download even if cached

    Returns:
        Path to downloaded file

    Raises:
        DVADataNotFoundError: If download fails
    """
    url_hash = _hash_url(url)
    ext = Path(url).suffix or ".xlsx"
    cache_path = CACHE_DIR / f"{url_hash}{ext}"

    # Check cache
    if not force_refresh and cache_path.exists():
        age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
        if age.total_seconds() < cache_ttl_hours * 3600:
            logger.info(f"Using cached file: {cache_path}")
            return cache_path

    # Download
    try:
        logger.info(f"Downloading {url}")
        response = session.get(url, timeout=60)
        response.raise_for_status()
        cache_path.write_bytes(response.content)
        logger.info(f"Saved to {cache_path}")
        return cache_path
    except Exception as e:
        raise DVADataNotFoundError(f"Failed to download {url}: {e}") from e



[docs]
def get_latest_dva_publication_url() -> tuple[str, str, datetime]:
    """Get the URL of the latest DVA Monthly Tests publication.

    Attempts to find the most recent DVA monthly tests statistics publication
    by trying recent months in reverse order.

    Returns:
        Tuple of (excel_url, publication_title, publication_date)

    Raises:
        DVADataNotFoundError: If unable to find any recent publication

    Example:
        >>> url, title, pub_date = get_latest_dva_publication_url()
        >>> url.startswith('https://')
        True
    """
    from dateutil.relativedelta import relativedelta

    # Try the last 6 months to find the most recent publication
    current_date = datetime.now()
    months_to_try = []

    for i in range(6):
        check_date = current_date - relativedelta(months=i)
        months_to_try.append(check_date)

    month_names = [
        "january",
        "february",
        "march",
        "april",
        "may",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
    ]

    for check_date in months_to_try:
        month_name = month_names[check_date.month - 1]
        year = check_date.year

        # Construct the publication URL
        pub_url = (
            f"https://www.infrastructure-ni.gov.uk/publications/"
            f"driver-and-vehicle-agency-monthly-tests-conducted-statistics-{month_name}-{year}"
        )

        logger.info(f"Trying publication URL: {pub_url}")

        try:
            response = session.get(pub_url, timeout=30)
            if response.status_code == 200:
                logger.info(f"Found publication for {month_name.title()} {year}")

                soup = BeautifulSoup(response.content, "html.parser")

                # Find Excel file link
                excel_url = None
                for file_link in soup.find_all("a", href=True):
                    href = file_link["href"]
                    if ".xlsx" in href.lower() and "tables" in href.lower():
                        if not href.startswith("http"):
                            href = f"https://www.infrastructure-ni.gov.uk{href}"
                        excel_url = href
                        break

                if excel_url:
                    pub_title = f"DVA Monthly Tests Conducted Statistics - {month_name.title()} {year}"
                    pub_date = datetime(year, check_date.month, 1)
                    logger.info(f"Found DVA Excel file: {excel_url}")
                    return excel_url, pub_title, pub_date

        except Exception as e:
            logger.debug(f"Failed to fetch {pub_url}: {e}")
            continue

    raise DVADataNotFoundError("Could not find any DVA monthly tests publications in the last 6 months")



def _parse_month_year(date_str: str) -> datetime | None:
    """Parse a 'YYYY Month' string into a datetime.

    Args:
        date_str: String like '2024 December' or '2025 January'

    Returns:
        datetime object or None if parsing fails
    """
    if not date_str or not isinstance(date_str, str):
        return None

    date_str = date_str.strip()

    # Pattern: "YYYY Month"
    match = re.match(
        r"(\d{4})\s+(January|February|March|April|May|June|July|August|September|October|November|December)",
        date_str,
        re.IGNORECASE,
    )
    if match:
        year, month_name = match.groups()
        month_map = {
            "january": 1,
            "february": 2,
            "march": 3,
            "april": 4,
            "may": 5,
            "june": 6,
            "july": 7,
            "august": 8,
            "september": 9,
            "october": 10,
            "november": 11,
            "december": 12,
        }
        return datetime(int(year), month_map[month_name.lower()], 1)

    return None



[docs]
def parse_vehicle_tests(file_path: str | Path) -> pd.DataFrame:
    """Parse DVA vehicle tests data from Excel file.

    Extracts full vehicle tests conducted from Table 1.1a.

    Args:
        file_path: Path to the DVA Excel file

    Returns:
        DataFrame with columns:
            - date: datetime (first day of month)
            - year: int
            - month: str (month name)
            - tests_conducted: int (full tests conducted)
            - rolling_12_month_total: int (optional, rolling 12-month sum)

    Example:
        >>> url, _, _ = get_latest_dva_publication_url()
        >>> path = _download_file(url)
        >>> df = parse_vehicle_tests(path)
        >>> 'tests_conducted' in df.columns
        True
        >>> len(df) > 0
        True
    """
    from openpyxl import load_workbook

    logger.info(f"Parsing vehicle tests from: {file_path}")

    wb = load_workbook(file_path, data_only=True)

    # Find the vehicle tests sheet
    sheet_name = None
    for name in wb.sheetnames:
        if "1_1a" in name and "Veh" in name:
            sheet_name = name
            break

    if not sheet_name:
        raise DVADataError("Could not find vehicle tests sheet (1_1a)")

    sheet = wb[sheet_name]

    # Parse data rows (skip header rows 1-3)
    records = []
    for row in sheet.iter_rows(min_row=4, values_only=True):
        date_str = row[0]
        tests = row[1]
        rolling_total = row[3] if len(row) > 3 else None

        if not date_str or not tests:
            continue

        date = _parse_month_year(str(date_str))
        if not date:
            continue

        try:
            tests_int = int(tests)
        except (ValueError, TypeError):
            continue

        record = {
            "date": date,
            "year": date.year,
            "month": date.strftime("%B"),
            "tests_conducted": tests_int,
        }

        if rolling_total:
            with contextlib.suppress(ValueError, TypeError):
                record["rolling_12_month_total"] = int(rolling_total)

        records.append(record)

    df = pd.DataFrame(records)

    if df.empty:
        raise DVADataError("No vehicle test data found in file")

    logger.info(f"Parsed {len(df)} months of vehicle test data ({df['year'].min()}-{df['year'].max()})")
    return df




[docs]
def parse_driver_tests(file_path: str | Path) -> pd.DataFrame:
    """Parse DVA driver tests data from Excel file.

    Extracts driver tests conducted from Table 2.1.

    Args:
        file_path: Path to the DVA Excel file

    Returns:
        DataFrame with columns:
            - date: datetime (first day of month)
            - year: int
            - month: str (month name)
            - tests_conducted: int (driver tests conducted)
            - rolling_12_month_total: int (optional, rolling 12-month sum)

    Example:
        >>> url, _, _ = get_latest_dva_publication_url()
        >>> path = _download_file(url)
        >>> df = parse_driver_tests(path)
        >>> 'tests_conducted' in df.columns
        True
        >>> len(df) > 0
        True
    """
    from openpyxl import load_workbook

    logger.info(f"Parsing driver tests from: {file_path}")

    wb = load_workbook(file_path, data_only=True)

    # Find the driver tests sheet
    sheet_name = None
    for name in wb.sheetnames:
        if "2_1" in name and "Driver" in name:
            sheet_name = name
            break

    if not sheet_name:
        raise DVADataError("Could not find driver tests sheet (2_1)")

    sheet = wb[sheet_name]

    # Parse data rows (skip header rows 1-3)
    records = []
    for row in sheet.iter_rows(min_row=4, values_only=True):
        date_str = row[0]
        tests = row[1]
        rolling_total = row[3] if len(row) > 3 else None

        if not date_str or not tests:
            continue

        date = _parse_month_year(str(date_str))
        if not date:
            continue

        try:
            tests_int = int(tests)
        except (ValueError, TypeError):
            continue

        record = {
            "date": date,
            "year": date.year,
            "month": date.strftime("%B"),
            "tests_conducted": tests_int,
        }

        if rolling_total:
            with contextlib.suppress(ValueError, TypeError):
                record["rolling_12_month_total"] = int(rolling_total)

        records.append(record)

    df = pd.DataFrame(records)

    if df.empty:
        raise DVADataError("No driver test data found in file")

    logger.info(f"Parsed {len(df)} months of driver test data ({df['year'].min()}-{df['year'].max()})")
    return df




[docs]
def parse_theory_tests(file_path: str | Path) -> pd.DataFrame:
    """Parse DVA theory tests data from Excel file.

    Extracts theory tests conducted from Table 3.1.

    Args:
        file_path: Path to the DVA Excel file

    Returns:
        DataFrame with columns:
            - date: datetime (first day of month)
            - year: int
            - month: str (month name)
            - tests_conducted: int (theory tests conducted)
            - rolling_12_month_total: int (optional, rolling 12-month sum)

    Example:
        >>> url, _, _ = get_latest_dva_publication_url()
        >>> path = _download_file(url)
        >>> df = parse_theory_tests(path)
        >>> 'tests_conducted' in df.columns
        True
        >>> len(df) > 0
        True
    """
    from openpyxl import load_workbook

    logger.info(f"Parsing theory tests from: {file_path}")

    wb = load_workbook(file_path, data_only=True)

    # Find the theory tests sheet
    sheet_name = None
    for name in wb.sheetnames:
        if "3_1" in name and "Theory" in name:
            sheet_name = name
            break

    if not sheet_name:
        raise DVADataError("Could not find theory tests sheet (3_1)")

    sheet = wb[sheet_name]

    # Parse data rows (skip header rows 1-3)
    records = []
    for row in sheet.iter_rows(min_row=4, values_only=True):
        date_str = row[0]
        tests = row[1]
        rolling_total = row[3] if len(row) > 3 else None

        if not date_str or not tests:
            continue

        date = _parse_month_year(str(date_str))
        if not date:
            continue

        try:
            tests_int = int(tests)
        except (ValueError, TypeError):
            continue

        record = {
            "date": date,
            "year": date.year,
            "month": date.strftime("%B"),
            "tests_conducted": tests_int,
        }

        if rolling_total:
            with contextlib.suppress(ValueError, TypeError):
                record["rolling_12_month_total"] = int(rolling_total)

        records.append(record)

    df = pd.DataFrame(records)

    if df.empty:
        raise DVADataError("No theory test data found in file")

    logger.info(f"Parsed {len(df)} months of theory test data ({df['year'].min()}-{df['year'].max()})")
    return df



# ============================================================================
# Main API Functions
# ============================================================================



[docs]
def get_latest_vehicle_tests(force_refresh: bool = False) -> pd.DataFrame:
    """Get the latest vehicle test statistics.

    Downloads and parses the most recent DVA monthly tests publication.
    Results are cached for 7 days unless force_refresh=True.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame with monthly vehicle test data

    Example:
        >>> df = get_latest_vehicle_tests()
        >>> 'tests_conducted' in df.columns
        True
    """
    excel_url, _, _ = get_latest_dva_publication_url()
    file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
    return parse_vehicle_tests(file_path)




[docs]
def get_latest_driver_tests(force_refresh: bool = False) -> pd.DataFrame:
    """Get the latest driver test statistics.

    Downloads and parses the most recent DVA monthly tests publication.
    Results are cached for 7 days unless force_refresh=True.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame with monthly driver test data

    Example:
        >>> df = get_latest_driver_tests()
        >>> len(df) > 0
        True
    """
    excel_url, _, _ = get_latest_dva_publication_url()
    file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
    return parse_driver_tests(file_path)




[docs]
def get_latest_theory_tests(force_refresh: bool = False) -> pd.DataFrame:
    """Get the latest theory test statistics.

    Downloads and parses the most recent DVA monthly tests publication.
    Results are cached for 7 days unless force_refresh=True.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame with monthly theory test data

    Example:
        >>> df = get_latest_theory_tests()
        >>> len(df) > 0
        True
    """
    excel_url, _, _ = get_latest_dva_publication_url()
    file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
    return parse_theory_tests(file_path)




[docs]
def get_latest_all_tests(force_refresh: bool = False) -> dict[str, pd.DataFrame]:
    """Get all test types (vehicle, driver, theory) from the latest publication.

    Downloads the file once and parses all three test types.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        Dictionary with keys 'vehicle', 'driver', 'theory' containing DataFrames

    Example:
        >>> data = get_latest_all_tests()
        >>> sorted(data.keys())
        ['driver', 'theory', 'vehicle']
    """
    excel_url, _, _ = get_latest_dva_publication_url()
    file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)

    return {
        "vehicle": parse_vehicle_tests(file_path),
        "driver": parse_driver_tests(file_path),
        "theory": parse_theory_tests(file_path),
    }



# ============================================================================
# Helper Functions for Analysis
# ============================================================================



[docs]
def get_tests_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Filter test data for a specific year.

    Args:
        df: Test statistics DataFrame
        year: Year to filter for

    Returns:
        DataFrame with only the specified year's data

    Example:
        >>> df = get_latest_vehicle_tests()
        >>> df_2024 = get_tests_by_year(df, 2024)
        >>> 'tests_conducted' in df_2024.columns
        True
    """
    return df[df["year"] == year].reset_index(drop=True)




[docs]
def get_tests_by_month(df: pd.DataFrame, month: str, year: int) -> pd.DataFrame:
    """Get test data for a specific month and year.

    Args:
        df: Test statistics DataFrame
        month: Month name (e.g., 'January', 'December')
        year: Year

    Returns:
        DataFrame with single row for the specified month

    Example:
        >>> df = get_latest_vehicle_tests()
        >>> dec_2025 = get_tests_by_month(df, 'December', 2025)
        >>> 'tests_conducted' in dec_2025.columns
        True
    """
    return df[(df["month"] == month) & (df["year"] == year)].reset_index(drop=True)




[docs]
def calculate_growth_rates(df: pd.DataFrame, periods: int = 12) -> pd.DataFrame:
    """Calculate year-on-year growth rates for test statistics.

    Args:
        df: Test statistics DataFrame
        periods: Number of months for comparison (default: 12 for YoY)

    Returns:
        DataFrame with additional column:
            - yoy_growth: Percentage change vs same month previous year

    Example:
        >>> df = get_latest_vehicle_tests()
        >>> df_growth = calculate_growth_rates(df)
        >>> 'yoy_growth' in df_growth.columns
        True
    """
    result = df.copy()
    result["yoy_growth"] = result["tests_conducted"].pct_change(periods=periods) * 100
    return result




[docs]
def get_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict:
    """Calculate summary statistics for test data.

    Args:
        df: Test statistics DataFrame
        start_year: Optional start year for summary
        end_year: Optional end year for summary

    Returns:
        Dictionary with summary statistics:
            - period: Time period covered
            - total_tests: Total tests in period
            - monthly_mean: Average monthly tests
            - monthly_min: Minimum monthly tests
            - monthly_max: Maximum monthly tests
            - months_count: Number of months included

    Example:
        >>> df = get_latest_vehicle_tests()
        >>> stats = get_summary_statistics(df, start_year=2020)
        >>> sorted(stats.keys())
        ['monthly_max', 'monthly_mean', 'monthly_min', 'months_count', 'period', 'total_tests']
    """
    filtered = df.copy()

    if start_year:
        filtered = filtered[filtered["year"] >= start_year]
    if end_year:
        filtered = filtered[filtered["year"] <= end_year]

    return {
        "period": f"{filtered['year'].min()}-{filtered['year'].max()}",
        "total_tests": int(filtered["tests_conducted"].sum()),
        "monthly_mean": float(filtered["tests_conducted"].mean()),
        "monthly_min": int(filtered["tests_conducted"].min()),
        "monthly_max": int(filtered["tests_conducted"].max()),
        "months_count": len(filtered),
    }




[docs]
def validate_dva_test_data(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Validate DVA test statistics data integrity.

    Args:
        df: DataFrame from DVA test functions (vehicle, driver, or theory tests)

    Returns:
        True if validation passes, False otherwise
    """
    if df.empty:
        logger.warning("DVA test data is empty")
        return False

    required_cols = {"month", "tests_conducted"}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        logger.warning(f"Missing required DVA columns: {missing}")
        return False

    # Check for non-negative test counts
    if (df["tests_conducted"] < 0).any():
        logger.warning("Found negative test counts in DVA data")
        return False

    # Check for reasonable monthly test volumes
    # Vehicle tests typically range from 40,000 to 100,000 per month in NI
    if df["tests_conducted"].max() > 200000:  # Allow for variation but catch obvious errors
        logger.warning("Unreasonably high test counts found")
        return False

    return True