"""DVA (Driver & Vehicle Agency) Monthly Tests Statistics Module.
This module provides access to Northern Ireland's Driver & Vehicle Agency monthly
test statistics, including vehicle tests, driver tests, and theory tests.
Data is published monthly by the Department for Infrastructure (DfI) Northern Ireland.
Data Coverage:
- Vehicle Tests (Full & Retests): April 2014 - Present
- Driver Tests: April 2014 - Present
- Theory Tests: April 2014 - Present
- Test breakdowns by category and test centre
Data Source: Department for Infrastructure Northern Ireland provides Driver & Vehicle Agency
statistics through their publications portal at https://www.infrastructure-ni.gov.uk/publications?f%5B0%5D=type%3Astatisticalreports.
The DVA publishes monthly test statistics covering vehicle tests, driver tests, and theory tests
conducted across Northern Ireland, providing comprehensive data on driving and vehicle testing performance.
Update Frequency: Monthly publications are released covering the previous month's test statistics.
DVA data is published by the Department for Infrastructure Analytics Branch approximately 4-6 weeks
after the reference month ends, providing consistent monthly updates on driving test performance
and vehicle testing statistics across Northern Ireland.
Publication Details:
- Published by: Department for Infrastructure (DfI) - Analytics Branch
- Data Source: DVA Business & Regulatory Statistics
Example:
>>> from bolster.data_sources import dva
>>> # Get latest vehicle test statistics
>>> df = dva.get_latest_vehicle_tests()
>>> 'tests_conducted' in df.columns
True
>>> # Get latest driver test statistics
>>> df = dva.get_latest_driver_tests()
>>> len(df) > 0
True
>>> # Get latest theory test statistics
>>> df = dva.get_latest_theory_tests()
>>> len(df) > 0
True
>>> # Get all test types combined
>>> data = dva.get_latest_all_tests()
>>> sorted(data.keys())
['driver', 'theory', 'vehicle']
"""
import contextlib
import logging
import re
from datetime import datetime
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from bolster.utils.web import session
[docs]
logger = logging.getLogger(__name__)
# Cache directory
[docs]
CACHE_DIR = Path.home() / ".cache" / "bolster" / "dva"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Base URL for DVA statistics publications
[docs]
DVA_PUBLICATIONS_URL = "https://www.infrastructure-ni.gov.uk/publications/type/statistics"
[docs]
DVA_SEARCH_TERM = "driver-and-vehicle-agency-monthly-tests-conducted"
[docs]
class DVADataError(Exception):
"""Base exception for DVA data errors."""
pass
[docs]
class DVADataNotFoundError(DVADataError):
"""Data file not available."""
pass
def _hash_url(url: str) -> str:
"""Generate a safe filename from a URL."""
import hashlib
return hashlib.md5(url.encode()).hexdigest()
def _download_file(url: str, cache_ttl_hours: int = 24, force_refresh: bool = False) -> Path:
"""Download a file with caching support.
Args:
url: URL to download
cache_ttl_hours: Cache validity in hours (default: 24)
force_refresh: Force re-download even if cached
Returns:
Path to downloaded file
Raises:
DVADataNotFoundError: If download fails
"""
url_hash = _hash_url(url)
ext = Path(url).suffix or ".xlsx"
cache_path = CACHE_DIR / f"{url_hash}{ext}"
# Check cache
if not force_refresh and cache_path.exists():
age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
if age.total_seconds() < cache_ttl_hours * 3600:
logger.info(f"Using cached file: {cache_path}")
return cache_path
# Download
try:
logger.info(f"Downloading {url}")
response = session.get(url, timeout=60)
response.raise_for_status()
cache_path.write_bytes(response.content)
logger.info(f"Saved to {cache_path}")
return cache_path
except Exception as e:
raise DVADataNotFoundError(f"Failed to download {url}: {e}") from e
[docs]
def get_latest_dva_publication_url() -> tuple[str, str, datetime]:
"""Get the URL of the latest DVA Monthly Tests publication.
Attempts to find the most recent DVA monthly tests statistics publication
by trying recent months in reverse order.
Returns:
Tuple of (excel_url, publication_title, publication_date)
Raises:
DVADataNotFoundError: If unable to find any recent publication
Example:
>>> url, title, pub_date = get_latest_dva_publication_url()
>>> url.startswith('https://')
True
"""
from dateutil.relativedelta import relativedelta
# Try the last 6 months to find the most recent publication
current_date = datetime.now()
months_to_try = []
for i in range(6):
check_date = current_date - relativedelta(months=i)
months_to_try.append(check_date)
month_names = [
"january",
"february",
"march",
"april",
"may",
"june",
"july",
"august",
"september",
"october",
"november",
"december",
]
for check_date in months_to_try:
month_name = month_names[check_date.month - 1]
year = check_date.year
# Construct the publication URL
pub_url = (
f"https://www.infrastructure-ni.gov.uk/publications/"
f"driver-and-vehicle-agency-monthly-tests-conducted-statistics-{month_name}-{year}"
)
logger.info(f"Trying publication URL: {pub_url}")
try:
response = session.get(pub_url, timeout=30)
if response.status_code == 200:
logger.info(f"Found publication for {month_name.title()} {year}")
soup = BeautifulSoup(response.content, "html.parser")
# Find Excel file link
excel_url = None
for file_link in soup.find_all("a", href=True):
href = file_link["href"]
if ".xlsx" in href.lower() and "tables" in href.lower():
if not href.startswith("http"):
href = f"https://www.infrastructure-ni.gov.uk{href}"
excel_url = href
break
if excel_url:
pub_title = f"DVA Monthly Tests Conducted Statistics - {month_name.title()} {year}"
pub_date = datetime(year, check_date.month, 1)
logger.info(f"Found DVA Excel file: {excel_url}")
return excel_url, pub_title, pub_date
except Exception as e:
logger.debug(f"Failed to fetch {pub_url}: {e}")
continue
raise DVADataNotFoundError("Could not find any DVA monthly tests publications in the last 6 months")
def _parse_month_year(date_str: str) -> datetime | None:
"""Parse a 'YYYY Month' string into a datetime.
Args:
date_str: String like '2024 December' or '2025 January'
Returns:
datetime object or None if parsing fails
"""
if not date_str or not isinstance(date_str, str):
return None
date_str = date_str.strip()
# Pattern: "YYYY Month"
match = re.match(
r"(\d{4})\s+(January|February|March|April|May|June|July|August|September|October|November|December)",
date_str,
re.IGNORECASE,
)
if match:
year, month_name = match.groups()
month_map = {
"january": 1,
"february": 2,
"march": 3,
"april": 4,
"may": 5,
"june": 6,
"july": 7,
"august": 8,
"september": 9,
"october": 10,
"november": 11,
"december": 12,
}
return datetime(int(year), month_map[month_name.lower()], 1)
return None
[docs]
def parse_vehicle_tests(file_path: str | Path) -> pd.DataFrame:
"""Parse DVA vehicle tests data from Excel file.
Extracts full vehicle tests conducted from Table 1.1a.
Args:
file_path: Path to the DVA Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- tests_conducted: int (full tests conducted)
- rolling_12_month_total: int (optional, rolling 12-month sum)
Example:
>>> url, _, _ = get_latest_dva_publication_url()
>>> path = _download_file(url)
>>> df = parse_vehicle_tests(path)
>>> 'tests_conducted' in df.columns
True
>>> len(df) > 0
True
"""
from openpyxl import load_workbook
logger.info(f"Parsing vehicle tests from: {file_path}")
wb = load_workbook(file_path, data_only=True)
# Find the vehicle tests sheet
sheet_name = None
for name in wb.sheetnames:
if "1_1a" in name and "Veh" in name:
sheet_name = name
break
if not sheet_name:
raise DVADataError("Could not find vehicle tests sheet (1_1a)")
sheet = wb[sheet_name]
# Parse data rows (skip header rows 1-3)
records = []
for row in sheet.iter_rows(min_row=4, values_only=True):
date_str = row[0]
tests = row[1]
rolling_total = row[3] if len(row) > 3 else None
if not date_str or not tests:
continue
date = _parse_month_year(str(date_str))
if not date:
continue
try:
tests_int = int(tests)
except (ValueError, TypeError):
continue
record = {
"date": date,
"year": date.year,
"month": date.strftime("%B"),
"tests_conducted": tests_int,
}
if rolling_total:
with contextlib.suppress(ValueError, TypeError):
record["rolling_12_month_total"] = int(rolling_total)
records.append(record)
df = pd.DataFrame(records)
if df.empty:
raise DVADataError("No vehicle test data found in file")
logger.info(f"Parsed {len(df)} months of vehicle test data ({df['year'].min()}-{df['year'].max()})")
return df
[docs]
def parse_driver_tests(file_path: str | Path) -> pd.DataFrame:
"""Parse DVA driver tests data from Excel file.
Extracts driver tests conducted from Table 2.1.
Args:
file_path: Path to the DVA Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- tests_conducted: int (driver tests conducted)
- rolling_12_month_total: int (optional, rolling 12-month sum)
Example:
>>> url, _, _ = get_latest_dva_publication_url()
>>> path = _download_file(url)
>>> df = parse_driver_tests(path)
>>> 'tests_conducted' in df.columns
True
>>> len(df) > 0
True
"""
from openpyxl import load_workbook
logger.info(f"Parsing driver tests from: {file_path}")
wb = load_workbook(file_path, data_only=True)
# Find the driver tests sheet
sheet_name = None
for name in wb.sheetnames:
if "2_1" in name and "Driver" in name:
sheet_name = name
break
if not sheet_name:
raise DVADataError("Could not find driver tests sheet (2_1)")
sheet = wb[sheet_name]
# Parse data rows (skip header rows 1-3)
records = []
for row in sheet.iter_rows(min_row=4, values_only=True):
date_str = row[0]
tests = row[1]
rolling_total = row[3] if len(row) > 3 else None
if not date_str or not tests:
continue
date = _parse_month_year(str(date_str))
if not date:
continue
try:
tests_int = int(tests)
except (ValueError, TypeError):
continue
record = {
"date": date,
"year": date.year,
"month": date.strftime("%B"),
"tests_conducted": tests_int,
}
if rolling_total:
with contextlib.suppress(ValueError, TypeError):
record["rolling_12_month_total"] = int(rolling_total)
records.append(record)
df = pd.DataFrame(records)
if df.empty:
raise DVADataError("No driver test data found in file")
logger.info(f"Parsed {len(df)} months of driver test data ({df['year'].min()}-{df['year'].max()})")
return df
[docs]
def parse_theory_tests(file_path: str | Path) -> pd.DataFrame:
"""Parse DVA theory tests data from Excel file.
Extracts theory tests conducted from Table 3.1.
Args:
file_path: Path to the DVA Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- tests_conducted: int (theory tests conducted)
- rolling_12_month_total: int (optional, rolling 12-month sum)
Example:
>>> url, _, _ = get_latest_dva_publication_url()
>>> path = _download_file(url)
>>> df = parse_theory_tests(path)
>>> 'tests_conducted' in df.columns
True
>>> len(df) > 0
True
"""
from openpyxl import load_workbook
logger.info(f"Parsing theory tests from: {file_path}")
wb = load_workbook(file_path, data_only=True)
# Find the theory tests sheet
sheet_name = None
for name in wb.sheetnames:
if "3_1" in name and "Theory" in name:
sheet_name = name
break
if not sheet_name:
raise DVADataError("Could not find theory tests sheet (3_1)")
sheet = wb[sheet_name]
# Parse data rows (skip header rows 1-3)
records = []
for row in sheet.iter_rows(min_row=4, values_only=True):
date_str = row[0]
tests = row[1]
rolling_total = row[3] if len(row) > 3 else None
if not date_str or not tests:
continue
date = _parse_month_year(str(date_str))
if not date:
continue
try:
tests_int = int(tests)
except (ValueError, TypeError):
continue
record = {
"date": date,
"year": date.year,
"month": date.strftime("%B"),
"tests_conducted": tests_int,
}
if rolling_total:
with contextlib.suppress(ValueError, TypeError):
record["rolling_12_month_total"] = int(rolling_total)
records.append(record)
df = pd.DataFrame(records)
if df.empty:
raise DVADataError("No theory test data found in file")
logger.info(f"Parsed {len(df)} months of theory test data ({df['year'].min()}-{df['year'].max()})")
return df
# ============================================================================
# Main API Functions
# ============================================================================
[docs]
def get_latest_vehicle_tests(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest vehicle test statistics.
Downloads and parses the most recent DVA monthly tests publication.
Results are cached for 7 days unless force_refresh=True.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with monthly vehicle test data
Example:
>>> df = get_latest_vehicle_tests()
>>> 'tests_conducted' in df.columns
True
"""
excel_url, _, _ = get_latest_dva_publication_url()
file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
return parse_vehicle_tests(file_path)
[docs]
def get_latest_driver_tests(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest driver test statistics.
Downloads and parses the most recent DVA monthly tests publication.
Results are cached for 7 days unless force_refresh=True.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with monthly driver test data
Example:
>>> df = get_latest_driver_tests()
>>> len(df) > 0
True
"""
excel_url, _, _ = get_latest_dva_publication_url()
file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
return parse_driver_tests(file_path)
[docs]
def get_latest_theory_tests(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest theory test statistics.
Downloads and parses the most recent DVA monthly tests publication.
Results are cached for 7 days unless force_refresh=True.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with monthly theory test data
Example:
>>> df = get_latest_theory_tests()
>>> len(df) > 0
True
"""
excel_url, _, _ = get_latest_dva_publication_url()
file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
return parse_theory_tests(file_path)
[docs]
def get_latest_all_tests(force_refresh: bool = False) -> dict[str, pd.DataFrame]:
"""Get all test types (vehicle, driver, theory) from the latest publication.
Downloads the file once and parses all three test types.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
Dictionary with keys 'vehicle', 'driver', 'theory' containing DataFrames
Example:
>>> data = get_latest_all_tests()
>>> sorted(data.keys())
['driver', 'theory', 'vehicle']
"""
excel_url, _, _ = get_latest_dva_publication_url()
file_path = _download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)
return {
"vehicle": parse_vehicle_tests(file_path),
"driver": parse_driver_tests(file_path),
"theory": parse_theory_tests(file_path),
}
# ============================================================================
# Helper Functions for Analysis
# ============================================================================
[docs]
def get_tests_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Filter test data for a specific year.
Args:
df: Test statistics DataFrame
year: Year to filter for
Returns:
DataFrame with only the specified year's data
Example:
>>> df = get_latest_vehicle_tests()
>>> df_2024 = get_tests_by_year(df, 2024)
>>> 'tests_conducted' in df_2024.columns
True
"""
return df[df["year"] == year].reset_index(drop=True)
[docs]
def get_tests_by_month(df: pd.DataFrame, month: str, year: int) -> pd.DataFrame:
"""Get test data for a specific month and year.
Args:
df: Test statistics DataFrame
month: Month name (e.g., 'January', 'December')
year: Year
Returns:
DataFrame with single row for the specified month
Example:
>>> df = get_latest_vehicle_tests()
>>> dec_2025 = get_tests_by_month(df, 'December', 2025)
>>> 'tests_conducted' in dec_2025.columns
True
"""
return df[(df["month"] == month) & (df["year"] == year)].reset_index(drop=True)
[docs]
def calculate_growth_rates(df: pd.DataFrame, periods: int = 12) -> pd.DataFrame:
"""Calculate year-on-year growth rates for test statistics.
Args:
df: Test statistics DataFrame
periods: Number of months for comparison (default: 12 for YoY)
Returns:
DataFrame with additional column:
- yoy_growth: Percentage change vs same month previous year
Example:
>>> df = get_latest_vehicle_tests()
>>> df_growth = calculate_growth_rates(df)
>>> 'yoy_growth' in df_growth.columns
True
"""
result = df.copy()
result["yoy_growth"] = result["tests_conducted"].pct_change(periods=periods) * 100
return result
[docs]
def get_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict:
"""Calculate summary statistics for test data.
Args:
df: Test statistics DataFrame
start_year: Optional start year for summary
end_year: Optional end year for summary
Returns:
Dictionary with summary statistics:
- period: Time period covered
- total_tests: Total tests in period
- monthly_mean: Average monthly tests
- monthly_min: Minimum monthly tests
- monthly_max: Maximum monthly tests
- months_count: Number of months included
Example:
>>> df = get_latest_vehicle_tests()
>>> stats = get_summary_statistics(df, start_year=2020)
>>> sorted(stats.keys())
['monthly_max', 'monthly_mean', 'monthly_min', 'months_count', 'period', 'total_tests']
"""
filtered = df.copy()
if start_year:
filtered = filtered[filtered["year"] >= start_year]
if end_year:
filtered = filtered[filtered["year"] <= end_year]
return {
"period": f"{filtered['year'].min()}-{filtered['year'].max()}",
"total_tests": int(filtered["tests_conducted"].sum()),
"monthly_mean": float(filtered["tests_conducted"].mean()),
"monthly_min": int(filtered["tests_conducted"].min()),
"monthly_max": int(filtered["tests_conducted"].max()),
"months_count": len(filtered),
}
[docs]
def validate_dva_test_data(df: pd.DataFrame) -> bool: # pragma: no cover
"""Validate DVA test statistics data integrity.
Args:
df: DataFrame from DVA test functions (vehicle, driver, or theory tests)
Returns:
True if validation passes, False otherwise
"""
if df.empty:
logger.warning("DVA test data is empty")
return False
required_cols = {"month", "tests_conducted"}
if not required_cols.issubset(df.columns):
missing = required_cols - set(df.columns)
logger.warning(f"Missing required DVA columns: {missing}")
return False
# Check for non-negative test counts
if (df["tests_conducted"] < 0).any():
logger.warning("Found negative test counts in DVA data")
return False
# Check for reasonable monthly test volumes
# Vehicle tests typically range from 40,000 to 100,000 per month in NI
if df["tests_conducted"].max() > 200000: # Allow for variation but catch obvious errors
logger.warning("Unreasonably high test counts found")
return False
return True