Source code for bolster.data_sources.psni.pace

"""PSNI Police and Criminal Evidence (PACE) Order Statistics.

Provides access to annual PACE statistics for Northern Ireland, covering:
- Stop and search activity (monthly counts by reason: stolen articles, offensive
  weapons/blade or point, going equipped/prohibited articles, fireworks)
- Arrests under PACE by quarter, gender, and whether a solicitor or friend/relative
  was requested during detention

Each annual Excel workbook covers a single financial year (April–March) and is
published by PSNI Statistics Branch each May on the PSNI publications index:

    https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-and-criminal-evidence-pace-order

**URL discovery note**: The PSNI publications index page is protected by Cloudflare
and cannot be scraped programmatically. Direct asset URLs at ``/sites/default/files/``
*can* be fetched with a browser-like ``User-Agent`` + ``Referer`` header, but the
filename portion of the URL is not predictable (includes the year in ``YYYY.YY``
format and may include a revision suffix such as ``a2``).

The :data:`PACE_URLS` dict therefore hard-codes confirmed download URLs. It should
be updated each May when PSNI publishes the new edition. Use
:func:`get_latest_pace_url` to retrieve the most recent known URL.

Data Source:
    PSNI Statistics Branch
    https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-and-criminal-evidence-pace-order

Update Frequency: Annual (published each May)
Geographic Coverage: Northern Ireland (NI-wide aggregate)
Time Coverage: One financial year per workbook; ``PACE_URLS`` spans 2024/25–2025/26

Example:
    >>> from bolster.data_sources.psni import pace
    >>> url = pace.get_latest_pace_url()
    >>> url.startswith("https://")
    True
    >>> df = pace.get_latest_pace(breakdown="stop_search")
    >>> "reason" in df.columns
    True
    >>> pace.validate_pace(df, "stop_search")
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd

from ._base import PSNIValidationError, download_file


[docs]
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Known download URLs for PACE annual Excel workbooks.
#
# IMPORTANT: Update this dict each May when PSNI publishes the new edition.
# The URL filename is not predictable (year format YYYY.YY, sometimes with a
# revision suffix like ``a2``).  The direct asset URLs can be fetched with the
# browser-like headers defined in _base_http_headers.py.
# ---------------------------------------------------------------------------

[docs]
PACE_URLS: dict[str, str] = {
    "2024/25": (
        "https://www.psni.police.uk/sites/default/files/2025-05/"
        "PACE%20Statistics%20Report%202024.25%20-%20Accompanying%20Spreadsheet.xlsx"
    ),
    "2025/26": (
        "https://www.psni.police.uk/sites/default/files/2026-05/"
        "PACE%20Statistics%20Report%202025.26%20-%20Accompanying%20Spreadsheeta2.xlsx"
    ),
}


# PSNI publications index — used as the Referer header when downloading assets
_PACE_INDEX_URL = (
    "https://www.psni.police.uk/about-us/our-publications-and-reports/"
    "official-statistics/police-and-criminal-evidence-pace-order"
)

# Cache TTL: annual publication, so refresh roughly annually
_CACHE_TTL_HOURS = 24 * 365

# Month abbreviations in the order they appear across the financial year (Apr–Mar)
_MONTHS = ["Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar"]

# Human-readable reason labels as they appear in column 0 of the spreadsheet
# (after forward-filling the merged cells in the source)
_STOP_SEARCH_REASONS = {
    "Stolen Property / Articles": "Stolen Articles",
    "Offensive Weapon / Blade or Point": "Offensive Weapon / Blade or Point",
    "Going Equipped / Prohibited Articles": "Going Equipped / Prohibited Articles",
    "Fireworks": "Fireworks",
    "Total": "Total",
}

# Row indices (0-based from the sheet start) for Table 1 data rows.
# Row 11 in the sheet is the month header row; data starts at row 12.
# Using positional constants avoids fragile header detection.
_T1_HEADER_ROW = 10  # 0-indexed row 10 = sheet row 11 (month names)
_T1_DATA_START = 11  # sheet row 12
_T1_DATA_END = 21  # sheet row 21 (inclusive): last data row before footnotes

# Table 2 starts at sheet row 31 (0-indexed 30)
_T2_DATA_START = 30  # sheet rows 31–35 (Q1–Q4 + Total)
_T2_DATA_END = 35  # exclusive



[docs]
def get_latest_pace_url() -> str:
    """Return the download URL for the most recent known PACE annual workbook.

    The URL is drawn from :data:`PACE_URLS`. Update that dict each May when
    PSNI publishes a new edition.

    Returns:
        Direct download URL for the latest PACE Excel workbook.

    Example:
        >>> from bolster.data_sources.psni.pace import get_latest_pace_url
        >>> url = get_latest_pace_url()
        >>> url.startswith("https://www.psni.police.uk/")
        True
    """
    latest_year = sorted(PACE_URLS.keys())[-1]
    return PACE_URLS[latest_year]



def _extract_financial_year(file_path: Path) -> str:
    """Extract the financial year string from the spreadsheet header row.

    Reads the cell at (row 4, col A) which contains text like
    ``"Accompanying spreadsheet for statistics covering the period 2025/26 …"``
    and extracts the ``YYYY/YY`` portion.

    Falls back to ``"unknown"`` if the pattern is not found.

    Args:
        file_path: Local path to the downloaded Excel workbook.

    Returns:
        Financial year string, e.g. ``"2025/26"``.
    """
    df_raw = pd.read_excel(file_path, sheet_name="Statistical_Tables", header=None, nrows=5)
    header_text = str(df_raw.iloc[3, 0])  # Row 4 (0-indexed 3)
    match = re.search(r"(\d{4}/\d{2})", header_text)
    return match.group(1) if match else "unknown"



[docs]
def parse_stop_search(file_path: Path | str) -> pd.DataFrame:
    """Parse Table 1 (monthly stop & search counts) from a PACE Excel workbook.

    The table covers stop and search activity for a single financial year,
    broken down by month (Apr–Mar) and search reason.

    Args:
        file_path: Local path to the downloaded PACE Excel workbook.

    Returns:
        DataFrame with columns:
        - ``financial_year``: e.g. ``"2025/26"``
        - ``year``: int, start year of financial year (e.g. ``2025``)
        - ``month``: month abbreviation, e.g. ``"Apr"``
        - ``reason``: search reason category
        - ``metric``: ``"Searches"`` or ``"Arrests"``
        - ``count``: integer count

    Raises:
        PSNIValidationError: If the expected table structure is not found.

    Example:
        >>> import tempfile, pathlib
        >>> df = parse_stop_search("/tmp/pace_2025_26.xlsx")  # doctest: +SKIP
        >>> list(df.columns)  # doctest: +SKIP
        ['financial_year', 'year', 'month', 'reason', 'metric', 'count']
    """
    file_path = Path(file_path)
    financial_year = _extract_financial_year(file_path)
    year = int(financial_year.split("/")[0]) if "/" in financial_year else 0

    df_raw = pd.read_excel(file_path, sheet_name="Statistical_Tables", header=None)

    # Find the row containing the month abbreviation "Apr" to locate table start
    t1_header_idx = None
    for idx, row in df_raw.iterrows():
        if row[2] == "Apr":
            t1_header_idx = idx
            break

    if t1_header_idx is None:
        raise PSNIValidationError("Could not locate stop & search table header row (expected 'Apr' in column C)")

    # Data rows immediately follow the header
    data_rows = df_raw.iloc[t1_header_idx + 1 : t1_header_idx + 11]

    records = []
    current_reason = None

    for _, row in data_rows.iterrows():
        # Column A (idx 0): reason label (may be None for continuation rows)
        reason_raw = row[0]
        metric_raw = str(row[1]) if row[1] is not None else ""

        if reason_raw is not None and str(reason_raw).strip():
            current_reason = _STOP_SEARCH_REASONS.get(str(reason_raw).strip(), str(reason_raw).strip())

        if current_reason is None:
            continue

        # Normalise metric label: strip footnote markers like (1), (2), (1,2)
        metric = re.sub(r"\s*\(\d[,\d]*\)", "", metric_raw).strip()
        if metric not in ("Searches", "Arrests"):
            continue

        # Columns C–N (indices 2–13) hold Apr–Mar monthly counts
        for col_idx, month in enumerate(_MONTHS, start=2):
            val = row[col_idx]
            count = int(val) if pd.notna(val) else 0
            records.append(
                {
                    "financial_year": financial_year,
                    "year": year,
                    "month": month,
                    "reason": current_reason,
                    "metric": metric,
                    "count": count,
                }
            )

    df = pd.DataFrame(records)
    if df.empty:
        raise PSNIValidationError("No stop & search data rows were parsed from the workbook")

    df["count"] = df["count"].astype(int)
    df["year"] = df["year"].astype(int)
    df["financial_year"] = df["financial_year"].astype("category")
    df["month"] = pd.Categorical(df["month"], categories=_MONTHS, ordered=True)
    df["reason"] = df["reason"].astype("category")
    df["metric"] = df["metric"].astype("category")

    logger.info(f"Parsed {len(df)} stop & search records for {financial_year}")
    return df




[docs]
def parse_arrests(file_path: Path | str) -> pd.DataFrame:
    """Parse Table 2 (quarterly PACE arrests) from a PACE Excel workbook.

    The table covers arrests under PACE for a single financial year, broken
    down by quarter and category (total, male, female, unknown/other, and
    whether a solicitor or friend/relative was requested during detention).

    Args:
        file_path: Local path to the downloaded PACE Excel workbook.

    Returns:
        DataFrame with columns:
        - ``financial_year``: e.g. ``"2025/26"``
        - ``year``: int, start year of financial year (e.g. ``2025``)
        - ``quarter``: quarter label, e.g. ``"Q1 (Apr–Jun)"``
        - ``category``: demographic/request category
        - ``count``: integer count

    Raises:
        PSNIValidationError: If the expected table structure is not found.

    Example:
        >>> df = parse_arrests("/tmp/pace_2025_26.xlsx")  # doctest: +SKIP
        >>> list(df.columns)  # doctest: +SKIP
        ['financial_year', 'year', 'quarter', 'category', 'count']
    """
    file_path = Path(file_path)
    financial_year = _extract_financial_year(file_path)
    year = int(financial_year.split("/")[0]) if "/" in financial_year else 0

    df_raw = pd.read_excel(file_path, sheet_name="Statistical_Tables", header=None)

    # Find the row containing a quarter label like "April 25 – June 25"
    # which is the first data row of Table 2.
    t2_data_start = None
    for idx, row in df_raw.iterrows():
        cell = str(row[0]) if row[0] is not None else ""
        # Quarter data rows start with a month name + year span, e.g. "April 25 – June 25"
        if re.match(r"(April|July|October|January)\s+\d{2}", cell):
            t2_data_start = idx
            break

    if t2_data_start is None:
        raise PSNIValidationError("Could not locate arrests table data rows (expected quarter label in column A)")

    # Collect up to 5 rows: Q1, Q2, Q3, Q4, Total
    data_rows = df_raw.iloc[t2_data_start : t2_data_start + 5]

    # Categories correspond to column indices 1, 2, 3, 4, 6, 7
    # Col 1: Total, Col 2: Male, Col 3: Female, Col 4: Unknown/Other
    # Col 6: Friend/relative, Col 7: Solicitor
    category_cols = {
        1: "Total",
        2: "Male",
        3: "Female",
        4: "Unknown / Other",
        6: "Requested friend / relative",
        7: "Requested solicitor",
    }

    # Quarter labels: map from source row labels to standardised short labels
    quarter_labels = ["Q1 (Apr–Jun)", "Q2 (Jul–Sep)", "Q3 (Oct–Dec)", "Q4 (Jan–Mar)", "Annual Total"]

    records = []
    for row_pos, (_, row) in enumerate(data_rows.iterrows()):
        if row_pos >= len(quarter_labels):
            break
        quarter = quarter_labels[row_pos]
        for col_idx, category in category_cols.items():
            val = row[col_idx]
            count = int(val) if pd.notna(val) and val != "" else 0
            records.append(
                {
                    "financial_year": financial_year,
                    "year": year,
                    "quarter": quarter,
                    "category": category,
                    "count": count,
                }
            )

    df = pd.DataFrame(records)
    if df.empty:
        raise PSNIValidationError("No arrests data rows were parsed from the workbook")

    df["count"] = df["count"].astype(int)
    df["year"] = df["year"].astype(int)
    df["financial_year"] = df["financial_year"].astype("category")
    _quarter_order = ["Q1 (Apr–Jun)", "Q2 (Jul–Sep)", "Q3 (Oct–Dec)", "Q4 (Jan–Mar)", "Annual Total"]
    df["quarter"] = pd.Categorical(df["quarter"], categories=_quarter_order, ordered=True)
    df["category"] = df["category"].astype("category")

    logger.info(f"Parsed {len(df)} arrests records for {financial_year}")
    return df




[docs]
def get_latest_pace(breakdown: str = "stop_search", force_refresh: bool = False) -> pd.DataFrame:
    """Download and return the latest PACE statistics.

    Downloads the most recent PACE Excel workbook (from :data:`PACE_URLS`),
    caches it locally for one year, and returns either the stop & search or
    the arrests breakdown.

    Args:
        breakdown: Which table to return — ``"stop_search"`` (Table 1, monthly
            stop & search counts) or ``"arrests"`` (Table 2, quarterly arrest
            demographics). Default: ``"stop_search"``.
        force_refresh: If ``True``, bypass the cache and re-download. Default:
            ``False``.

    Returns:
        DataFrame — see :func:`parse_stop_search` or :func:`parse_arrests` for
        column descriptions.

    Raises:
        ValueError: If ``breakdown`` is not ``"stop_search"`` or ``"arrests"``.
        PSNIDataNotFoundError: If the download fails.
        PSNIValidationError: If the workbook structure is not as expected.

    Example:
        >>> df = get_latest_pace(breakdown="stop_search")  # doctest: +SKIP
        >>> "reason" in df.columns
        True
        >>> df = get_latest_pace(breakdown="arrests")  # doctest: +SKIP
        >>> "category" in df.columns
        True
    """
    if breakdown not in ("stop_search", "arrests"):
        raise ValueError(f"breakdown must be 'stop_search' or 'arrests', got {breakdown!r}")

    url = get_latest_pace_url()
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        "Referer": _PACE_INDEX_URL,
    }

    file_path = download_file(url, cache_ttl_hours=_CACHE_TTL_HOURS, force_refresh=force_refresh, headers=headers)

    if breakdown == "stop_search":
        return parse_stop_search(file_path)
    return parse_arrests(file_path)




[docs]
def validate_pace(df: pd.DataFrame, breakdown: str) -> bool:
    """Validate a PACE DataFrame for structural integrity.

    Checks that the DataFrame has the required columns and contains at least
    some data.

    Args:
        df: DataFrame returned by :func:`parse_stop_search` or
            :func:`parse_arrests`.
        breakdown: ``"stop_search"`` or ``"arrests"`` — selects the expected
            column set.

    Returns:
        ``True`` if the DataFrame passes all checks.

    Raises:
        PSNIValidationError: If any check fails (empty DataFrame, missing
            columns, non-positive counts).

    Example:
        >>> import pandas as pd
        >>> from bolster.data_sources.psni.pace import validate_pace, PSNIValidationError
        >>> validate_pace(pd.DataFrame(), "stop_search")
        Traceback (most recent call last):
            ...
        bolster.data_sources.psni._base.PSNIValidationError: PACE DataFrame is empty
    """
    if df.empty:
        raise PSNIValidationError("PACE DataFrame is empty")

    required_columns: dict[str, list[str]] = {
        "stop_search": ["financial_year", "year", "month", "reason", "metric", "count"],
        "arrests": ["financial_year", "year", "quarter", "category", "count"],
    }

    if breakdown not in required_columns:
        raise PSNIValidationError(f"Unknown breakdown {breakdown!r}; expected 'stop_search' or 'arrests'")

    missing = [c for c in required_columns[breakdown] if c not in df.columns]
    if missing:
        raise PSNIValidationError(f"PACE DataFrame missing required columns: {missing}")

    if (df["count"] < 0).any():
        raise PSNIValidationError("PACE DataFrame contains negative counts")

    return True