Source code for bolster.data_sources.psni.pace

"""PSNI Police and Criminal Evidence (PACE) Order Statistics.

Provides access to annual PACE statistics for Northern Ireland, covering:
- Stop and search activity (monthly counts by reason: stolen articles, offensive
  weapons/blade or point, going equipped/prohibited articles, fireworks)
- Arrests under PACE by quarter, gender, and whether a solicitor or friend/relative
  was requested during detention

Each annual Excel workbook covers a single financial year (April–March) and is
published by PSNI Statistics Branch each May on the PSNI publications index:

    https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-and-criminal-evidence-pace-order

**URL discovery note**: The PSNI publications index page is protected by Cloudflare
and cannot be scraped programmatically. Direct asset URLs at ``/sites/default/files/``
*can* be fetched with a browser-like ``User-Agent`` + ``Referer`` header, but the
filename portion of the URL is not predictable (includes the year in ``YYYY.YY``
format and may include a revision suffix such as ``a2``).

The :data:`PACE_URLS` dict therefore hard-codes confirmed download URLs. It should
be updated each May when PSNI publishes the new edition. Use
:func:`get_latest_pace_url` to retrieve the most recent known URL.

Data Source:
    PSNI Statistics Branch
    https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-and-criminal-evidence-pace-order

Update Frequency: Annual (published each May)
Geographic Coverage: Northern Ireland (NI-wide aggregate)
Time Coverage: One financial year per workbook; ``PACE_URLS`` spans 2024/25–2025/26

Example:
    >>> from bolster.data_sources.psni import pace
    >>> url = pace.get_latest_pace_url()
    >>> url.startswith("https://")
    True
    >>> df = pace.get_latest_pace(breakdown="stop_search")
    >>> "reason" in df.columns
    True
    >>> pace.validate_pace(df, "stop_search")
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd

from ._base import PSNIValidationError, download_file

[docs] logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # Known download URLs for PACE annual Excel workbooks. # # IMPORTANT: Update this dict each May when PSNI publishes the new edition. # The URL filename is not predictable (year format YYYY.YY, sometimes with a # revision suffix like ``a2``). The direct asset URLs can be fetched with the # browser-like headers defined in _base_http_headers.py. # ---------------------------------------------------------------------------
[docs] PACE_URLS: dict[str, str] = { "2024/25": ( "https://www.psni.police.uk/sites/default/files/2025-05/" "PACE%20Statistics%20Report%202024.25%20-%20Accompanying%20Spreadsheet.xlsx" ), "2025/26": ( "https://www.psni.police.uk/sites/default/files/2026-05/" "PACE%20Statistics%20Report%202025.26%20-%20Accompanying%20Spreadsheeta2.xlsx" ), }
# PSNI publications index — used as the Referer header when downloading assets _PACE_INDEX_URL = ( "https://www.psni.police.uk/about-us/our-publications-and-reports/" "official-statistics/police-and-criminal-evidence-pace-order" ) # Cache TTL: annual publication, so refresh roughly annually _CACHE_TTL_HOURS = 24 * 365 # Month abbreviations in the order they appear across the financial year (Apr–Mar) _MONTHS = ["Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar"] # Human-readable reason labels as they appear in column 0 of the spreadsheet # (after forward-filling the merged cells in the source) _STOP_SEARCH_REASONS = { "Stolen Property / Articles": "Stolen Articles", "Offensive Weapon / Blade or Point": "Offensive Weapon / Blade or Point", "Going Equipped / Prohibited Articles": "Going Equipped / Prohibited Articles", "Fireworks": "Fireworks", "Total": "Total", } # Row indices (0-based from the sheet start) for Table 1 data rows. # Row 11 in the sheet is the month header row; data starts at row 12. # Using positional constants avoids fragile header detection. _T1_HEADER_ROW = 10 # 0-indexed row 10 = sheet row 11 (month names) _T1_DATA_START = 11 # sheet row 12 _T1_DATA_END = 21 # sheet row 21 (inclusive): last data row before footnotes # Table 2 starts at sheet row 31 (0-indexed 30) _T2_DATA_START = 30 # sheet rows 31–35 (Q1–Q4 + Total) _T2_DATA_END = 35 # exclusive
[docs] def get_latest_pace_url() -> str: """Return the download URL for the most recent known PACE annual workbook. The URL is drawn from :data:`PACE_URLS`. Update that dict each May when PSNI publishes a new edition. Returns: Direct download URL for the latest PACE Excel workbook. Example: >>> from bolster.data_sources.psni.pace import get_latest_pace_url >>> url = get_latest_pace_url() >>> url.startswith("https://www.psni.police.uk/") True """ latest_year = sorted(PACE_URLS.keys())[-1] return PACE_URLS[latest_year]
def _extract_financial_year(file_path: Path) -> str: """Extract the financial year string from the spreadsheet header row. Reads the cell at (row 4, col A) which contains text like ``"Accompanying spreadsheet for statistics covering the period 2025/26 …"`` and extracts the ``YYYY/YY`` portion. Falls back to ``"unknown"`` if the pattern is not found. Args: file_path: Local path to the downloaded Excel workbook. Returns: Financial year string, e.g. ``"2025/26"``. """ df_raw = pd.read_excel(file_path, sheet_name="Statistical_Tables", header=None, nrows=5) header_text = str(df_raw.iloc[3, 0]) # Row 4 (0-indexed 3) match = re.search(r"(\d{4}/\d{2})", header_text) return match.group(1) if match else "unknown"
[docs] def parse_arrests(file_path: Path | str) -> pd.DataFrame: """Parse Table 2 (quarterly PACE arrests) from a PACE Excel workbook. The table covers arrests under PACE for a single financial year, broken down by quarter and category (total, male, female, unknown/other, and whether a solicitor or friend/relative was requested during detention). Args: file_path: Local path to the downloaded PACE Excel workbook. Returns: DataFrame with columns: - ``financial_year``: e.g. ``"2025/26"`` - ``year``: int, start year of financial year (e.g. ``2025``) - ``quarter``: quarter label, e.g. ``"Q1 (Apr–Jun)"`` - ``category``: demographic/request category - ``count``: integer count Raises: PSNIValidationError: If the expected table structure is not found. Example: >>> df = parse_arrests("/tmp/pace_2025_26.xlsx") # doctest: +SKIP >>> list(df.columns) # doctest: +SKIP ['financial_year', 'year', 'quarter', 'category', 'count'] """ file_path = Path(file_path) financial_year = _extract_financial_year(file_path) year = int(financial_year.split("/")[0]) if "/" in financial_year else 0 df_raw = pd.read_excel(file_path, sheet_name="Statistical_Tables", header=None) # Find the row containing a quarter label like "April 25 – June 25" # which is the first data row of Table 2. t2_data_start = None for idx, row in df_raw.iterrows(): cell = str(row[0]) if row[0] is not None else "" # Quarter data rows start with a month name + year span, e.g. "April 25 – June 25" if re.match(r"(April|July|October|January)\s+\d{2}", cell): t2_data_start = idx break if t2_data_start is None: raise PSNIValidationError("Could not locate arrests table data rows (expected quarter label in column A)") # Collect up to 5 rows: Q1, Q2, Q3, Q4, Total data_rows = df_raw.iloc[t2_data_start : t2_data_start + 5] # Categories correspond to column indices 1, 2, 3, 4, 6, 7 # Col 1: Total, Col 2: Male, Col 3: Female, Col 4: Unknown/Other # Col 6: Friend/relative, Col 7: Solicitor category_cols = { 1: "Total", 2: "Male", 3: "Female", 4: "Unknown / Other", 6: "Requested friend / relative", 7: "Requested solicitor", } # Quarter labels: map from source row labels to standardised short labels quarter_labels = ["Q1 (Apr–Jun)", "Q2 (Jul–Sep)", "Q3 (Oct–Dec)", "Q4 (Jan–Mar)", "Annual Total"] records = [] for row_pos, (_, row) in enumerate(data_rows.iterrows()): if row_pos >= len(quarter_labels): break quarter = quarter_labels[row_pos] for col_idx, category in category_cols.items(): val = row[col_idx] count = int(val) if pd.notna(val) and val != "" else 0 records.append( { "financial_year": financial_year, "year": year, "quarter": quarter, "category": category, "count": count, } ) df = pd.DataFrame(records) if df.empty: raise PSNIValidationError("No arrests data rows were parsed from the workbook") df["count"] = df["count"].astype(int) df["year"] = df["year"].astype(int) df["financial_year"] = df["financial_year"].astype("category") _quarter_order = ["Q1 (Apr–Jun)", "Q2 (Jul–Sep)", "Q3 (Oct–Dec)", "Q4 (Jan–Mar)", "Annual Total"] df["quarter"] = pd.Categorical(df["quarter"], categories=_quarter_order, ordered=True) df["category"] = df["category"].astype("category") logger.info(f"Parsed {len(df)} arrests records for {financial_year}") return df
[docs] def get_latest_pace(breakdown: str = "stop_search", force_refresh: bool = False) -> pd.DataFrame: """Download and return the latest PACE statistics. Downloads the most recent PACE Excel workbook (from :data:`PACE_URLS`), caches it locally for one year, and returns either the stop & search or the arrests breakdown. Args: breakdown: Which table to return — ``"stop_search"`` (Table 1, monthly stop & search counts) or ``"arrests"`` (Table 2, quarterly arrest demographics). Default: ``"stop_search"``. force_refresh: If ``True``, bypass the cache and re-download. Default: ``False``. Returns: DataFrame — see :func:`parse_stop_search` or :func:`parse_arrests` for column descriptions. Raises: ValueError: If ``breakdown`` is not ``"stop_search"`` or ``"arrests"``. PSNIDataNotFoundError: If the download fails. PSNIValidationError: If the workbook structure is not as expected. Example: >>> df = get_latest_pace(breakdown="stop_search") # doctest: +SKIP >>> "reason" in df.columns True >>> df = get_latest_pace(breakdown="arrests") # doctest: +SKIP >>> "category" in df.columns True """ if breakdown not in ("stop_search", "arrests"): raise ValueError(f"breakdown must be 'stop_search' or 'arrests', got {breakdown!r}") url = get_latest_pace_url() headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Referer": _PACE_INDEX_URL, } file_path = download_file(url, cache_ttl_hours=_CACHE_TTL_HOURS, force_refresh=force_refresh, headers=headers) if breakdown == "stop_search": return parse_stop_search(file_path) return parse_arrests(file_path)
[docs] def validate_pace(df: pd.DataFrame, breakdown: str) -> bool: """Validate a PACE DataFrame for structural integrity. Checks that the DataFrame has the required columns and contains at least some data. Args: df: DataFrame returned by :func:`parse_stop_search` or :func:`parse_arrests`. breakdown: ``"stop_search"`` or ``"arrests"`` — selects the expected column set. Returns: ``True`` if the DataFrame passes all checks. Raises: PSNIValidationError: If any check fails (empty DataFrame, missing columns, non-positive counts). Example: >>> import pandas as pd >>> from bolster.data_sources.psni.pace import validate_pace, PSNIValidationError >>> validate_pace(pd.DataFrame(), "stop_search") Traceback (most recent call last): ... bolster.data_sources.psni._base.PSNIValidationError: PACE DataFrame is empty """ if df.empty: raise PSNIValidationError("PACE DataFrame is empty") required_columns: dict[str, list[str]] = { "stop_search": ["financial_year", "year", "month", "reason", "metric", "count"], "arrests": ["financial_year", "year", "quarter", "category", "count"], } if breakdown not in required_columns: raise PSNIValidationError(f"Unknown breakdown {breakdown!r}; expected 'stop_search' or 'arrests'") missing = [c for c in required_columns[breakdown] if c not in df.columns] if missing: raise PSNIValidationError(f"PACE DataFrame missing required columns: {missing}") if (df["count"] < 0).any(): raise PSNIValidationError("PACE DataFrame contains negative counts") return True