# Source code for bolster.data_sources.psni.stop_and_search

"""PSNI Stop and Search Statistics.

Provides access to Police Service of Northern Ireland stop and search data,
covering individual stop and search records from 2017/18 to the latest
available financial year.

Data includes:
- Financial year and quarter (quarterly breakdowns)
- Legislation used (Misuse of Drugs Act, PACE, Justice & Security Act, etc.)
- PACE-specific reasons for search (stolen articles, prohibited articles, blade/point, fireworks)
- Subject demographics: age group and gender
- Geographic level: Northern Ireland-wide (no district breakdown in this dataset)

Data Source:
    **Primary Source**: OpenDataNI — Stop and Search Statistics 2017/18–2024/25

    https://www.opendatani.gov.uk/dataset/stop-and-search

    Data is published by the PSNI under the Open Government Licence v3.0.

Update Frequency: Annual (full dataset refreshed with each release)
Geographic Coverage: Northern Ireland (NI-wide only — no district breakdown)
Time Coverage: 2017/18 financial year to present
Row count: ~199,000 individual stop and search records

Example:
    >>> from bolster.data_sources.psni import stop_and_search
    >>> df = stop_and_search.get_latest_stop_and_search()
    >>> 'financial_year' in df.columns
    True
    >>> stop_and_search.validate_stop_and_search(df)
    True
"""

import logging

import pandas as pd

from bolster.utils.web import session

from ._base import PSNIValidationError, download_file

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# OpenDataNI CKAN API (admin endpoint — the public endpoint redirects to a Cloudflare page)
OPENDATANI_API = "https://admin.opendatani.gov.uk/api/3/action"

# Stable dataset identifier on OpenDataNI
DATASET_ID = "421d96c1-fa5b-43e7-914c-b9a13e163d33"

# Fallback URL confirmed working as of 2025 — used if CKAN API is unavailable
FALLBACK_CSV_URL = (
    "https://admin.opendatani.gov.uk/dataset/421d96c1-fa5b-43e7-914c-b9a13e163d33"
    "/resource/73fcba18-4616-4a60-91ea-873f69f6d063"
    "/download/stop-and-search-open-data-201718to202425.csv"
)

# Cache TTL: 30 days, expressed in hours.
# NOTE(review): this was originally justified as "monthly updates", but the
# module docstring lists the update frequency as annual — confirm the intended
# refresh cadence.
CACHE_TTL_HOURS = 24 * 30

# Mapping from verbose raw column names to clean snake_case equivalents
COLUMN_RENAMES: dict[str, str] = {
    "Financial Year": "financial_year",
    "Geographical Level": "geographical_level",
    "Legislation": "legislation",
    "(PACE) Reason for search - Stolen Articles": "pace_reason_stolen_articles",
    "(PACE) Reason for search - Prohibited Articles": "pace_reason_prohibited_articles",
    "(PACE) Reason for search - Blade or Point": "pace_reason_blade_or_point",
    "(PACE) Reason for search - Fireworks": "pace_reason_fireworks",
    "Quarter": "quarter",
    "AgeGroup": "age_group",
    "Gender": "gender",
}

# Quarter ordering for categorical dtype (chronological order within a financial
# year, which starts in April)
QUARTER_ORDER = [
    "April to June",
    "July to September",
    "October to December",
    "January to March",
]

# Age group ordering for categorical dtype
AGE_GROUP_ORDER = [
    "Under 18",
    "18 to 25",
    "26 to 35",
    "36 to 45",
    "46 to 55",
    "56 to 65",
    "Over 65",
    "Not Specified",
]
def get_latest_dataset_url() -> str:
    """Query the OpenDataNI CKAN API to find the latest Stop and Search CSV URL.

    Fetches resource metadata for the stop-and-search dataset from the
    OpenDataNI CKAN API (``package_show``) and returns the download URL of the
    first listed resource whose format is CSV and whose URL is non-empty.

    This is a best-effort lookup: it falls back to ``FALLBACK_CSV_URL`` if the
    request fails, the API reports ``success`` as falsy, or no usable CSV
    resource is listed. It never raises for API problems.

    Returns:
        Download URL for the latest stop and search CSV file.

    Example:
        >>> url = get_latest_dataset_url()
        >>> url.startswith("https://")
        True
        >>> url.endswith(".csv")
        True
    """
    try:
        resp = session.get(
            f"{OPENDATANI_API}/package_show",
            params={"id": DATASET_ID},
            headers={"User-Agent": "bolster/1.0"},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()

        if not data.get("success"):
            logger.warning("OpenDataNI CKAN API returned unsuccessful response; falling back to known URL")
            return FALLBACK_CSV_URL

        # JSON null arrives as None, and dict.get's default only covers a
        # *missing* key, so coerce with `or` to keep scanning instead of
        # tripping the broad handler below on an AttributeError/TypeError.
        resources = (data.get("result") or {}).get("resources") or []
        for resource in resources:
            if (resource.get("format") or "").upper() != "CSV":
                continue
            url = resource.get("url") or ""
            if url:
                logger.info(f"Found CSV resource via CKAN API: {url}")
                return url

        logger.warning("No CSV resource found via CKAN API; falling back to known URL")
        return FALLBACK_CSV_URL

    except Exception as e:
        # Deliberately broad: any network, HTTP, or JSON-shape problem should
        # degrade to the known-good URL rather than propagate to the caller.
        logger.warning(f"CKAN API request failed ({e}); falling back to known URL")
        return FALLBACK_CSV_URL
def _parse_stop_and_search(file_path: str) -> pd.DataFrame:
    """Load a downloaded Stop and Search CSV and return it in tidy form.

    Processing steps, in order:

    1. Read the CSV and check that every raw column named in
       ``COLUMN_RENAMES`` is present.
    2. Rename those columns to their snake_case equivalents.
    3. Trim whitespace from ``age_group`` and ``legislation``, and rewrite the
       ``age_group`` values ``"over 65"`` and ``"Specified"`` to ``"Over 65"``
       and ``"Not Specified"``.
    4. Convert the four PACE reason columns from Yes/No text (compared
       case-insensitively after trimming) to ``True``/``False``; any other
       value is left missing.
    5. Cast ``quarter`` and ``age_group`` to ordered categoricals using
       ``QUARTER_ORDER`` and ``AGE_GROUP_ORDER`` (values outside those lists
       become missing), and the remaining dimensions to unordered categoricals.

    Args:
        file_path: Local path to the downloaded CSV file.

    Returns:
        Cleaned DataFrame with snake_case column names and appropriate dtypes.

    Raises:
        PSNIValidationError: If the CSV does not contain the expected columns.
    """
    raw = pd.read_csv(file_path)

    # Fail fast on schema drift, before any renaming hides what is missing.
    absent = set(COLUMN_RENAMES) - set(raw.columns)
    if absent:
        raise PSNIValidationError(
            f"Stop and Search CSV missing expected columns: {absent}. Found columns: {raw.columns.tolist()}"
        )

    records = raw.rename(columns=COLUMN_RENAMES)

    # The source has inconsistent labels in age_group ('over 65', 'Specified').
    age_fixes = {"over 65": "Over 65", "Specified": "Not Specified"}
    records["age_group"] = records["age_group"].str.strip().replace(age_fixes)

    # Some source rows carry stray whitespace around the legislation text.
    records["legislation"] = records["legislation"].str.strip()

    # PACE reason flags: Yes/No text -> bool.
    yes_no = {"YES": True, "NO": False}
    for flag in (
        "pace_reason_stolen_articles",
        "pace_reason_prohibited_articles",
        "pace_reason_blade_or_point",
        "pace_reason_fireworks",
    ):
        cleaned = records[flag].str.strip().str.upper()
        records[flag] = cleaned.map(yes_no)

    # Dimensions with a natural order become ordered categoricals.
    ordered_dims = (("quarter", QUARTER_ORDER), ("age_group", AGE_GROUP_ORDER))
    for name, levels in ordered_dims:
        records[name] = pd.Categorical(records[name], categories=levels, ordered=True)

    # Nominal dimensions become plain (unordered) categoricals.
    for name in ("financial_year", "geographical_level", "legislation", "gender"):
        records[name] = records[name].astype("category")

    logger.info(f"Parsed {len(records):,} stop and search records")
    return records