Source code for bolster.data_sources.psni.stop_and_search

"""PSNI Stop and Search Statistics.

Provides access to Police Service of Northern Ireland stop and search data,
covering individual stop and search records from 2017/18 to the latest
available financial year.

Data includes:
- Financial year and quarter (quarterly breakdowns)
- Legislation used (Misuse of Drugs Act, PACE, Justice & Security Act, etc.)
- PACE-specific reasons for search (stolen articles, prohibited articles, blade/point, fireworks)
- Subject demographics: age group and gender
- Geographic level: Northern Ireland-wide (no district breakdown in this dataset)

Data Source:
    **Primary Source**: OpenDataNI — Stop and Search Statistics 2017/18–2024/25

    https://www.opendatani.gov.uk/dataset/stop-and-search

    Data is published by the PSNI under the Open Government Licence v3.0.

Update Frequency: Annual (full dataset refreshed with each release)
Geographic Coverage: Northern Ireland (NI-wide only — no district breakdown)
Time Coverage: 2017/18 financial year to present
Row count: ~199,000 individual stop and search records

Example:
    >>> from bolster.data_sources.psni import stop_and_search
    >>> df = stop_and_search.get_latest_stop_and_search()
    >>> 'financial_year' in df.columns
    True
    >>> stop_and_search.validate_stop_and_search(df)
    True
"""

import logging

import pandas as pd

from bolster.utils.web import session

from ._base import PSNIValidationError, download_file


[docs]
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


# OpenDataNI CKAN API (admin endpoint — the public endpoint redirects to a Cloudflare page)

[docs]
# Base URL for CKAN action calls; the action name (``package_show``) is appended to it.
OPENDATANI_API = "https://admin.opendatani.gov.uk/api/3/action"


# Stable dataset identifier on OpenDataNI

[docs]
# Sent as the ``id`` parameter of ``package_show``; the same identifier is
# embedded in the path of FALLBACK_CSV_URL.
DATASET_ID = "421d96c1-fa5b-43e7-914c-b9a13e163d33"


# Fallback URL confirmed working as of 2025 — used if CKAN API is unavailable

[docs]
# The adjacent string literals are concatenated into one URL:
# dataset id / resource id / file name. The file name is pinned to the
# 2017/18-2024/25 release, so this URL only ever points at that release.
FALLBACK_CSV_URL = (
    "https://admin.opendatani.gov.uk/dataset/421d96c1-fa5b-43e7-914c-b9a13e163d33"
    "/resource/73fcba18-4616-4a60-91ea-873f69f6d063"
    "/download/stop-and-search-open-data-201718to202425.csv"
)


# Cache TTL: ~30 days. The module docstring lists the update frequency as annual,
# so a roughly monthly refresh is comfortably frequent.

[docs]
# Expressed in hours because it is passed as ``cache_ttl_hours`` to download_file.
CACHE_TTL_HOURS = 24 * 30  # 720 hours, i.e. 30 days


# Mapping from verbose raw column names to clean snake_case equivalents

[docs]
# The keys double as the set of required raw CSV headers: parsing raises
# PSNIValidationError if any of them is absent from the downloaded file.
COLUMN_RENAMES: dict[str, str] = {
    "Financial Year": "financial_year",
    "Geographical Level": "geographical_level",
    "Legislation": "legislation",
    # The four PACE reason columns are converted from Yes/No text to booleans when parsed.
    "(PACE) Reason for search - Stolen Articles": "pace_reason_stolen_articles",
    "(PACE) Reason for search - Prohibited Articles": "pace_reason_prohibited_articles",
    "(PACE) Reason for search - Blade or Point": "pace_reason_blade_or_point",
    "(PACE) Reason for search - Fireworks": "pace_reason_fireworks",
    "Quarter": "quarter",
    "AgeGroup": "age_group",  # the raw header has no space, unlike the others
    "Gender": "gender",
}


# Quarter ordering for categorical dtype (chronological order within a year)

[docs]
# Financial-year order: the year starts with April to June and ends with
# January to March, so this is not calendar order.
QUARTER_ORDER = [
    "April to June",
    "July to September",
    "October to December",
    "January to March",
]


# Age group ordering for categorical dtype

[docs]
# Youngest to oldest. "Not Specified" is placed last, so it sorts after every
# real age band in the ordered categorical.
AGE_GROUP_ORDER = [
    "Under 18",
    "18 to 25",
    "26 to 35",
    "36 to 45",
    "46 to 55",
    "56 to 65",
    "Over 65",
    "Not Specified",
]




[docs]
def get_latest_dataset_url() -> str:
    """Query the OpenDataNI CKAN API to find the latest Stop and Search CSV URL.

    Calls ``package_show`` for the stop-and-search dataset and returns the
    download URL of the first resource whose format is CSV. This lookup is
    best-effort: any request failure, unsuccessful API response, or payload
    without a usable CSV resource is logged as a warning and the known direct
    URL (``FALLBACK_CSV_URL``) is returned instead. The function never raises
    for those conditions.

    Malformed entries in the resource list (non-dict items, ``null`` format or
    URL) are skipped individually, so one bad entry does not prevent a later
    valid CSV resource from being found.

    Returns:
        Download URL for the latest stop and search CSV file.

    Example:
        >>> url = get_latest_dataset_url()
        >>> url.startswith("https://")
        True
        >>> url.endswith(".csv")
        True
    """
    # Only the network round-trip and JSON decoding sit inside the try block;
    # the payload handling below is written so that it cannot raise.
    try:
        resp = session.get(
            f"{OPENDATANI_API}/package_show",
            params={"id": DATASET_ID},
            headers={"User-Agent": "bolster/1.0"},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # Deliberately broad: discovery is optional, the fallback URL is always usable.
        logger.warning(f"CKAN API request failed ({e}); falling back to known URL")
        return FALLBACK_CSV_URL

    if not isinstance(data, dict) or not data.get("success"):
        logger.warning("OpenDataNI CKAN API returned unsuccessful response; falling back to known URL")
        return FALLBACK_CSV_URL

    # JSON ``null`` arrives as None, which ``dict.get(key, default)`` returns
    # as-is, so each level is checked by type rather than relying on defaults.
    result = data.get("result")
    resources = result.get("resources") if isinstance(result, dict) else None
    if not isinstance(resources, list):
        resources = []

    for resource in resources:
        if not isinstance(resource, dict):
            continue
        fmt = resource.get("format")
        url = resource.get("url")
        if not isinstance(fmt, str) or not isinstance(url, str):
            continue
        if fmt.strip().upper() == "CSV" and url:
            logger.info(f"Found CSV resource via CKAN API: {url}")
            return url

    logger.warning("No CSV resource found via CKAN API; falling back to known URL")
    return FALLBACK_CSV_URL



def _parse_stop_and_search(file_path: str) -> pd.DataFrame:
    """Load a downloaded Stop and Search CSV and tidy it into a typed DataFrame.

    The raw headers are checked against ``COLUMN_RENAMES`` before anything
    else, then the columns are renamed, text values are normalised, the PACE
    reason columns are mapped from Yes/No to booleans, and the dimension
    columns are converted to categoricals.

    Args:
        file_path: Local path to the downloaded CSV file.

    Returns:
        DataFrame with snake_case column names; ``quarter`` and ``age_group``
        are ordered categoricals, the remaining dimensions are unordered
        categoricals, and the PACE reason columns hold True/False.

    Raises:
        PSNIValidationError: If the CSV does not contain the expected columns.
    """
    df = pd.read_csv(file_path)

    # Check the schema on the raw headers, before any renaming takes place.
    missing = set(COLUMN_RENAMES) - set(df.columns)
    if missing:
        raise PSNIValidationError(
            f"Stop and Search CSV missing expected columns: {missing}. Found columns: {df.columns.tolist()}"
        )

    df = df.rename(columns=COLUMN_RENAMES)

    # Age bands: trim whitespace, fix the lower-case 'over 65' variant, and fold
    # a bare 'Specified' label into 'Not Specified' (presumably a truncated
    # source value - confirm against the raw CSV).
    age_fixes = {"over 65": "Over 65", "Specified": "Not Specified"}
    df["age_group"] = df["age_group"].str.strip().replace(age_fixes)

    # Some source rows carry trailing spaces in the legislation label.
    df["legislation"] = df["legislation"].str.strip()

    # PACE reason flags: case-insensitive Yes/No -> True/False. Any other value
    # has no entry in the mapping and therefore becomes NaN.
    yes_no = {"YES": True, "NO": False}
    for flag in (
        "pace_reason_stolen_articles",
        "pace_reason_prohibited_articles",
        "pace_reason_blade_or_point",
        "pace_reason_fireworks",
    ):
        df[flag] = df[flag].str.strip().str.upper().map(yes_no)

    # Dimensions with a natural order become ordered categoricals ...
    ordered_levels = {"quarter": QUARTER_ORDER, "age_group": AGE_GROUP_ORDER}
    for name, levels in ordered_levels.items():
        df[name] = pd.Categorical(df[name], categories=levels, ordered=True)

    # ... while purely nominal dimensions become unordered categoricals.
    for name in ("financial_year", "geographical_level", "legislation", "gender"):
        df[name] = df[name].astype("category")

    logger.info(f"Parsed {len(df):,} stop and search records")
    return df



[docs]
def get_latest_stop_and_search(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch the current PSNI Stop and Search dataset as a cleaned DataFrame.

    Resolves the CSV location with :func:`get_latest_dataset_url`, downloads
    it through ``download_file`` (cached for ``CACHE_TTL_HOURS``, i.e. about
    30 days) and parses the file into a DataFrame with snake_case column
    names and appropriate dtypes.

    The dataset holds individual stop and search records for Northern Ireland
    from financial year 2017/18 to the most recently published year. It does
    **not** include a district-level geographic breakdown - every record is
    at Northern Ireland level.

    Args:
        force_refresh: If True, bypass the local cache and re-download the data.

    Returns:
        DataFrame with columns:
            - financial_year (category): e.g. ``"2023/24"``
            - geographical_level (category): always ``"Northern Ireland"``
            - legislation (category): legislation under which the search was conducted
            - pace_reason_stolen_articles (bool): PACE reason - stolen articles
            - pace_reason_prohibited_articles (bool): PACE reason - prohibited articles
            - pace_reason_blade_or_point (bool): PACE reason - blade or point
            - pace_reason_fireworks (bool): PACE reason - fireworks
            - quarter (Categorical[ordered]): quarter label, e.g. ``"April to June"``
            - age_group (Categorical[ordered]): age band of the subject
            - gender (category): subject gender

    Raises:
        PSNIDataNotFoundError: If the download fails.
        PSNIValidationError: If the downloaded file does not match the expected schema.

    Example:
        >>> df = get_latest_stop_and_search()
        >>> len(df) > 100_000
        True
        >>> sorted(df['financial_year'].cat.categories.tolist())  # doctest: +SKIP
        ['2017/18', '2018/19', '2019/20', '2020/21', '2021/22', '2022/23', '2023/24', '2024/25']
    """
    csv_path = download_file(
        get_latest_dataset_url(),
        cache_ttl_hours=CACHE_TTL_HOURS,
        force_refresh=force_refresh,
    )
    # The parser takes a plain string path, so the download result is coerced with str().
    return _parse_stop_and_search(str(csv_path))




[docs]
def validate_stop_and_search(df: pd.DataFrame) -> bool:
    """Validate the integrity of a Stop and Search DataFrame.

    Checks, in order:

    1. the DataFrame is not empty;
    2. every required column is present;
    3. the financial years include ``2017/18`` and span at least two distinct
       years (null years are ignored rather than counted as a year);
    4. each PACE reason column has no nulls and holds only boolean values;
    5. there are at least 50,000 records.

    Args:
        df: DataFrame to validate (e.g. from :func:`get_latest_stop_and_search`).

    Returns:
        ``True`` if all checks pass.

    Raises:
        PSNIValidationError: If any check fails, with a descriptive message.

    Example:
        >>> df = get_latest_stop_and_search()
        >>> validate_stop_and_search(df)
        True
    """
    if df.empty:
        raise PSNIValidationError("Stop and Search DataFrame is empty")

    pace_cols = [
        "pace_reason_stolen_articles",
        "pace_reason_prohibited_articles",
        "pace_reason_blade_or_point",
        "pace_reason_fireworks",
    ]
    required_columns = {
        "financial_year",
        "geographical_level",
        "legislation",
        *pace_cols,
        "quarter",
        "age_group",
        "gender",
    }
    missing = required_columns - set(df.columns)
    if missing:
        raise PSNIValidationError(f"Missing required columns: {missing}")

    # Drop nulls first: otherwise a missing year is stringified to "nan" and
    # counted as a distinct financial year by the checks below.
    str_years = [str(y) for y in df["financial_year"].dropna().unique()]

    # Must have records from at least 2017/18
    if "2017/18" not in str_years:
        raise PSNIValidationError(f"Expected data from 2017/18 but found years: {sorted(str_years)}")

    # Must have records for multiple financial years
    if len(str_years) < 2:
        raise PSNIValidationError(f"Expected multiple financial years, found only: {str_years}")

    for col in pace_cols:
        # A null here means the Yes/No -> bool mapping met an unexpected value.
        null_count = df[col].isna().sum()
        if null_count > 0:
            raise PSNIValidationError(f"Column '{col}' has {null_count} unexpected null values")

        # infer_dtype reports "boolean" for a bool column and for an object
        # column of bools alike, so only genuinely non-boolean data fails here
        # (e.g. raw "Yes"/"No" strings that were never parsed).
        inferred = pd.api.types.infer_dtype(df[col], skipna=True)
        if inferred != "boolean":
            raise PSNIValidationError(f"Column '{col}' must contain only booleans but holds '{inferred}' values")

    # At least 50,000 records (significantly less than 199k would indicate truncation)
    if len(df) < 50_000:
        raise PSNIValidationError(f"Too few records: expected ≥50,000 but got {len(df):,}")

    logger.info(f"Validation passed: {len(df):,} records, years {sorted(str_years)}")
    return True