Source code for bolster.data_sources.psni.crime_statistics

"""PSNI Police Recorded Crime Statistics.

Provides access to police recorded crime statistics for Northern Ireland.

Data includes:
- Monthly crime counts by crime type and policing district
- Geographic breakdown by 11 policing districts (aligned with LGDs)
- Outcome data (charges, cautions, etc.) by district
- Historical time series from April 2001 to December 2021
- Integration with NISRA datasets via LGD and NUTS3 codes

Data Source:
    **Primary Source**: OpenDataNI - Police Recorded Crime in Northern Ireland

    https://www.opendatani.gov.uk/dataset/police-recorded-crime-in-northern-ireland

    **DATA LIMITATION — STALE SINCE JANUARY 2022**:

    The OpenDataNI dataset was last updated 27 January 2022 and only contains
    data through December 2021. PSNI stopped pushing updates to OpenDataNI after
    that date. The PSNI official statistics page publishes quarterly Excel files
    with current data, but psni.police.uk is protected by Cloudflare which
    blocks automated downloads.

    Calling ``get_latest_crime_statistics()`` will raise ``PSNIDataStaleError``
    to make this limitation explicit. The historical data (Apr 2001–Dec 2021)
    remains accessible via ``get_historical_crime_statistics()``.

    For 2022+ data, consult PSNI directly:
    - Official stats page: https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-recorded-crime-statistics
    - Contact: statistics@psni.police.uk

Update Frequency: Quarterly (end of Jan, May, Jul, Oct) — **STALE SINCE 2022**
Geographic Coverage: Northern Ireland (11 policing districts + NI total)
Reference Date: Month of crime occurrence
Time Coverage: April 2001 to December 2021

Example:
    >>> from bolster.data_sources.psni import crime_statistics
    >>> df = crime_statistics.get_historical_crime_statistics()
    >>> sorted(df.columns.tolist())
    ['calendar_year', 'count', 'crime_type', 'data_measure', 'date', 'lgd_code', 'month', 'nuts3_code', 'nuts3_name', 'policing_district']
    >>> belfast_lgd = crime_statistics.get_lgd_code('Belfast City')
    >>> belfast_lgd
    'N09000003'
"""

import logging
from datetime import datetime
from pathlib import Path

import pandas as pd

from ._base import (
    PSNIDataStaleError,
    PSNIValidationError,
    download_file,
    get_lgd_code,
    get_nuts3_code,
    get_nuts_region_name,
)


[docs]
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# OpenDataNI CSV URL (direct download, no Cloudflare protection)

[docs]
# Monthly crime CSV resource; this is the URL get_historical_crime_statistics() downloads.
CRIME_STATISTICS_URL = "https://admin.opendatani.gov.uk/dataset/80dc9542-7b2a-48f5-bbf4-ccc7040d36af/resource/6fd51851-df78-4469-98c5-4f06953621a0/download/police-recorded-crime-monthly-data.csv"


# Data guide for reference

[docs]
# PDF data guide resource; returned by get_data_source_info() under "data_guide_url".
DATA_GUIDE_URL = "https://admin.opendatani.gov.uk/dataset/80dc9542-7b2a-48f5-bbf4-ccc7040d36af/resource/51cd6a9e-646b-42bf-9daa-8d2cb618764e/download/police-recorded-crime-data-guide.pdf"


# PSNI Official Statistics (for current data not available on OpenDataNI)

[docs]
# PSNI's own statistics page; returned by get_data_source_info() under "psni_official_url".
PSNI_OFFICIAL_STATS_URL = "https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-recorded-crime-statistics"


[docs]
# Contact address returned by get_data_source_info() under "contact_email".
PSNI_STATISTICS_EMAIL = "statistics@psni.police.uk"




[docs]
def get_data_source_info() -> dict:
    """Describe where PSNI crime statistics can be obtained.

    Bundles the dataset URLs, the PSNI contact address and a note about the
    OpenDataNI data ending in December 2021 into a single dictionary.

    Returns:
        Dictionary with keys (in this order):
            - opendatani_url: OpenDataNI dataset page
            - data_guide_url: PDF data guide URL
            - psni_official_url: PSNI official statistics page
            - contact_email: PSNI Statistics Branch email
            - data_limitation: Human-readable description of the data gap
            - last_update: Last known OpenDataNI update date (ISO string)

    Example:
        >>> info = get_data_source_info()
        >>> sorted(info.keys())
        ['contact_email', 'data_guide_url', 'data_limitation', 'last_update', 'opendatani_url', 'psni_official_url']
    """
    # Assemble the limitation note first so the mapping below stays flat.
    limitation_note = "".join(
        [
            "OpenDataNI dataset was last updated January 2022 and only contains ",
            "data through December 2021. For 2022-2025 data, consult PSNI's quarterly ",
            "bulletins at the official statistics URL or contact PSNI Statistics Branch.",
        ]
    )

    info = dict(
        opendatani_url="https://www.opendatani.gov.uk/dataset/police-recorded-crime-in-northern-ireland",
        data_guide_url=DATA_GUIDE_URL,
        psni_official_url=PSNI_OFFICIAL_STATS_URL,
        contact_email=PSNI_STATISTICS_EMAIL,
        data_limitation=limitation_note,
        last_update="2022-01-27",
    )
    return info




[docs]
def parse_crime_statistics_file(
    file_path: str | Path,
    add_geographic_codes: bool = True,
) -> pd.DataFrame:
    """Parse PSNI crime statistics CSV file.

    The file is in long format with columns for year, month, district,
    crime type, data measure, and count. This function reads the CSV,
    cleans column names, adds date parsing, and optionally adds LGD and
    NUTS3 geographic codes for cross-dataset integration.

    Args:
        file_path: Path to the crime statistics CSV file
        add_geographic_codes: If True, add LGD and NUTS3 code columns

    Returns:
        DataFrame sorted by date, district and crime type, with columns:
            - calendar_year: year of crime, as read from the CSV
            - month: str (3-letter month name: Jan, Feb, ..., Dec)
            - policing_district: str (district name or "Northern Ireland")
            - crime_type: str (Home Office crime classification)
            - data_measure: str (type of measure - crime count, outcome number, outcome rate)
            - count: numeric value; "/0" and any other non-numeric entry become missing
            - date: datetime (first day of month); missing when the year
              could not be combined with the month into a valid date
            - lgd_code: str (ONS LGD code, if add_geographic_codes=True)
            - nuts3_code: str (NUTS3 region code, if add_geographic_codes=True)
            - nuts3_name: str (NUTS3 region name, if add_geographic_codes=True)

    Raises:
        PSNIValidationError: If the file cannot be read, an expected column
            is missing, or a month value is not a recognized abbreviation

    Example:
        >>> path = download_file(CRIME_STATISTICS_URL, cache_ttl_hours=24*7)
        >>> df = parse_crime_statistics_file(path)
        >>> 'crime_type' in df.columns
        True
        >>> len(df) > 0
        True
    """
    file_path = Path(file_path)

    try:
        # Read CSV - it's already clean and well-structured
        df = pd.read_csv(file_path)
    except Exception as e:
        # Any read failure is reported to callers as a validation problem.
        raise PSNIValidationError(f"Failed to read crime statistics file: {e}") from e

    # Validate expected columns
    expected_cols = {"Calendar_Year", "Month", "Policing_District", "Crime_Type", "Data_Measure", "Count"}
    if not expected_cols.issubset(df.columns):
        missing = expected_cols - set(df.columns)
        raise PSNIValidationError(f"Missing expected columns: {missing}")

    # Clean column names (lowercase with underscores)
    df = df.rename(
        columns={
            "Calendar_Year": "calendar_year",
            "Month": "month",
            "Policing_District": "policing_district",
            "Crime_Type": "crime_type",
            "Data_Measure": "data_measure",
            "Count": "count",
        }
    )

    # Strip whitespace from string columns (source data has trailing spaces)
    for col in ["policing_district", "crime_type", "data_measure", "month"]:
        df[col] = df[col].str.strip()

    # Create datetime column (first day of month)
    # Month names are 3-letter abbreviations: Apr, May, Jun, etc.
    month_map = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }

    df["month_num"] = df["month"].map(month_map)

    if df["month_num"].isna().any():
        unrecognized = df[df["month_num"].isna()]["month"].unique()
        raise PSNIValidationError(f"Unrecognized month values: {unrecognized}")

    # errors="coerce" turns rows whose year/month cannot form a date into NaT
    # instead of raising, so the result may contain missing dates.
    df["date"] = pd.to_datetime({"year": df["calendar_year"], "month": df["month_num"], "day": 1}, errors="coerce")

    # Drop temporary month_num column
    df = df.drop(columns=["month_num"])

    # Make the silent coercion above visible rather than letting NaT rows
    # slip through unnoticed.
    undated_rows = int(df["date"].isna().sum())
    if undated_rows:
        logger.warning(f"{undated_rows:,} records could not be assigned a date (invalid calendar_year)")

    # Handle special values in count column
    # "/0" means outcome rate could not be calculated (distinct from 0)
    df["count"] = df["count"].replace("/0", pd.NA)
    df["count"] = pd.to_numeric(df["count"], errors="coerce")

    # Add geographic codes for cross-dataset integration
    if add_geographic_codes:
        df["lgd_code"] = df["policing_district"].apply(get_lgd_code)
        df["nuts3_code"] = df["policing_district"].apply(get_nuts3_code)
        df["nuts3_name"] = df["nuts3_code"].apply(get_nuts_region_name)

    # Sort by date and district for consistent output
    df = df.sort_values(["date", "policing_district", "crime_type"]).reset_index(drop=True)

    # Log summary
    total_records = len(df)
    first_date = df["date"].min()
    last_date = df["date"].max()
    if pd.isna(first_date) or pd.isna(last_date):
        # min()/max() yield NaT for an empty file or one with no valid dates;
        # NaT has no strftime, so describe the range in words instead.
        date_range = "no valid dates"
    else:
        date_range = f"{first_date.strftime('%Y-%m')} to {last_date.strftime('%Y-%m')}"
    districts = df["policing_district"].nunique()
    crime_types = df["crime_type"].nunique()

    logger.info(f"Parsed {total_records:,} crime records ({date_range})")
    logger.info(f"  {districts} policing districts, {crime_types} crime types")

    return df




[docs]
def get_latest_crime_statistics(
    force_refresh: bool = False,
    add_geographic_codes: bool = True,
) -> pd.DataFrame:
    """Always fail with PSNIDataStaleError; current data is not retrievable.

    No download is attempted. The function exists to point callers at
    ``get_historical_crime_statistics()``, which serves the data that is
    available (Apr 2001–Dec 2021), and at PSNI for anything newer.

    Args:
        force_refresh: Not read by this function.
        add_geographic_codes: Not read by this function.

    Raises:
        PSNIDataStaleError: On every call.
    """
    # Build the explanation once, then raise it; nothing else happens here.
    reason = (
        "PSNI crime statistics are stale since January 2022 (last data: December 2021). "
        "The OpenDataNI mirror has not been updated and the official PSNI website "
        "(psni.police.uk) is Cloudflare-protected, blocking automated access to 2022+ data. "
        "To access the available historical data (Apr 2001–Dec 2021), use "
        "get_historical_crime_statistics() instead. "
        "For current data, visit: https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-recorded-crime-statistics "
        "or contact: statistics@psni.police.uk"
    )
    raise PSNIDataStaleError(reason)




[docs]
def get_historical_crime_statistics(
    force_refresh: bool = False,
    add_geographic_codes: bool = True,
) -> pd.DataFrame:
    """Load the OpenDataNI crime statistics (April 2001 – December 2021).

    Logs a warning that the dataset is no longer updated, fetches the CSV at
    ``CRIME_STATISTICS_URL`` through ``download_file`` (requesting a 90-day
    cache TTL) and hands the resulting path to
    ``parse_crime_statistics_file``.

    Args:
        force_refresh: Forwarded to ``download_file``; True asks for a fresh
            download instead of a cached copy
        add_geographic_codes: Forwarded to the parser; True adds the LGD and
            NUTS3 code columns

    Returns:
        DataFrame with columns: date, calendar_year, month, policing_district,
        crime_type, data_measure, count, lgd_code, nuts3_code, nuts3_name

    Raises:
        PSNIDataNotFoundError: If download fails
        PSNIValidationError: If file structure is unexpected

    Example:
        >>> df = get_historical_crime_statistics()
        >>> sorted(df.columns.tolist())
        ['calendar_year', 'count', 'crime_type', 'data_measure', 'date', 'lgd_code', 'month', 'nuts3_code', 'nuts3_name', 'policing_district']
        >>> df['date'].max().year
        2021
    """
    staleness_notice = (
        "Loading historical PSNI crime statistics (Apr 2001–Dec 2021). "
        "This dataset has not been updated since January 2022. "
        "For current data visit https://www.psni.police.uk/about-us/our-publications-and-reports/official-statistics/police-recorded-crime-statistics"
    )
    logger.warning(staleness_notice)

    csv_path = download_file(
        CRIME_STATISTICS_URL,
        cache_ttl_hours=24 * 90,
        force_refresh=force_refresh,
    )
    parsed = parse_crime_statistics_file(csv_path, add_geographic_codes=add_geographic_codes)
    return parsed




[docs]
def validate_crime_statistics(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Validate crime statistics data integrity.

    Performs sanity checks on the crime statistics data:
    - Data is present and has at least one valid date (hard failure)
    - Non-negative crime counts (hard failure)
    - Date range between 2001 and now (hard failure)
    - Expected policing districts present (logged as a warning only)
    - At least 100 records per district (logged as a warning only)

    Args:
        df: DataFrame from parse_crime_statistics_file or
            get_historical_crime_statistics

    Returns:
        True if validation passes

    Raises:
        PSNIValidationError: If the frame is empty, has no valid dates,
            contains negative crime counts, or has dates before 2001 or in
            the future

    Example:
        >>> df = get_historical_crime_statistics()
        >>> validate_crime_statistics(df)
        True
    """
    # An empty frame would sail through every comparison below and then
    # fail while formatting the summary, so reject it up front.
    if df.empty:
        raise PSNIValidationError("No crime statistics records to validate")

    # Check for negative crime counts (excluding NA which represents "/0")
    crime_counts = df[df["data_measure"] == "Police Recorded Crime"]["count"]
    if (crime_counts < 0).any():
        negative_count = (crime_counts < 0).sum()
        raise PSNIValidationError(f"Found {negative_count} negative crime counts")

    # Check date range is reasonable
    min_date = df["date"].min()
    max_date = df["date"].max()

    # min()/max() are missing when no row has a valid date; comparisons with
    # a missing value are always False, so this must be checked explicitly.
    if pd.isna(min_date) or pd.isna(max_date):
        raise PSNIValidationError("No valid dates found in crime statistics data")

    if min_date < pd.Timestamp("2001-01-01"):
        raise PSNIValidationError(f"Data includes dates before 2001: {min_date}")

    if max_date > pd.Timestamp.now():
        raise PSNIValidationError(f"Data includes future dates: {max_date}")

    # Check that we have the expected policing districts
    expected_districts = {
        "Northern Ireland",
        "Belfast City",
        "Lisburn & Castlereagh City",
        "Ards & North Down",
        "Newry Mourne & Down",
        "Armagh City Banbridge & Craigavon",
        "Mid Ulster",
        "Fermanagh & Omagh",
        "Derry City & Strabane",
        "Causeway Coast & Glens",
        "Mid & East Antrim",
        "Antrim & Newtownabbey",
    }

    actual_districts = set(df["policing_district"].unique())
    missing_districts = expected_districts - actual_districts

    # Missing districts are reported but do not fail validation.
    if missing_districts:
        logger.warning(f"Missing expected policing districts: {missing_districts}")

    # Check for reasonable data coverage per district
    records_per_district = df.groupby("policing_district").size()
    if records_per_district.min() < 100:
        sparse_districts = records_per_district[records_per_district < 100]
        logger.warning(f"Some districts have very few records: {sparse_districts.to_dict()}")

    logger.info(f"Validation passed: {len(df):,} records checked")
    logger.info(f"  Date range: {min_date.strftime('%Y-%m')} to {max_date.strftime('%Y-%m')}")
    logger.info(f"  {len(actual_districts)} policing districts")

    return True




[docs]
def filter_by_district(
    df: pd.DataFrame,
    district: str | list[str],
) -> pd.DataFrame:
    """Filter crime statistics to specific policing district(s).

    Args:
        df: DataFrame from get_latest_crime_statistics
        district: District name(s) to filter (e.g., "Belfast City" or ["Belfast City", "Derry City & Strabane"])

    Returns:
        Filtered DataFrame

    Example:
        >>> df = get_latest_crime_statistics()
        >>> belfast = filter_by_district(df, "Belfast City")
        >>> belfast['policing_district'].unique().tolist()
        ['Belfast City']
        >>>
        >>> # Multiple districts
        >>> cities = filter_by_district(df, ["Belfast City", "Derry City & Strabane"])
        >>> len(cities['policing_district'].unique()) == 2
        True
    """
    if isinstance(district, str):
        district = [district]

    return df[df["policing_district"].isin(district)].reset_index(drop=True)




[docs]
def filter_by_crime_type(
    df: pd.DataFrame,
    crime_type: str | list[str],
) -> pd.DataFrame:
    """Filter crime statistics to specific crime type(s).

    Args:
        df: DataFrame from get_latest_crime_statistics
        crime_type: Crime type(s) to filter (e.g., "Burglary" or ["Violence with injury", "Robbery"])

    Returns:
        Filtered DataFrame

    Example:
        >>> df = get_latest_crime_statistics()
        >>> violence = filter_by_crime_type(df, "Violence with injury (including homicide & death/serious injury by unlawful driving)")
        >>> len(violence) > 0
        True
    """
    if isinstance(crime_type, str):
        crime_type = [crime_type]

    return df[df["crime_type"].isin(crime_type)].reset_index(drop=True)




[docs]
def filter_by_date_range(
    df: pd.DataFrame,
    start_date: str | datetime | None = None,
    end_date: str | datetime | None = None,
) -> pd.DataFrame:
    """Filter crime statistics to a date range.

    Args:
        df: DataFrame from get_latest_crime_statistics
        start_date: Start date (inclusive), e.g., "2020-01-01" or datetime
        end_date: End date (inclusive), e.g., "2021-12-31" or datetime

    Returns:
        Filtered DataFrame

    Example:
        >>> df = get_latest_crime_statistics()
        >>> # Get 2020 data
        >>> df_2020 = filter_by_date_range(df, "2020-01-01", "2020-12-31")
        >>> df_2020['calendar_year'].unique().tolist()
        [2020]
        >>>
        >>> # Get data from 2018 onwards
        >>> recent = filter_by_date_range(df, start_date="2018-01-01")
        >>> len(recent) > 0
        True
    """
    filtered = df.copy()

    if start_date:
        if isinstance(start_date, str):
            start_date = pd.to_datetime(start_date)
        filtered = filtered[filtered["date"] >= start_date]

    if end_date:
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        filtered = filtered[filtered["date"] <= end_date]

    return filtered.reset_index(drop=True)




[docs]
def get_total_crimes_by_district(
    df: pd.DataFrame,
    year: int | None = None,
) -> pd.DataFrame:
    """Calculate total recorded crimes by policing district.

    Args:
        df: DataFrame from get_latest_crime_statistics
        year: Optional year to filter (uses all years if None)

    Returns:
        DataFrame with columns: policing_district, lgd_code, nuts3_code, total_crimes

    Example:
        >>> df = get_latest_crime_statistics()
        >>> totals_2021 = get_total_crimes_by_district(df, year=2021)
        >>> sorted(totals_2021.columns.tolist())
        ['lgd_code', 'nuts3_code', 'policing_district', 'total_crimes']
    """
    # Filter to total crimes measure
    crime_df = df[
        (df["data_measure"] == "Police Recorded Crime") & (df["crime_type"] == "Total police recorded crime")
    ].copy()

    # Filter to specific year if provided
    if year:
        crime_df = crime_df[crime_df["calendar_year"] == year]

    # Group by district and sum
    result = (
        crime_df.groupby(["policing_district", "lgd_code", "nuts3_code"])["count"]
        .sum()
        .reset_index()
        .rename(columns={"count": "total_crimes"})
    )

    return result.sort_values("total_crimes", ascending=False).reset_index(drop=True)




[docs]
def get_crime_trends(
    df: pd.DataFrame,
    crime_type: str = "Total police recorded crime",
    district: str = "Northern Ireland",
    measure: str = "Police Recorded Crime",
) -> pd.DataFrame:
    """Extract the time series for one crime type, district and measure.

    Args:
        df: Crime statistics DataFrame, e.g. from
            get_historical_crime_statistics
        crime_type: Crime type to analyze (default: total crimes)
        district: Policing district (default: Northern Ireland total)
        measure: Data measure to use (default: Police Recorded Crime)

    Returns:
        DataFrame ordered by date with columns: date, calendar_year, month, count

    Example:
        >>> df = get_historical_crime_statistics()
        >>> trends = get_crime_trends(df, district="Belfast City")
        >>> sorted(trends.columns.tolist())
        ['calendar_year', 'count', 'date', 'month']
        >>> len(trends) > 0
        True
    """
    # A row is kept only when all three selectors match exactly.
    type_matches = df["crime_type"] == crime_type
    district_matches = df["policing_district"] == district
    measure_matches = df["data_measure"] == measure
    selected = df[type_matches & district_matches & measure_matches].copy()

    output_columns = ["date", "calendar_year", "month", "count"]
    series = selected[output_columns]
    ordered = series.sort_values("date")
    return ordered.reset_index(drop=True)




[docs]
def get_outcome_rates_by_district(
    df: pd.DataFrame,
    year: int | None = None,
    crime_type: str = "Total police recorded crime",
) -> pd.DataFrame:
    """Calculate crime outcome rates by policing district.

    Outcome rate represents the percentage of crimes with an outcome
    (charge, caution, community resolution, etc.)

    Args:
        df: DataFrame from get_latest_crime_statistics
        year: Optional year to filter (uses all years if None)
        crime_type: Crime type to analyze (default: total crimes)

    Returns:
        DataFrame with columns: policing_district, lgd_code, average_outcome_rate

    Example:
        >>> df = get_latest_crime_statistics()
        >>> outcomes = get_outcome_rates_by_district(df, year=2021)
        >>> 'average_outcome_rate' in outcomes.columns
        True
    """
    # Filter to outcome rate measure
    outcome_df = df[
        (df["data_measure"] == "Police Recorded Crime Outcomes (rate %)") & (df["crime_type"] == crime_type)
    ].copy()

    # Filter to specific year if provided
    if year:
        outcome_df = outcome_df[outcome_df["calendar_year"] == year]

    # Group by district and calculate average outcome rate
    result = (
        outcome_df.groupby(["policing_district", "lgd_code"])["count"]
        .mean()
        .reset_index()
        .rename(columns={"count": "average_outcome_rate"})
    )

    # Round to 1 decimal place
    result["average_outcome_rate"] = result["average_outcome_rate"].round(1)

    return result.sort_values("average_outcome_rate", ascending=False).reset_index(drop=True)




[docs]
def get_available_crime_types(df: pd.DataFrame) -> list[str]:
    """Get list of all crime types in the dataset.

    Missing values in the ``crime_type`` column are ignored; they are not
    crime types, and mixing them with strings would make sorting fail.

    Args:
        df: Crime statistics DataFrame with a ``crime_type`` column,
            e.g. from get_historical_crime_statistics

    Returns:
        Sorted list of distinct crime type names

    Example:
        >>> df = get_historical_crime_statistics()
        >>> crime_types = get_available_crime_types(df)
        >>> isinstance(crime_types, list)
        True
        >>> 'Total police recorded crime' in crime_types
        True
    """
    # dropna() first: sorted() cannot compare a missing value with a str.
    return sorted(df["crime_type"].dropna().unique().tolist())




[docs]
def get_available_districts(df: pd.DataFrame) -> list[str]:
    """Get list of all policing districts in the dataset.

    Missing values in the ``policing_district`` column are ignored; they
    are not districts, and mixing them with strings would make sorting fail.

    Args:
        df: Crime statistics DataFrame with a ``policing_district`` column,
            e.g. from get_historical_crime_statistics

    Returns:
        Sorted list of distinct district names

    Example:
        >>> df = get_historical_crime_statistics()
        >>> districts = get_available_districts(df)
        >>> isinstance(districts, list)
        True
        >>> 'Northern Ireland' in districts
        True
    """
    # dropna() first: sorted() cannot compare a missing value with a str.
    return sorted(df["policing_district"].dropna().unique().tolist())