Source code for bolster.data_sources.psni.police_ombudsman

"""PSNI Police Ombudsman Complaint Statistics.

Provides access to complaint statistics published by the Police Ombudsman for
Northern Ireland (PONI), covering:

- Annual complaint totals back to 2000/01
- Complaints by policing district (2011/12 onwards)
- Allegations by type and subtype (2011/12 onwards)
- Complaint closures by outcome (2011/12 onwards)
- Quarterly complaint and allegation counts (latest 5 years)

Data Source:
    **Annual**: Police Ombudsman annual statistics bulletin
    https://www.policeombudsman.org/statistics-and-research/complaint-statistics-in-northern-ireland

    **Quarterly**: Police Ombudsman quarterly statistical bulletin
    https://www.policeombudsman.org/statistics-and-research/quarterly-reports

    Published under the Open Government Licence v3.0.

Update Frequency:
    - Annual: once per year (summer, covering previous financial year)
    - Quarterly: four times per year

Geographic Coverage:
    Northern Ireland — 11 Policing Districts aligned with LGDs.

Time Coverage:
    - Totals: 2000/01 to present
    - District / allegation / outcome breakdowns: 2011/12 to present
    - Quarterly: latest 5 financial years

Example:
    >>> from bolster.data_sources.psni import police_ombudsman
    >>> df = police_ombudsman.get_latest_complaints()
    >>> 'year' in df.columns
    True
    >>> url = police_ombudsman.get_annual_publication_url()
    >>> url.startswith("https://")
    True
"""

import logging
import re

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import (
    LGD_CODES,
    PSNIDataNotFoundError,
    PSNIValidationError,
    download_file,
)

[docs] logger = logging.getLogger(__name__)
# ── Public page URLs ────────────────────────────────────────────────────────── _QUARTERLY_PAGE = "https://www.policeombudsman.org/statistics-and-research/quarterly-reports" _ANNUAL_PAGE = "https://www.policeombudsman.org/statistics-and-research/complaint-statistics-in-northern-ireland" _BASE_URL = "https://www.policeombudsman.org" # policeombudsman.org returns 403 to requests' default UA; provide a browser UA. _SCRAPE_HEADERS = {"User-Agent": ("Mozilla/5.0 (compatible; bolster/1.0; +https://github.com/andrewbolster/bolster)")} # ── District name normalisation ─────────────────────────────────────────────── # Quarterly uses "District A - Belfast City" style labels. # Annual uses "A - Belfast City" style labels. # Both map to the canonical LGD names found in _base.LGD_CODES. _QUARTERLY_DISTRICT_MAP: dict[str, str] = { "District A - Belfast City": "Belfast City", "District B - Lisburn & Castlereagh City": "Lisburn & Castlereagh City", "District C - Ards & North Down": "Ards & North Down", "District D - Newry, Mourne & Down": "Newry Mourne & Down", "District E - Armagh City, Banbridge & Craigavon": "Armagh City Banbridge & Craigavon", "District F - Mid Ulster": "Mid Ulster", "District G - Fermanagh & Omagh": "Fermanagh & Omagh", "District H - Derry City & Strabane": "Derry City & Strabane", "District J - Causeway Coast & Glens": "Causeway Coast & Glens", "District K - Mid & East Antrim": "Mid & East Antrim", "District L - Antrim & Newtownabbey": "Antrim & Newtownabbey", } # Annual "A - Belfast City" → canonical LGD name. # Built by stripping "District " prefix from quarterly keys, then any # spelling differences between the two formats are patched below. _ANNUAL_DISTRICT_MAP: dict[str, str] = {k.replace("District ", ""): v for k, v in _QUARTERLY_DISTRICT_MAP.items()} # The annual T8 sheet omits the comma in "Newry Mourne & Down": _ANNUAL_DISTRICT_MAP["D - Newry Mourne & Down"] = "Newry Mourne & Down" # Ensure the annual-format variant also covers "E - Armagh…" (no comma variant): _ANNUAL_DISTRICT_MAP["E - Armagh City, Banbridge & Craigavon"] = "Armagh City Banbridge & Craigavon" # Financial-year column pattern: "2024/25" _FY_RE = re.compile(r"^\d{4}/\d{2}$") def _normalise_annual_district(raw: str) -> str: """Normalise an annual-file district label to the canonical LGD name. Args: raw: Raw district label from the annual Excel (e.g. ``"A - Belfast City"``). Returns: Canonical LGD name matching ``_base.LGD_CODES``, or the input string unchanged if no mapping is found. """ return _ANNUAL_DISTRICT_MAP.get(str(raw).strip(), str(raw).strip()) def _normalise_quarterly_district(raw: str) -> str: """Normalise a quarterly-file district label to the canonical LGD name. Args: raw: Raw district string (e.g. ``"District A - Belfast City"``). Returns: Canonical LGD name, or the original string if no mapping found. """ return _QUARTERLY_DISTRICT_MAP.get(str(raw).strip(), str(raw).strip()) def _parse_fy_label(label: str) -> int: """Parse a financial-year label such as ``'2024/25'`` to the start year. Args: label: String like ``'2024/25'`` or ``'2000/01*'``. Returns: Integer start year, e.g. ``2024``. """ return int(str(label).strip().split("/")[0]) def _find_sheet(xl: pd.ExcelFile, prefix: str) -> str: """Return the first sheet name that starts with *prefix* (case-insensitive). Args: xl: Opened ``pd.ExcelFile`` object. prefix: Sheet name prefix to search for (e.g. ``"T8"``). Returns: Matching sheet name. Raises: PSNIDataNotFoundError: If no matching sheet is found. """ for name in xl.sheet_names: if name.upper().startswith(prefix.upper()): return name raise PSNIDataNotFoundError(f"No sheet starting with {prefix!r} found. Available: {xl.sheet_names}") # ── URL scrapers ───────────────────────────────────────────────────────────────
[docs] def get_quarterly_publication_url() -> str: """Scrape the quarterly-reports page for the latest .xlsx download link. policeombudsman.org returns 403 to default User-Agents; this function uses a browser-like UA via ``bolster.utils.web.session``. Returns: Absolute URL of the latest quarterly Excel spreadsheet. Raises: PSNIDataNotFoundError: If the page cannot be retrieved or no .xlsx link is found. Example: >>> url = get_quarterly_publication_url() >>> url.startswith("https://") True """ try: resp = session.get(_QUARTERLY_PAGE, headers=_SCRAPE_HEADERS, timeout=30) resp.raise_for_status() except Exception as exc: raise PSNIDataNotFoundError(f"Failed to fetch quarterly-reports page: {exc}") from exc soup = BeautifulSoup(resp.text, "html.parser") for a in soup.find_all("a", href=True): href: str = a["href"] if href.lower().endswith((".xlsx", ".xls")): return href if href.startswith("http") else _BASE_URL + href raise PSNIDataNotFoundError(f"No .xlsx link found on {_QUARTERLY_PAGE}")
[docs] def get_annual_publication_url() -> str: """Scrape the complaint-statistics page for the latest .xlsx download link. policeombudsman.org returns 403 to default User-Agents; this function uses a browser-like UA via ``bolster.utils.web.session``. Returns: Absolute URL of the latest annual Excel spreadsheet. Raises: PSNIDataNotFoundError: If the page cannot be retrieved or no .xlsx link is found. Example: >>> url = get_annual_publication_url() >>> url.startswith("https://") True """ try: resp = session.get(_ANNUAL_PAGE, headers=_SCRAPE_HEADERS, timeout=30) resp.raise_for_status() except Exception as exc: raise PSNIDataNotFoundError(f"Failed to fetch annual statistics page: {exc}") from exc soup = BeautifulSoup(resp.text, "html.parser") for a in soup.find_all("a", href=True): href: str = a["href"] if href.lower().endswith((".xlsx", ".xls")): return href if href.startswith("http") else _BASE_URL + href raise PSNIDataNotFoundError(f"No .xlsx link found on {_ANNUAL_PAGE}")
# ── Annual file parsers ──────────────────────────────────────────────────────── def _parse_annual_t1(xl: pd.ExcelFile) -> pd.DataFrame: """Parse the T1 (total complaints) sheet of the annual workbook. Returns a tidy DataFrame with columns ``year`` (int) and ``complaints`` (int), one row per financial year from 2000/01 onwards. """ sheet = _find_sheet(xl, "T1") raw = xl.parse(sheet, header=None) # Locate the "Year" header row header_row = None for i, val in enumerate(raw.iloc[:, 0]): if str(val).strip().lower() == "year": header_row = i break if header_row is None: raise PSNIDataNotFoundError("Cannot find 'Year' header row in T1 sheet") data = raw.iloc[header_row + 1 :].copy() data.columns = ["year_label", "complaints"] data = data.dropna(subset=["year_label", "complaints"]) data = data[data["year_label"].astype(str).str.match(r"^\d{4}/\d{2}")] data["year"] = data["year_label"].apply(_parse_fy_label) data["complaints"] = pd.to_numeric(data["complaints"], errors="coerce").astype("Int64") return data[["year", "year_label", "complaints"]].reset_index(drop=True) def _parse_annual_t8(xl: pd.ExcelFile) -> pd.DataFrame: """Parse the T8 (by district) sheet of the annual workbook. Returns a tidy long-form DataFrame with columns ``year``, ``year_label``, ``district``, ``lgd_code``, ``complaints``. """ sheet = _find_sheet(xl, "T8") raw = xl.parse(sheet, header=None) # Row 1 (0-indexed) is headers: District, 2011/12, 2012/13, ... header_row = 1 headers = raw.iloc[header_row].tolist() data = raw.iloc[header_row + 1 :].copy() data.columns = [str(h).strip() if pd.notna(h) else f"_col{i}" for i, h in enumerate(headers)] district_col = data.columns[0] year_cols = [c for c in data.columns if _FY_RE.match(str(c).strip())] # Keep district rows only — drop NaN, total, and "Unknown / Other" rows data = data.dropna(subset=[district_col]) _exclude = data[district_col].astype(str).str.lower() data = data[~(_exclude.str.contains("total", na=False) | _exclude.str.contains("unknown", na=False))] melted = data[[district_col] + year_cols].melt( id_vars=[district_col], var_name="year_label", value_name="complaints" ) melted = melted.dropna(subset=["complaints"]) melted["complaints"] = pd.to_numeric(melted["complaints"], errors="coerce").astype("Int64") melted["district"] = melted[district_col].apply(_normalise_annual_district) melted["lgd_code"] = melted["district"].map(LGD_CODES) melted["year"] = melted["year_label"].apply(_parse_fy_label) return melted[["year", "year_label", "district", "lgd_code", "complaints"]].reset_index(drop=True) def _parse_annual_t10(xl: pd.ExcelFile) -> pd.DataFrame: """Parse the T10 (allegation types) sheet of the annual workbook. Returns a tidy long-form DataFrame with columns ``year``, ``year_label``, ``allegation_type``, ``allegation_subtype``, ``allegations``. The ``allegation_type`` column is forward-filled because the Excel uses merged cells for the parent category. """ sheet = _find_sheet(xl, "T10") raw = xl.parse(sheet, header=None) # Row 1 has: "Allegation Type", "Allegation Subtype", year cols... header_row = 1 headers = raw.iloc[header_row].tolist() data = raw.iloc[header_row + 1 :].copy() data.columns = [str(h).strip() if pd.notna(h) else f"_col{i}" for i, h in enumerate(headers)] type_col = data.columns[0] subtype_col = data.columns[1] # Forward-fill allegation type (merged cells appear as NaN below first row) data[type_col] = data[type_col].ffill() year_cols = [c for c in data.columns if _FY_RE.match(str(c).strip())] # Drop subtotal / total rows data = data[~data[subtype_col].astype(str).str.lower().isin(["subtotal", "total", "nan"])] data = data.dropna(subset=[subtype_col]) melted = data[[type_col, subtype_col] + year_cols].melt( id_vars=[type_col, subtype_col], var_name="year_label", value_name="allegations" ) melted = melted.dropna(subset=["allegations"]) melted["allegations"] = pd.to_numeric(melted["allegations"], errors="coerce").astype("Int64") melted["year"] = melted["year_label"].apply(_parse_fy_label) melted = melted.rename(columns={type_col: "allegation_type", subtype_col: "allegation_subtype"}) return melted[["year", "year_label", "allegation_type", "allegation_subtype", "allegations"]].reset_index(drop=True) def _parse_annual_t12(xl: pd.ExcelFile) -> pd.DataFrame: """Parse the T12 (complaint closures) sheet of the annual workbook. Returns a tidy long-form DataFrame with columns ``year``, ``year_label``, ``outcome``, ``closures``. """ sheet = _find_sheet(xl, "T12") raw = xl.parse(sheet, header=None) # Row 1 has: NaN, year_labels... header_row = 1 headers = raw.iloc[header_row].tolist() data = raw.iloc[header_row + 1 :].copy() data.columns = [str(h).strip() if pd.notna(h) else f"_col{i}" for i, h in enumerate(headers)] outcome_col = data.columns[0] year_cols = [c for c in data.columns if _FY_RE.match(str(c).strip())] data = data.dropna(subset=[outcome_col]) data = data[data[outcome_col].astype(str).str.strip() != "nan"] melted = data[[outcome_col] + year_cols].melt(id_vars=[outcome_col], var_name="year_label", value_name="closures") melted = melted.dropna(subset=["closures"]) melted["closures"] = pd.to_numeric(melted["closures"], errors="coerce").astype("Int64") melted["year"] = melted["year_label"].apply(_parse_fy_label) melted = melted.rename(columns={outcome_col: "outcome"}) return melted[["year", "year_label", "outcome", "closures"]].reset_index(drop=True)
[docs] def parse_annual(file_path: str) -> dict[str, pd.DataFrame]: """Parse the annual Police Ombudsman statistics Excel workbook. Extracts four key tables from the workbook: - ``totals``: total complaints 2000/01 onwards (T1) - ``by_district``: complaints by policing district, 2011/12 onwards (T8) - ``by_allegation_type``: allegations by type & subtype, 2011/12+ (T10) - ``by_outcome``: complaint closures by outcome, 2011/12 onwards (T12) Args: file_path: Local path (or file-like) to the downloaded ``.xlsx`` file. Returns: Dict mapping breakdown name to tidy DataFrame. All DataFrames include ``year`` (int, financial-year start) and ``year_label`` (e.g. ``"2024/25"``) columns. Raises: PSNIDataNotFoundError: If required sheets cannot be found. Example: >>> from bolster.data_sources.psni import police_ombudsman >>> result = parse_annual.__doc__ # placeholder >>> 'totals' in result False """ xl = pd.ExcelFile(file_path) return { "totals": _parse_annual_t1(xl), "by_district": _parse_annual_t8(xl), "by_allegation_type": _parse_annual_t10(xl), "by_outcome": _parse_annual_t12(xl), }
# ── Quarterly file parser ──────────────────────────────────────────────────────
[docs] def parse_quarterly(file_path: str) -> dict[str, pd.DataFrame]: """Parse a quarterly Police Ombudsman statistics Excel workbook. Extracts three tables: - ``complaints``: complaints received by quarter × year - ``allegations``: allegations received by quarter × year - ``by_district``: complaints by policing district × year The quarterly workbook covers the latest five financial years, with four quarters per year plus totals. Args: file_path: Local path (or file-like) to the downloaded ``.xlsx`` file. Returns: Dict mapping key name to long-form DataFrame. Each DataFrame includes ``year_label`` (e.g. ``"2024/25"``) and ``year`` (int start year). Raises: PSNIDataNotFoundError: If required sheets cannot be found. Example: >>> from bolster.data_sources.psni import police_ombudsman >>> True # real call requires downloaded file True """ xl = pd.ExcelFile(file_path) def _parse_quarterly_table(sheet_name: str, value_name: str) -> pd.DataFrame: raw = xl.parse(sheet_name, header=None) # Row 3 (0-indexed): "Quarter", year_label, year_label, ... header_row = 3 headers = raw.iloc[header_row].tolist() data = raw.iloc[header_row + 1 :].copy() data.columns = [str(h).strip() if pd.notna(h) else f"_col{i}" for i, h in enumerate(headers)] quarter_col = data.columns[0] year_cols = [c for c in data.columns if _FY_RE.match(str(c).strip())] # Keep only quarter rows data = data[data[quarter_col].astype(str).str.lower().str.startswith("quarter")] melted = data[[quarter_col] + year_cols].melt( id_vars=[quarter_col], var_name="year_label", value_name=value_name ) melted = melted.dropna(subset=[value_name]) melted[value_name] = pd.to_numeric(melted[value_name], errors="coerce").astype("Int64") melted["year"] = melted["year_label"].apply(_parse_fy_label) melted = melted.rename(columns={quarter_col: "quarter"}) return melted[["year", "year_label", "quarter", value_name]].reset_index(drop=True) complaints_df = _parse_quarterly_table("Complaints Received", "complaints") allegations_df = _parse_quarterly_table("Allegations Received", "allegations") # District sheet name has a trailing space: "Complaints - Area & District " district_sheet = next( (s for s in xl.sheet_names if "area" in s.lower() and "district" in s.lower()), None, ) if district_sheet is None: raise PSNIDataNotFoundError("District sheet not found in quarterly workbook") raw_d = xl.parse(district_sheet, header=None) # Row 3: Area, District, year1, year2 header_row = 3 headers = raw_d.iloc[header_row].tolist() data_d = raw_d.iloc[header_row + 1 :].copy() data_d.columns = [str(h).strip() if pd.notna(h) else f"_col{i}" for i, h in enumerate(headers)] district_col = data_d.columns[1] # col 1 is "District" year_cols_d = [c for c in data_d.columns if _FY_RE.match(str(c).strip())] # Keep only rows where the district column starts with "District" data_d = data_d[data_d[district_col].astype(str).str.lower().str.startswith("district")] melted_d = data_d[[district_col] + year_cols_d].melt( id_vars=[district_col], var_name="year_label", value_name="complaints" ) melted_d = melted_d.dropna(subset=["complaints"]) melted_d["complaints"] = pd.to_numeric(melted_d["complaints"], errors="coerce").astype("Int64") melted_d["district"] = melted_d[district_col].apply(_normalise_quarterly_district) melted_d["lgd_code"] = melted_d["district"].map(LGD_CODES) melted_d["year"] = melted_d["year_label"].apply(_parse_fy_label) district_df = melted_d[["year", "year_label", "district", "lgd_code", "complaints"]].reset_index(drop=True) return { "complaints": complaints_df, "allegations": allegations_df, "by_district": district_df, }
# ── High-level accessor ────────────────────────────────────────────────────────
[docs] def get_latest_complaints( breakdown: str = "totals", force_refresh: bool = False, ) -> pd.DataFrame: """Download and return the latest Police Ombudsman complaint data. For ``totals``, ``by_district``, ``by_allegation_type``, and ``by_outcome`` the annual publication is used (richest historical coverage). For ``quarterly`` the latest quarterly bulletin is used. Args: breakdown: One of: - ``"totals"`` — total complaints 2000/01 to present (default) - ``"by_district"`` — complaints by policing district, 2011/12+ - ``"by_allegation_type"`` — allegations by type, 2011/12+ - ``"by_outcome"`` — closures by outcome, 2011/12+ - ``"quarterly"`` — quarterly complaints, latest 5 financial years force_refresh: If ``True``, bypass cache and re-download the source file. Returns: Tidy DataFrame for the requested breakdown. Raises: ValueError: If *breakdown* is not one of the recognised values. PSNIDataNotFoundError: If the source cannot be downloaded. Example: >>> df = get_latest_complaints() >>> set(["year", "complaints"]).issubset(df.columns) True >>> df_d = get_latest_complaints("by_district") >>> "district" in df_d.columns True """ _annual_keys = {"totals", "by_district", "by_allegation_type", "by_outcome"} _quarterly_keys = {"quarterly"} _all_keys = _annual_keys | _quarterly_keys if breakdown not in _all_keys: raise ValueError(f"Invalid breakdown {breakdown!r}. Choose from: {sorted(_all_keys)}") if breakdown in _quarterly_keys: url = get_quarterly_publication_url() file_path = download_file(url, cache_ttl_hours=24 * 7, force_refresh=force_refresh) data = parse_quarterly(file_path) return data["complaints"] url = get_annual_publication_url() file_path = download_file(url, cache_ttl_hours=24 * 30, force_refresh=force_refresh) data = parse_annual(file_path) return data[breakdown]
# ── Validation ─────────────────────────────────────────────────────────────────
[docs] def validate_complaints(df: pd.DataFrame, breakdown: str) -> bool: """Validate a Police Ombudsman complaints DataFrame. Checks that: - The DataFrame is non-empty. - Required columns for the given *breakdown* are present. - The ``year`` column contains plausible financial-year start years. - Complaint / allegation counts are non-negative. Args: df: DataFrame to validate (as returned by :func:`get_latest_complaints`). breakdown: One of ``"totals"``, ``"by_district"``, ``"by_allegation_type"``, ``"by_outcome"``, ``"quarterly"``. Returns: ``True`` if validation passes. Raises: PSNIValidationError: If any check fails. Example: >>> import pandas as pd >>> df = pd.DataFrame({"year": [2020, 2021], "complaints": [3000, 3100]}) >>> validate_complaints(df, "totals") True """ if df.empty: raise PSNIValidationError(f"Empty DataFrame for breakdown={breakdown!r}") required: dict[str, list[str]] = { "totals": ["year", "complaints"], "by_district": ["year", "district", "complaints"], "by_allegation_type": ["year", "allegation_type", "allegations"], "by_outcome": ["year", "outcome", "closures"], "quarterly": ["year", "quarter", "complaints"], } if breakdown not in required: raise PSNIValidationError(f"Unknown breakdown {breakdown!r}") missing = set(required[breakdown]) - set(df.columns) if missing: raise PSNIValidationError(f"Missing required columns for breakdown={breakdown!r}: {missing}") # Year range sanity check years = df["year"].dropna().astype(int) if years.empty: raise PSNIValidationError("No valid year values found") if years.min() < 2000 or years.max() > 2030: raise PSNIValidationError( f"Year values out of expected range [2000, 2030]: min={years.min()}, max={years.max()}" ) # Value column must be non-negative value_col = required[breakdown][-1] values = pd.to_numeric(df[value_col], errors="coerce").dropna() if (values < 0).any(): raise PSNIValidationError(f"Negative values found in column {value_col!r} for breakdown={breakdown!r}") logger.info( "validate_complaints passed: breakdown=%r, rows=%d, years=%d%d", breakdown, len(df), int(years.min()), int(years.max()), ) return True