Source code for bolster.data_sources.nisra.child_protection

"""NISRA Children's Social Care — Child Protection Statistics Module.

This module provides access to Northern Ireland's annual Children's Social Care
Statistics, focusing on the child protection chapter (referrals, investigations,
registrations, and register characteristics).

Data Coverage:
    - Children on the Child Protection Register (CPR) by age, sex, trust
    - CPR trend data from 31 March 2015 to present (annual snapshots)
    - Child Protection Referrals by trust and by referral source (2013/14–present)
    - Child Protection Investigations by type and trust (2015–present)
    - Category of abuse breakdowns (neglect, physical, sexual, emotional)
    - Registration duration on the CPR

HSC Trusts:
    - Belfast, Northern, South Eastern, Southern, Western

Data Source:
    Department of Health Northern Ireland publishes annual Children's Social Care
    Statistics covering the 12 months to 31 March each year. Child protection data
    is drawn from the Children Order Return (CPR series).

    Source: https://www.health-ni.gov.uk/topics/childrens-services-statistics
    Article: https://www.health-ni.gov.uk/articles/child-protection-register

Update Frequency:
    Annual, typically published in October for the year ending 31 March.

Example:
    >>> from bolster.data_sources.nisra import child_protection as cp
    >>> df = cp.get_latest_child_protection()
    >>> sorted(df.columns.tolist())
    ['category', 'measure', 'notes', 'subcategory', 'value', 'year']

    >>> # CPR trend data shows registrations back to 2015
    >>> trend = df[df['measure'] == 'cpr_registrations_ni_total']
    >>> 2015 in trend['year'].values
    True

Publication Details:
    - Frequency: Annual (year ending 31 March)
    - Published by: Department of Health, Community Information Branch
    - Latest: Children's Social Care Statistics for Northern Ireland 2024/25
    - Source: https://www.health-ni.gov.uk/topics/childrens-services-statistics
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRAValidationError, download_file, make_absolute_url

[docs] logger = logging.getLogger(__name__)
# Base URLs
[docs] HEALTH_NI_CHILDREN_ARTICLE = "https://www.health-ni.gov.uk/articles/child-protection-register"
[docs] HEALTH_NI_BASE_URL = "https://www.health-ni.gov.uk"
# Expected columns in the long-format output
[docs] REQUIRED_COLUMNS = {"year", "measure", "category", "subcategory", "value"}
# HSC Trust names (canonical form after normalisation)
[docs] HSC_TRUSTS = ["Belfast", "Northern", "South Eastern", "Southern", "Western"]
# Known measures in output
[docs] MEASURE_CPR_TOTAL = "cpr_registrations_ni_total"
[docs] MEASURE_REFERRALS_TOTAL = "referrals_ni_total"
[docs] MEASURE_INVESTIGATIONS_TOTAL = "investigations_ni_total"
[docs] def get_child_protection_publication_url() -> str: r"""Find the latest Children's Social Care Statistics publication URL. Scrapes the Department of Health child protection register article page to find the most recent annual publication, then extracts the Excel download link. Returns: URL string for the latest Excel data file. Raises: NISRAValidationError: If publication page cannot be fetched or no Excel link found. Example: >>> url = get_child_protection_publication_url() >>> url.startswith("https://") True >>> url.endswith((".xlsx", ".XLSX", ".xls")) True """ logger.info("Fetching child protection publications from %s", HEALTH_NI_CHILDREN_ARTICLE) try: response = session.get(HEALTH_NI_CHILDREN_ARTICLE, timeout=30) response.raise_for_status() except Exception as e: raise NISRAValidationError(f"Failed to fetch child protection register page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find the link to the latest annual Children's Social Care Statistics publication pub_url = None for link in soup.find_all("a", href=True): text = link.get_text(strip=True).lower() href = link["href"] # Match "children's social care statistics" annual publications if "children" in text and "social care statistics" in text and ("202" in text or "publications" in href): pub_url = make_absolute_url(href, HEALTH_NI_BASE_URL) logger.info("Found publication link: %s -> %s", link.get_text(strip=True), pub_url) break if not pub_url: raise NISRAValidationError( f"Could not find Children's Social Care Statistics publication link on {HEALTH_NI_CHILDREN_ARTICLE}" ) # Fetch the publication page to get the Excel download try: pub_response = session.get(pub_url, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRAValidationError(f"Failed to fetch publication page {pub_url}: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Look for the main Excel data file (Tables spreadsheet, not pre-release PDF) excel_url = None for link in pub_soup.find_all("a", href=True): href = link["href"] text = link.get_text(strip=True).lower() if ( re.search(r"\.(xlsx|xls)$", href, re.IGNORECASE) and "pre-release" not in href.lower() and "pre-release" not in text ): excel_url = make_absolute_url(href, HEALTH_NI_BASE_URL) logger.info("Found Excel file: %s", excel_url) break if not excel_url: raise NISRAValidationError(f"Could not find Excel data file in publication page {pub_url}") return excel_url
def _normalise_trust(raw: str) -> str: """Normalise an HSC Trust name to its short canonical form. Args: raw: Raw trust name string (e.g. "Belfast HSC Trust", "Northern HSC Trust") Returns: Short canonical form (e.g. "Belfast", "Northern") """ for trust in HSC_TRUSTS: if trust.lower() in str(raw).lower(): return trust return str(raw).strip() def _safe_int(val) -> int | None: """Convert a value to int, returning None for missing/suppressed values. Args: val: Value to convert (may be '[S]', '-', NaN, or numeric) Returns: Integer value or None """ if val is None: return None s = str(val).strip() if s in ("", "nan", "-", "[S]", "[z]", "[c]"): return None try: return int(float(s)) except (ValueError, TypeError): return None def _parse_cpr_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]: """Parse Table 2.2a: CPR registrations by trust over time. Args: file_path: Path to the Excel file (used if content is None) content: Optional pre-loaded bytes Returns: List of record dicts with keys: year, measure, category, subcategory, value, notes """ kwargs = {"sheet_name": "Table 2.2", "header": None} if content is not None: from io import BytesIO df = pd.read_excel(BytesIO(content), **kwargs) else: df = pd.read_excel(file_path, **kwargs) records = [] # The sheet has two sub-tables: 2.2a (numbers) and 2.2b (rates per 10k) # 2.2a starts at row 0, 2.2b follows after 2.2a data # Identify header rows by looking for year columns current_subtable = None header_years = [] for _idx, row in df.iterrows(): cells = [str(c).strip() if str(c) != "nan" else "" for c in row] non_empty = [c for c in cells if c] if not non_empty: continue # Detect subtable header first = non_empty[0].lower() if "number of children" in first and "child protection register" in first: current_subtable = "count" continue if "rate of children" in first or "per 10,000" in first.lower(): current_subtable = "rate" continue # Detect year header row if any(re.match(r"31 march \d{4}", c, re.IGNORECASE) for c in non_empty): header_years = [] for c in cells[1:]: m = re.search(r"\d{4}", c) if m: header_years.append(int(m.group())) elif c == "": header_years.append(None) continue if header_years and current_subtable == "count": # First cell should be a trust or "Northern Ireland" trust_raw = cells[0] if not trust_raw or trust_raw.lower() in ("hsc trust", ""): continue trust = _normalise_trust(trust_raw) for i, year in enumerate(header_years): if year is None or i + 1 >= len(cells): continue val = _safe_int(cells[i + 1]) if val is not None: records.append( { "year": year, "measure": "cpr_registrations_trust_snapshot", "category": "trust", "subcategory": trust, "value": val, "notes": "Count at 31 March", } ) return records def _parse_referrals_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]: """Parse Table 2.9: Child Protection Referrals by HSC Trust over time. Args: file_path: Path to the Excel file (used if content is None) content: Optional pre-loaded bytes Returns: List of record dicts """ kwargs = {"sheet_name": "Table 2.9", "header": None} if content is not None: from io import BytesIO df = pd.read_excel(BytesIO(content), **kwargs) else: df = pd.read_excel(file_path, **kwargs) records = [] header_trusts = [] for _idx, row in df.iterrows(): cells = [str(c).strip() if str(c) != "nan" else "" for c in row] non_empty = [c for c in cells if c] if not non_empty: continue first = cells[0].lower() if cells[0] else "" # Detect the header row (trusts as columns) if "hsc trust" in first or first == "hsc trust": header_trusts = [_normalise_trust(c) for c in cells[1:] if c and c.lower() != "nan"] continue # Trust column header row with trust names if any("trust" in c.lower() for c in non_empty[1:]): header_trusts = [] for c in cells[1:]: if c and c.lower() not in ("", "nan"): header_trusts.append(_normalise_trust(c)) continue # Data row: first cell is a financial year like "2013/14" if re.match(r"\d{4}/\d{2,4}", cells[0]): year_str = cells[0] # Derive year from end of financial year (e.g. 2013/14 -> 2014) m = re.match(r"(\d{4})/(\d{2,4})", year_str) if not m: continue end_year = int(m.group(1)) + 1 # Last column is "Northern Ireland" total ni_total_idx = len(cells) - 1 ni_val = _safe_int(cells[ni_total_idx]) if ni_val is not None: records.append( { "year": end_year, "measure": MEASURE_REFERRALS_TOTAL, "category": "ni_total", "subcategory": "Northern Ireland", "value": ni_val, "notes": f"Financial year {year_str}", } ) # Individual trusts for i, trust in enumerate(header_trusts): if trust == "Northern Ireland": continue col_idx = i + 1 if col_idx >= len(cells): continue val = _safe_int(cells[col_idx]) if val is not None: records.append( { "year": end_year, "measure": "referrals_by_trust", "category": "trust", "subcategory": trust, "value": val, "notes": f"Financial year {year_str}", } ) return records def _parse_cpr_snapshot(file_path: str | Path, content: bytes | None = None) -> list[dict]: """Parse Table 2.1: CPR registrations by age and sex for latest year. Args: file_path: Path to the Excel file (used if content is None) content: Optional pre-loaded bytes Returns: List of record dicts """ kwargs = {"sheet_name": "Table 2.1", "header": None} if content is not None: from io import BytesIO df = pd.read_excel(BytesIO(content), **kwargs) else: df = pd.read_excel(file_path, **kwargs) records = [] # Row 0: title (extract year), row 2: notes, row 3+: header with age/sex, data rows follow # We just want NI total row publication_year = None for idx, row in df.iterrows(): cells = [str(c).strip() if str(c) != "nan" else "" for c in row] first = cells[0] if cells else "" # Extract year from title — handles "31 March 2025" or "2024/25" if idx == 0 and cells[0]: # Try "31 March YYYY" first m = re.search(r"31 March (\d{4})", cells[0], re.IGNORECASE) if m: publication_year = int(m.group(1)) else: # Fallback: "YYYY/YY" financial year m = re.search(r"(\d{4})/(\d{2})", cells[0]) if m: publication_year = int(m.group(1)) + 1 # e.g. 2024/25 → 2025 # "All" column: position 13 (0-indexed), "Northern Ireland" row if first.lower() == "northern ireland": all_val = _safe_int(cells[13]) if len(cells) > 13 else None if all_val is not None and publication_year: records.append( { "year": publication_year, "measure": MEASURE_CPR_TOTAL, "category": "ni_total", "subcategory": "Northern Ireland", "value": all_val, "notes": f"Snapshot at 31 March {publication_year}", } ) break return records def _parse_investigations_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]: """Parse Table 2.12: Child Protection Investigations by trust over time. Args: file_path: Path to the Excel file (used if content is None) content: Optional pre-loaded bytes Returns: List of record dicts """ kwargs = {"sheet_name": "Table 2.12", "header": None} if content is not None: from io import BytesIO df = pd.read_excel(BytesIO(content), **kwargs) else: df = pd.read_excel(file_path, **kwargs) records = [] header_years: list[int] = [] for row_idx, row in df.iterrows(): cells = [str(c).strip() if str(c) != "nan" else "" for c in row] non_empty = [c for c in cells if c] if not non_empty: continue first = cells[0].lower() if cells[0] else "" # Header row contains years if first in ("hsc trust", "trust") or (row_idx == 2 and any(re.match(r"\d{4}", c) for c in non_empty)): header_years = [] for c in cells[1:]: m = re.match(r"(\d{4})", c) if m: header_years.append(int(m.group(1))) continue if ( header_years and "northern ireland" not in first and (any(t.lower() in first for t in ["belfast", "northern", "south", "southern", "western"])) ): trust = _normalise_trust(cells[0]) for i, year in enumerate(header_years): col_idx = i + 1 if col_idx >= len(cells): continue val = _safe_int(cells[col_idx]) if val is not None: records.append( { "year": year, "measure": "investigations_by_trust", "category": "trust", "subcategory": trust, "value": val, "notes": "Investigations year ending 31 March", } ) elif header_years and "northern ireland" in first: for i, year in enumerate(header_years): col_idx = i + 1 if col_idx >= len(cells): continue val = _safe_int(cells[col_idx]) if val is not None: records.append( { "year": year, "measure": MEASURE_INVESTIGATIONS_TOTAL, "category": "ni_total", "subcategory": "Northern Ireland", "value": val, "notes": "Investigations year ending 31 March", } ) return records def _parse_abuse_category_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]: """Parse Table 2.5a: CPR registrations by abuse category over time. The sheet has a 2-column label structure: "Category of Abuse" and optionally "Main Category of Abuse" (for rows with combined categories), followed by year columns. Args: file_path: Path to the Excel file (used if content is None) content: Optional pre-loaded bytes Returns: List of record dicts """ kwargs = {"sheet_name": "Table 2.5", "header": None} if content is not None: from io import BytesIO df = pd.read_excel(BytesIO(content), **kwargs) else: df = pd.read_excel(file_path, **kwargs) records = [] header_years: list[int] = [] in_subtable_a = False # Number of label columns before the year values n_label_cols = 1 for _idx, row in df.iterrows(): cells = [str(c).strip() if str(c) != "nan" else "" for c in row] non_empty = [c for c in cells if c] if not non_empty: continue first_lower = cells[0].lower() if cells[0] else "" # Detect start of subtable 2.5a (counts) if "number of children" in first_lower or ("table 2.5a" in first_lower): in_subtable_a = True header_years = [] continue # Detect start of subtable 2.5b (percentages) — stop parsing if in_subtable_a and ("percentage" in first_lower or "table 2.5b" in first_lower): break # Detect the header row (contains year integers) if in_subtable_a and any(re.match(r"^\d{4}$", c) for c in non_empty): header_years = [] n_label_cols = 0 for ci, c in enumerate(cells): if re.match(r"^\d{4}$", c): if n_label_cols == 0: n_label_cols = ci # label cols before first year header_years.append(int(c)) if n_label_cols == 0: n_label_cols = 1 continue # Also handle float-formatted years like "2015.0" if in_subtable_a and any(re.match(r"^\d{4}\.0$", c) for c in non_empty): header_years = [] n_label_cols = 0 for ci, c in enumerate(cells): m = re.match(r"^(\d{4})\.0$", c) if m: if n_label_cols == 0: n_label_cols = ci header_years.append(int(m.group(1))) if n_label_cols == 0: n_label_cols = 1 continue if in_subtable_a and header_years and first_lower not in ("", "category of abuse"): if "note" in first_lower or "please" in first_lower or "large proportion" in first_lower: continue # Category name: use first non-empty label column category_raw = cells[0].strip() if not category_raw: continue # Values start at n_label_cols for i, year in enumerate(header_years): col_idx = n_label_cols + i if col_idx >= len(cells): continue val = _safe_int(cells[col_idx]) if val is not None: records.append( { "year": year, "measure": "cpr_by_abuse_category", "category": "abuse_category", "subcategory": category_raw, "value": val, "notes": "Count at 31 March", } ) return records
[docs] def parse_child_protection_file(file_path: str | Path) -> pd.DataFrame: """Parse a Children's Social Care Statistics Excel file into long-format DataFrame. Extracts child protection data from the following tables: - Table 2.1: CPR snapshot (total registrations for latest year) - Table 2.2a: CPR registrations by trust, 2015–present - Table 2.5a: CPR by category of abuse, 2015–present - Table 2.9: Child protection referrals by trust, 2013/14–present - Table 2.12: Child protection investigations by trust, 2015–present Args: file_path: Path to the Excel file. Returns: DataFrame with columns: year (int), measure (str), category (str), subcategory (str), value (int), notes (str). Raises: NISRAValidationError: If the file cannot be parsed or contains no data. """ file_path = Path(file_path) logger.info("Parsing child protection file: %s", file_path) records: list[dict] = [] try: records.extend(_parse_cpr_snapshot(file_path)) records.extend(_parse_cpr_trend(file_path)) records.extend(_parse_referrals_trend(file_path)) records.extend(_parse_investigations_trend(file_path)) records.extend(_parse_abuse_category_trend(file_path)) except Exception as e: raise NISRAValidationError(f"Failed to parse {file_path}: {e}") from e if not records: raise NISRAValidationError(f"No child protection data extracted from {file_path}") df = pd.DataFrame(records) # Ensure correct dtypes df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64") df["value"] = pd.to_numeric(df["value"], errors="coerce").astype("Int64") df["measure"] = df["measure"].astype(str) df["category"] = df["category"].astype(str) df["subcategory"] = df["subcategory"].astype(str) df["notes"] = df["notes"].astype(str) return df[["year", "measure", "category", "subcategory", "value", "notes"]].copy()
[docs] def get_latest_child_protection(force_refresh: bool = False) -> pd.DataFrame: """Download and parse the latest Children's Social Care Statistics. Fetches the most recent annual publication from the Department of Health and returns child protection data in long format. Args: force_refresh: Force re-download even if cached (default: False). Returns: DataFrame with columns: year, measure, category, subcategory, value, notes. Raises: NISRAValidationError: If download or parsing fails. """ excel_url = get_child_protection_publication_url() file_path = download_file(excel_url, force_refresh=force_refresh) return parse_child_protection_file(file_path)
[docs] def validate_child_protection_data(df: pd.DataFrame) -> pd.DataFrame: """Validate child protection DataFrame and raise on structural issues. Checks for: - Non-empty DataFrame - Required columns present - No negative values Args: df: DataFrame to validate. Returns: The input DataFrame (unchanged) if validation passes. Raises: NISRAValidationError: If any check fails. """ if df is None or df.empty: raise NISRAValidationError("Child protection data is empty") missing = REQUIRED_COLUMNS - set(df.columns) if missing: raise NISRAValidationError(f"Missing required columns: {sorted(missing)}") neg = df["value"].dropna() neg = neg[neg < 0] if len(neg) > 0: raise NISRAValidationError(f"Found {len(neg)} negative values in 'value' column") return df