Source code for bolster.data_sources.nisra.child_protection

"""NISRA Children's Social Care — Child Protection Statistics Module.

This module provides access to Northern Ireland's annual Children's Social Care
Statistics, focusing on the child protection chapter (referrals, investigations,
registrations, and register characteristics).

Data Coverage:
    - Children on the Child Protection Register (CPR) by age, sex, trust
    - CPR trend data from 31 March 2015 to present (annual snapshots)
    - Child Protection Referrals by trust and by referral source (2013/14–present)
    - Child Protection Investigations by type and trust (2015–present)
    - Category of abuse breakdowns (neglect, physical, sexual, emotional)
    - Registration duration on the CPR

HSC Trusts:
    - Belfast, Northern, South Eastern, Southern, Western

Data Source:
    Department of Health Northern Ireland publishes annual Children's Social Care
    Statistics covering the 12 months to 31 March each year. Child protection data
    is drawn from the Children Order Return (CPR series).

    Source: https://www.health-ni.gov.uk/topics/childrens-services-statistics
    Article: https://www.health-ni.gov.uk/articles/child-protection-register

Update Frequency:
    Annual, typically published in October for the year ending 31 March.

Example:
    >>> from bolster.data_sources.nisra import child_protection as cp
    >>> df = cp.get_latest_child_protection()
    >>> sorted(df.columns.tolist())
    ['category', 'measure', 'notes', 'subcategory', 'value', 'year']

    >>> # CPR trend data shows registrations back to 2015
    >>> trend = df[df['measure'] == 'cpr_registrations_ni_total']
    >>> 2015 in trend['year'].values
    True

Publication Details:
    - Frequency: Annual (year ending 31 March)
    - Published by: Department of Health, Community Information Branch
    - Latest: Children's Social Care Statistics for Northern Ireland 2024/25
    - Source: https://www.health-ni.gov.uk/topics/childrens-services-statistics
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRAValidationError, download_file, make_absolute_url


[docs]
logger = logging.getLogger(__name__)


# Base URLs

[docs]
HEALTH_NI_CHILDREN_ARTICLE = "https://www.health-ni.gov.uk/articles/child-protection-register"


[docs]
HEALTH_NI_BASE_URL = "https://www.health-ni.gov.uk"


# Expected columns in the long-format output

[docs]
REQUIRED_COLUMNS = {"year", "measure", "category", "subcategory", "value"}


# HSC Trust names (canonical form after normalisation)

[docs]
HSC_TRUSTS = ["Belfast", "Northern", "South Eastern", "Southern", "Western"]


# Known measures in output

[docs]
MEASURE_CPR_TOTAL = "cpr_registrations_ni_total"


[docs]
MEASURE_REFERRALS_TOTAL = "referrals_ni_total"


[docs]
MEASURE_INVESTIGATIONS_TOTAL = "investigations_ni_total"




[docs]
def get_child_protection_publication_url() -> str:
    r"""Find the latest Children's Social Care Statistics publication URL.

    Scrapes the Department of Health child protection register article page to find
    the most recent annual publication, then extracts the Excel download link.

    Returns:
        URL string for the latest Excel data file.

    Raises:
        NISRAValidationError: If publication page cannot be fetched or no Excel link found.

    Example:
        >>> url = get_child_protection_publication_url()
        >>> url.startswith("https://")
        True
        >>> url.endswith((".xlsx", ".XLSX", ".xls"))
        True
    """
    logger.info("Fetching child protection publications from %s", HEALTH_NI_CHILDREN_ARTICLE)

    try:
        response = session.get(HEALTH_NI_CHILDREN_ARTICLE, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRAValidationError(f"Failed to fetch child protection register page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find the link to the latest annual Children's Social Care Statistics publication
    pub_url = None
    for link in soup.find_all("a", href=True):
        text = link.get_text(strip=True).lower()
        href = link["href"]
        # Match "children's social care statistics" annual publications
        if "children" in text and "social care statistics" in text and ("202" in text or "publications" in href):
            pub_url = make_absolute_url(href, HEALTH_NI_BASE_URL)
            logger.info("Found publication link: %s -> %s", link.get_text(strip=True), pub_url)
            break

    if not pub_url:
        raise NISRAValidationError(
            f"Could not find Children's Social Care Statistics publication link on {HEALTH_NI_CHILDREN_ARTICLE}"
        )

    # Fetch the publication page to get the Excel download
    try:
        pub_response = session.get(pub_url, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRAValidationError(f"Failed to fetch publication page {pub_url}: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Look for the main Excel data file (Tables spreadsheet, not pre-release PDF)
    excel_url = None
    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True).lower()
        if (
            re.search(r"\.(xlsx|xls)$", href, re.IGNORECASE)
            and "pre-release" not in href.lower()
            and "pre-release" not in text
        ):
            excel_url = make_absolute_url(href, HEALTH_NI_BASE_URL)
            logger.info("Found Excel file: %s", excel_url)
            break

    if not excel_url:
        raise NISRAValidationError(f"Could not find Excel data file in publication page {pub_url}")

    return excel_url



def _normalise_trust(raw: str) -> str:
    """Normalise an HSC Trust name to its short canonical form.

    Args:
        raw: Raw trust name string (e.g. "Belfast HSC Trust", "Northern HSC Trust")

    Returns:
        Short canonical form (e.g. "Belfast", "Northern")
    """
    for trust in HSC_TRUSTS:
        if trust.lower() in str(raw).lower():
            return trust
    return str(raw).strip()


def _safe_int(val) -> int | None:
    """Convert a value to int, returning None for missing/suppressed values.

    Args:
        val: Value to convert (may be '[S]', '-', NaN, or numeric)

    Returns:
        Integer value or None
    """
    if val is None:
        return None
    s = str(val).strip()
    if s in ("", "nan", "-", "[S]", "[z]", "[c]"):
        return None
    try:
        return int(float(s))
    except (ValueError, TypeError):
        return None


def _parse_cpr_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]:
    """Parse Table 2.2a: CPR registrations by trust over time.

    Args:
        file_path: Path to the Excel file (used if content is None)
        content: Optional pre-loaded bytes

    Returns:
        List of record dicts with keys: year, measure, category, subcategory, value, notes
    """
    kwargs = {"sheet_name": "Table 2.2", "header": None}
    if content is not None:
        from io import BytesIO

        df = pd.read_excel(BytesIO(content), **kwargs)
    else:
        df = pd.read_excel(file_path, **kwargs)

    records = []

    # The sheet has two sub-tables: 2.2a (numbers) and 2.2b (rates per 10k)
    # 2.2a starts at row 0, 2.2b follows after 2.2a data
    # Identify header rows by looking for year columns
    current_subtable = None
    header_years = []

    for _idx, row in df.iterrows():
        cells = [str(c).strip() if str(c) != "nan" else "" for c in row]
        non_empty = [c for c in cells if c]

        if not non_empty:
            continue

        # Detect subtable header
        first = non_empty[0].lower()
        if "number of children" in first and "child protection register" in first:
            current_subtable = "count"
            continue
        if "rate of children" in first or "per 10,000" in first.lower():
            current_subtable = "rate"
            continue

        # Detect year header row
        if any(re.match(r"31 march \d{4}", c, re.IGNORECASE) for c in non_empty):
            header_years = []
            for c in cells[1:]:
                m = re.search(r"\d{4}", c)
                if m:
                    header_years.append(int(m.group()))
                elif c == "":
                    header_years.append(None)
            continue

        if header_years and current_subtable == "count":
            # First cell should be a trust or "Northern Ireland"
            trust_raw = cells[0]
            if not trust_raw or trust_raw.lower() in ("hsc trust", ""):
                continue
            trust = _normalise_trust(trust_raw)

            for i, year in enumerate(header_years):
                if year is None or i + 1 >= len(cells):
                    continue
                val = _safe_int(cells[i + 1])
                if val is not None:
                    records.append(
                        {
                            "year": year,
                            "measure": "cpr_registrations_trust_snapshot",
                            "category": "trust",
                            "subcategory": trust,
                            "value": val,
                            "notes": "Count at 31 March",
                        }
                    )

    return records


def _parse_referrals_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]:
    """Parse Table 2.9: Child Protection Referrals by HSC Trust over time.

    Args:
        file_path: Path to the Excel file (used if content is None)
        content: Optional pre-loaded bytes

    Returns:
        List of record dicts
    """
    kwargs = {"sheet_name": "Table 2.9", "header": None}
    if content is not None:
        from io import BytesIO

        df = pd.read_excel(BytesIO(content), **kwargs)
    else:
        df = pd.read_excel(file_path, **kwargs)

    records = []
    header_trusts = []

    for _idx, row in df.iterrows():
        cells = [str(c).strip() if str(c) != "nan" else "" for c in row]
        non_empty = [c for c in cells if c]

        if not non_empty:
            continue

        first = cells[0].lower() if cells[0] else ""

        # Detect the header row (trusts as columns)
        if "hsc trust" in first or first == "hsc trust":
            header_trusts = [_normalise_trust(c) for c in cells[1:] if c and c.lower() != "nan"]
            continue

        # Trust column header row with trust names
        if any("trust" in c.lower() for c in non_empty[1:]):
            header_trusts = []
            for c in cells[1:]:
                if c and c.lower() not in ("", "nan"):
                    header_trusts.append(_normalise_trust(c))
            continue

        # Data row: first cell is a financial year like "2013/14"
        if re.match(r"\d{4}/\d{2,4}", cells[0]):
            year_str = cells[0]
            # Derive year from end of financial year (e.g. 2013/14 -> 2014)
            m = re.match(r"(\d{4})/(\d{2,4})", year_str)
            if not m:
                continue
            end_year = int(m.group(1)) + 1

            # Last column is "Northern Ireland" total
            ni_total_idx = len(cells) - 1
            ni_val = _safe_int(cells[ni_total_idx])
            if ni_val is not None:
                records.append(
                    {
                        "year": end_year,
                        "measure": MEASURE_REFERRALS_TOTAL,
                        "category": "ni_total",
                        "subcategory": "Northern Ireland",
                        "value": ni_val,
                        "notes": f"Financial year {year_str}",
                    }
                )

            # Individual trusts
            for i, trust in enumerate(header_trusts):
                if trust == "Northern Ireland":
                    continue
                col_idx = i + 1
                if col_idx >= len(cells):
                    continue
                val = _safe_int(cells[col_idx])
                if val is not None:
                    records.append(
                        {
                            "year": end_year,
                            "measure": "referrals_by_trust",
                            "category": "trust",
                            "subcategory": trust,
                            "value": val,
                            "notes": f"Financial year {year_str}",
                        }
                    )

    return records


def _parse_cpr_snapshot(file_path: str | Path, content: bytes | None = None) -> list[dict]:
    """Parse Table 2.1: CPR registrations by age and sex for latest year.

    Args:
        file_path: Path to the Excel file (used if content is None)
        content: Optional pre-loaded bytes

    Returns:
        List of record dicts
    """
    kwargs = {"sheet_name": "Table 2.1", "header": None}
    if content is not None:
        from io import BytesIO

        df = pd.read_excel(BytesIO(content), **kwargs)
    else:
        df = pd.read_excel(file_path, **kwargs)

    records = []

    # Row 0: title (extract year), row 2: notes, row 3+: header with age/sex, data rows follow
    # We just want NI total row
    publication_year = None
    for idx, row in df.iterrows():
        cells = [str(c).strip() if str(c) != "nan" else "" for c in row]
        first = cells[0] if cells else ""

        # Extract year from title — handles "31 March 2025" or "2024/25"
        if idx == 0 and cells[0]:
            # Try "31 March YYYY" first
            m = re.search(r"31 March (\d{4})", cells[0], re.IGNORECASE)
            if m:
                publication_year = int(m.group(1))
            else:
                # Fallback: "YYYY/YY" financial year
                m = re.search(r"(\d{4})/(\d{2})", cells[0])
                if m:
                    publication_year = int(m.group(1)) + 1  # e.g. 2024/25 → 2025

        # "All" column: position 13 (0-indexed), "Northern Ireland" row
        if first.lower() == "northern ireland":
            all_val = _safe_int(cells[13]) if len(cells) > 13 else None
            if all_val is not None and publication_year:
                records.append(
                    {
                        "year": publication_year,
                        "measure": MEASURE_CPR_TOTAL,
                        "category": "ni_total",
                        "subcategory": "Northern Ireland",
                        "value": all_val,
                        "notes": f"Snapshot at 31 March {publication_year}",
                    }
                )
            break

    return records


def _parse_investigations_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]:
    """Parse Table 2.12: Child Protection Investigations by trust over time.

    Args:
        file_path: Path to the Excel file (used if content is None)
        content: Optional pre-loaded bytes

    Returns:
        List of record dicts
    """
    kwargs = {"sheet_name": "Table 2.12", "header": None}
    if content is not None:
        from io import BytesIO

        df = pd.read_excel(BytesIO(content), **kwargs)
    else:
        df = pd.read_excel(file_path, **kwargs)

    records = []
    header_years: list[int] = []

    for row_idx, row in df.iterrows():
        cells = [str(c).strip() if str(c) != "nan" else "" for c in row]
        non_empty = [c for c in cells if c]

        if not non_empty:
            continue

        first = cells[0].lower() if cells[0] else ""

        # Header row contains years
        if first in ("hsc trust", "trust") or (row_idx == 2 and any(re.match(r"\d{4}", c) for c in non_empty)):
            header_years = []
            for c in cells[1:]:
                m = re.match(r"(\d{4})", c)
                if m:
                    header_years.append(int(m.group(1)))
            continue

        if (
            header_years
            and "northern ireland" not in first
            and (any(t.lower() in first for t in ["belfast", "northern", "south", "southern", "western"]))
        ):
            trust = _normalise_trust(cells[0])
            for i, year in enumerate(header_years):
                col_idx = i + 1
                if col_idx >= len(cells):
                    continue
                val = _safe_int(cells[col_idx])
                if val is not None:
                    records.append(
                        {
                            "year": year,
                            "measure": "investigations_by_trust",
                            "category": "trust",
                            "subcategory": trust,
                            "value": val,
                            "notes": "Investigations year ending 31 March",
                        }
                    )

        elif header_years and "northern ireland" in first:
            for i, year in enumerate(header_years):
                col_idx = i + 1
                if col_idx >= len(cells):
                    continue
                val = _safe_int(cells[col_idx])
                if val is not None:
                    records.append(
                        {
                            "year": year,
                            "measure": MEASURE_INVESTIGATIONS_TOTAL,
                            "category": "ni_total",
                            "subcategory": "Northern Ireland",
                            "value": val,
                            "notes": "Investigations year ending 31 March",
                        }
                    )

    return records


def _parse_abuse_category_trend(file_path: str | Path, content: bytes | None = None) -> list[dict]:
    """Parse Table 2.5a: CPR registrations by abuse category over time.

    The sheet has a 2-column label structure: "Category of Abuse" and optionally
    "Main Category of Abuse" (for rows with combined categories), followed by year columns.

    Args:
        file_path: Path to the Excel file (used if content is None)
        content: Optional pre-loaded bytes

    Returns:
        List of record dicts
    """
    kwargs = {"sheet_name": "Table 2.5", "header": None}
    if content is not None:
        from io import BytesIO

        df = pd.read_excel(BytesIO(content), **kwargs)
    else:
        df = pd.read_excel(file_path, **kwargs)

    records = []
    header_years: list[int] = []
    in_subtable_a = False
    # Number of label columns before the year values
    n_label_cols = 1

    for _idx, row in df.iterrows():
        cells = [str(c).strip() if str(c) != "nan" else "" for c in row]
        non_empty = [c for c in cells if c]

        if not non_empty:
            continue

        first_lower = cells[0].lower() if cells[0] else ""

        # Detect start of subtable 2.5a (counts)
        if "number of children" in first_lower or ("table 2.5a" in first_lower):
            in_subtable_a = True
            header_years = []
            continue

        # Detect start of subtable 2.5b (percentages) — stop parsing
        if in_subtable_a and ("percentage" in first_lower or "table 2.5b" in first_lower):
            break

        # Detect the header row (contains year integers)
        if in_subtable_a and any(re.match(r"^\d{4}$", c) for c in non_empty):
            header_years = []
            n_label_cols = 0
            for ci, c in enumerate(cells):
                if re.match(r"^\d{4}$", c):
                    if n_label_cols == 0:
                        n_label_cols = ci  # label cols before first year
                    header_years.append(int(c))
            if n_label_cols == 0:
                n_label_cols = 1
            continue

        # Also handle float-formatted years like "2015.0"
        if in_subtable_a and any(re.match(r"^\d{4}\.0$", c) for c in non_empty):
            header_years = []
            n_label_cols = 0
            for ci, c in enumerate(cells):
                m = re.match(r"^(\d{4})\.0$", c)
                if m:
                    if n_label_cols == 0:
                        n_label_cols = ci
                    header_years.append(int(m.group(1)))
            if n_label_cols == 0:
                n_label_cols = 1
            continue

        if in_subtable_a and header_years and first_lower not in ("", "category of abuse"):
            if "note" in first_lower or "please" in first_lower or "large proportion" in first_lower:
                continue

            # Category name: use first non-empty label column
            category_raw = cells[0].strip()
            if not category_raw:
                continue

            # Values start at n_label_cols
            for i, year in enumerate(header_years):
                col_idx = n_label_cols + i
                if col_idx >= len(cells):
                    continue
                val = _safe_int(cells[col_idx])
                if val is not None:
                    records.append(
                        {
                            "year": year,
                            "measure": "cpr_by_abuse_category",
                            "category": "abuse_category",
                            "subcategory": category_raw,
                            "value": val,
                            "notes": "Count at 31 March",
                        }
                    )

    return records



[docs]
def parse_child_protection_file(file_path: str | Path) -> pd.DataFrame:
    """Parse a Children's Social Care Statistics Excel file into long-format DataFrame.

    Extracts child protection data from the following tables:
    - Table 2.1: CPR snapshot (total registrations for latest year)
    - Table 2.2a: CPR registrations by trust, 2015–present
    - Table 2.5a: CPR by category of abuse, 2015–present
    - Table 2.9: Child protection referrals by trust, 2013/14–present
    - Table 2.12: Child protection investigations by trust, 2015–present

    Args:
        file_path: Path to the Excel file.

    Returns:
        DataFrame with columns: year (int), measure (str), category (str),
        subcategory (str), value (int), notes (str).

    Raises:
        NISRAValidationError: If the file cannot be parsed or contains no data.
    """
    file_path = Path(file_path)
    logger.info("Parsing child protection file: %s", file_path)

    records: list[dict] = []

    try:
        records.extend(_parse_cpr_snapshot(file_path))
        records.extend(_parse_cpr_trend(file_path))
        records.extend(_parse_referrals_trend(file_path))
        records.extend(_parse_investigations_trend(file_path))
        records.extend(_parse_abuse_category_trend(file_path))
    except Exception as e:
        raise NISRAValidationError(f"Failed to parse {file_path}: {e}") from e

    if not records:
        raise NISRAValidationError(f"No child protection data extracted from {file_path}")

    df = pd.DataFrame(records)

    # Ensure correct dtypes
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    df["value"] = pd.to_numeric(df["value"], errors="coerce").astype("Int64")
    df["measure"] = df["measure"].astype(str)
    df["category"] = df["category"].astype(str)
    df["subcategory"] = df["subcategory"].astype(str)
    df["notes"] = df["notes"].astype(str)

    return df[["year", "measure", "category", "subcategory", "value", "notes"]].copy()




[docs]
def get_latest_child_protection(force_refresh: bool = False) -> pd.DataFrame:
    """Download and parse the latest Children's Social Care Statistics.

    Fetches the most recent annual publication from the Department of Health
    and returns child protection data in long format.

    Args:
        force_refresh: Force re-download even if cached (default: False).

    Returns:
        DataFrame with columns: year, measure, category, subcategory, value, notes.

    Raises:
        NISRAValidationError: If download or parsing fails.
    """
    excel_url = get_child_protection_publication_url()
    file_path = download_file(excel_url, force_refresh=force_refresh)
    return parse_child_protection_file(file_path)




[docs]
def validate_child_protection_data(df: pd.DataFrame) -> pd.DataFrame:
    """Validate child protection DataFrame and raise on structural issues.

    Checks for:
    - Non-empty DataFrame
    - Required columns present
    - No negative values

    Args:
        df: DataFrame to validate.

    Returns:
        The input DataFrame (unchanged) if validation passes.

    Raises:
        NISRAValidationError: If any check fails.
    """
    if df is None or df.empty:
        raise NISRAValidationError("Child protection data is empty")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise NISRAValidationError(f"Missing required columns: {sorted(missing)}")

    neg = df["value"].dropna()
    neg = neg[neg < 0]
    if len(neg) > 0:
        raise NISRAValidationError(f"Found {len(neg)} negative values in 'value' column")

    return df