Source code for bolster.data_sources.education_suspensions

"""Pupil Suspensions and Expulsions in Northern Ireland.

Provides access to annual suspension and expulsion statistics for pupils of
compulsory school age in Northern Ireland, published by the Department of
Education Northern Ireland (DE NI).

Data covers pupil suspensions broken down by:
- Trend over time (Table 1, from 2011/12 to present)
- School type (Primary, Non Grammar, Grammar, Special) (Table 2)
- School management type (Controlled, Voluntary, etc.) (Table 3)
- Number of suspension occasions (Once, Twice, Three or more) (Table 4)
- Suspension duration (Table 5)
- Key Stage (Foundation/KS1, KS2, KS3, KS4) (Table 6)
- Pupil characteristics (sex, age, ethnicity, SEN, religion) (Table 7)
- Education Authority Region (Table 8)
- Reason for suspension (Table 9)

Data Source:
    **Stable Entry Point**: https://www.education-ni.gov.uk/articles/pupil-suspensions-and-expulsions

    The module scrapes the articles page to find the latest publication link,
    then scrapes the publication page for the Excel data file URL.

Update Frequency: Annual
Geographic Coverage: Northern Ireland
Reference Period: 2011/12 – present

Example:
    >>> from bolster.data_sources.education_suspensions import get_latest_suspensions
    >>> df = get_latest_suspensions()
    >>> 'academic_year' in df.columns
    True
"""

import hashlib
import logging
from pathlib import Path
from urllib.parse import urljoin

import bs4
import pandas as pd

from bolster.utils.web import session


[docs]
logger = logging.getLogger(__name__)


# Stable entry point – lists all annual publications

[docs]
ARTICLES_URL = "https://www.education-ni.gov.uk/articles/pupil-suspensions-and-expulsions"


[docs]
BASE_URL = "https://www.education-ni.gov.uk"


# Cache directory for downloaded files (annual data – 7 day TTL is ample)

[docs]
CACHE_DIR = Path.home() / ".cache" / "bolster" / "education_suspensions"

CACHE_DIR.mkdir(parents=True, exist_ok=True)

_CACHE_TTL_HOURS = 24 * 7  # 1 week


# ---------------------------------------------------------------------------
# Errors
# ---------------------------------------------------------------------------



[docs]
class EducationSuspensionsError(Exception):
    """Base exception for education suspensions data errors."""




[docs]
class EducationSuspensionsNotFoundError(EducationSuspensionsError):
    """Raised when the data file or publication page cannot be found."""




[docs]
class EducationSuspensionsParseError(EducationSuspensionsError):
    """Raised when parsing the data file fails in an unexpected way."""



# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _hash_url(url: str) -> str:
    return hashlib.md5(url.encode()).hexdigest()


def _cached_file(url: str) -> Path | None:
    """Return path to a cached copy of *url* if it exists and is fresh."""
    url_hash = _hash_url(url)
    ext = Path(url.split("?")[0]).suffix or ".xlsx"
    cache_path = CACHE_DIR / f"{url_hash}{ext}"
    if cache_path.exists():
        from datetime import datetime

        age_hours = (datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)).total_seconds() / 3600
        if age_hours < _CACHE_TTL_HOURS:
            logger.debug("Using cached file: %s", cache_path)
            return cache_path
    return None


def _download_file(url: str, force_refresh: bool = False) -> Path:
    """Download *url* to the local cache, returning the cache path."""
    if not force_refresh:
        cached = _cached_file(url)
        if cached:
            return cached

    url_hash = _hash_url(url)
    ext = Path(url.split("?")[0]).suffix or ".xlsx"
    cache_path = CACHE_DIR / f"{url_hash}{ext}"

    try:
        logger.info("Downloading %s", url)
        response = session.get(url, timeout=60)
        response.raise_for_status()
        cache_path.write_bytes(response.content)
        logger.info("Saved to %s", cache_path)
        return cache_path
    except Exception as exc:
        raise EducationSuspensionsNotFoundError(f"Failed to download {url}: {exc}") from exc


def _extract_data_rows(ws_rows: list[tuple], header_row_idx: int) -> pd.DataFrame:
    """Build a DataFrame from raw worksheet rows given the 0-based header row index."""
    header = ws_rows[header_row_idx]
    data = ws_rows[header_row_idx + 1 :]
    df = pd.DataFrame(data, columns=header)
    # Drop fully-None columns and fully-None rows
    df = df.dropna(axis=1, how="all").dropna(axis=0, how="all")
    # The first column of every table is None (blank leading column in Excel)
    if df.columns[0] is None:
        df = df.iloc[:, 1:]
    return df.reset_index(drop=True)


# ---------------------------------------------------------------------------
# Public: URL discovery
# ---------------------------------------------------------------------------



[docs]
def get_suspensions_publication_url() -> str:
    """Scrape education-ni.gov.uk to find the latest suspensions XLSX URL.

    The function follows a two-step chain:
    1. Fetches the stable articles page and extracts the most recent
       publications link for pupil suspensions/expulsions.
    2. Fetches that publication page and extracts the XLSX download link.

    Returns:
        Absolute URL of the latest suspensions XLSX file.

    Raises:
        EducationSuspensionsNotFoundError: If the publication or XLSX link
            cannot be found.
    """
    # Step 1 – find the most recent publication page
    try:
        resp = session.get(ARTICLES_URL, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        raise EducationSuspensionsNotFoundError(f"Failed to fetch articles page {ARTICLES_URL}: {exc}") from exc

    soup = bs4.BeautifulSoup(resp.content, "html.parser")
    pub_href = None
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "suspension" in href.lower() and "/publications/" in href:
            pub_href = href
            break  # first match is the most recent

    if not pub_href:
        raise EducationSuspensionsNotFoundError(f"Could not find a suspensions publication link on {ARTICLES_URL}")

    pub_url = urljoin(BASE_URL, pub_href)
    logger.info("Found publication page: %s", pub_url)

    # Step 2 – find the XLSX on that publication page
    try:
        resp2 = session.get(pub_url, timeout=30)
        resp2.raise_for_status()
    except Exception as exc:
        raise EducationSuspensionsNotFoundError(f"Failed to fetch publication page {pub_url}: {exc}") from exc

    soup2 = bs4.BeautifulSoup(resp2.content, "html.parser")
    for a in soup2.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".xlsx") or href.lower().endswith(".xls"):
            xlsx_url = href if href.startswith("http") else urljoin(BASE_URL, href)
            logger.info("Found XLSX URL: %s", xlsx_url)
            return xlsx_url

    raise EducationSuspensionsNotFoundError(f"Could not find XLSX link on publication page {pub_url}")



# ---------------------------------------------------------------------------
# Public: Parsing
# ---------------------------------------------------------------------------



[docs]
def parse_suspensions_file(file_path: str | Path) -> pd.DataFrame:
    """Parse the DE NI suspensions XLSX file into a tidy DataFrame.

    Reads Table 1 (annual trend) and produces one row per academic year
    with standardised column names.  Table 1 is the only table with a
    multi-year time series; the remaining tables are single-year snapshots
    and are intentionally not included in the tidy output (use
    :func:`parse_all_tables` for those).

    Args:
        file_path: Path to the downloaded XLSX file.

    Returns:
        DataFrame with columns:
            - ``academic_year``: Academic year string, e.g. ``"2024/25"``
            - ``pupils_suspended``: Number of pupils suspended (int)
            - ``pct_pupils_suspended``: Percentage of all pupils suspended (float, 0–1)

    Raises:
        EducationSuspensionsParseError: If the file cannot be parsed.
    """
    try:
        wb_dict = pd.read_excel(file_path, sheet_name=None, header=None)
    except Exception as exc:
        raise EducationSuspensionsParseError(f"Failed to read Excel file {file_path}: {exc}") from exc

    if "Table 1" not in wb_dict:
        raise EducationSuspensionsParseError("Expected sheet 'Table 1' not found in workbook")

    raw = wb_dict["Table 1"]

    # Locate header row: the row that contains 'Year' in column B (index 1)
    header_row = None
    for i, row in raw.iterrows():
        if str(row.iloc[1]).strip() == "Year":
            header_row = i
            break

    if header_row is None:
        raise EducationSuspensionsParseError("Could not locate header row in Table 1")

    # Slice from header row onwards
    df = raw.iloc[header_row:].copy()
    df.columns = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)

    # Keep only columns B (Year) and C (pupils suspended) and D (percentage)
    # The Excel has a blank leading column A, so after reset our useful columns
    # are at positions 1, 2, 3 (0-based).
    df = df.iloc[:, 1:4].copy()
    df.columns = ["academic_year", "pupils_suspended", "pct_pupils_suspended"]

    # Drop summary/footer rows (non-year values in academic_year column)
    df = df[df["academic_year"].notna()].copy()
    df = df[df["academic_year"].astype(str).str.match(r"^\d{4}/\d{2}$")].copy()

    # Coerce types
    df["academic_year"] = df["academic_year"].astype(str)
    df["pupils_suspended"] = pd.to_numeric(df["pupils_suspended"], errors="coerce").astype("Int64")
    df["pct_pupils_suspended"] = pd.to_numeric(df["pct_pupils_suspended"], errors="coerce")

    return df.reset_index(drop=True)




[docs]
def parse_all_tables(file_path: str | Path) -> dict[str, pd.DataFrame]:
    """Parse all tables from the DE NI suspensions XLSX into a dict of DataFrames.

    Each table is lightly cleaned (blank leading column removed, empty rows/cols
    dropped) but otherwise returned in its natural structure.

    Args:
        file_path: Path to the downloaded XLSX file.

    Returns:
        Dictionary mapping sheet names to DataFrames.

    Raises:
        EducationSuspensionsParseError: If the file cannot be read.
    """
    try:
        wb_dict = pd.read_excel(file_path, sheet_name=None, header=None)
    except Exception as exc:
        raise EducationSuspensionsParseError(f"Failed to read Excel file {file_path}: {exc}") from exc

    result: dict[str, pd.DataFrame] = {}
    for sheet_name, raw_df in wb_dict.items():
        if sheet_name == "Table of contents":
            continue
        # Locate header row (first row after the title rows, identified by
        # a non-None value in column B that is a string)
        header_row = None
        for i, row in raw_df.iterrows():
            val = row.iloc[1]
            if val is not None and isinstance(val, str) and val.strip() and not val.strip().startswith("Table"):
                header_row = i
                break
        if header_row is None:
            result[sheet_name] = raw_df
            continue

        df = raw_df.iloc[header_row:].copy()
        df.columns = df.iloc[0]
        df = df.iloc[1:].reset_index(drop=True)
        # Drop blank leading column
        if df.columns[0] is None:
            df = df.iloc[:, 1:]
        # Drop fully None columns and rows, and rows after the first total/source row
        df = df.dropna(axis=1, how="all").dropna(axis=0, how="all")
        # Truncate at rows where all data columns are None (footer notes)
        first_col = df.columns[0]
        footer_mask = df[first_col].isna() | df[first_col].astype(str).str.startswith("Source")
        if footer_mask.any():
            first_footer = footer_mask.idxmax()
            df = df.loc[: first_footer - 1]
        result[sheet_name] = df.reset_index(drop=True)

    return result



# ---------------------------------------------------------------------------
# Public: High-level entry points
# ---------------------------------------------------------------------------



[docs]
def get_latest_suspensions(force_refresh: bool = False) -> pd.DataFrame:
    """Download and return the latest NI pupil suspensions time-series data.

    This is the main entry point.  Returns Table 1 (annual trend) parsed
    into a tidy DataFrame via :func:`parse_suspensions_file`.

    Args:
        force_refresh: If ``True``, bypass the local cache and re-download.

    Returns:
        DataFrame with columns ``academic_year``, ``pupils_suspended``,
        ``pct_pupils_suspended``.

    Raises:
        EducationSuspensionsNotFoundError: If the source cannot be located.
        EducationSuspensionsParseError: If the file cannot be parsed.

    Example:
        >>> df = get_latest_suspensions()
        >>> 'academic_year' in df.columns
        True
    """
    xlsx_url = get_suspensions_publication_url()
    file_path = _download_file(xlsx_url, force_refresh=force_refresh)
    return parse_suspensions_file(file_path)




[docs]
def validate_suspensions_data(df: pd.DataFrame) -> bool:
    """Validate the structure and basic integrity of a suspensions DataFrame.

    Checks performed:
    - Required columns present
    - At least one row of data
    - ``pupils_suspended`` values are non-negative
    - ``pct_pupils_suspended`` values are in [0, 1]

    Args:
        df: DataFrame as returned by :func:`parse_suspensions_file` or
            :func:`get_latest_suspensions`.

    Returns:
        ``True`` if the data passes all checks.

    Raises:
        ValueError: If any check fails, with a descriptive message.
    """
    required = {"academic_year", "pupils_suspended", "pct_pupils_suspended"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    if len(df) == 0:
        raise ValueError("DataFrame is empty")

    if (df["pupils_suspended"].dropna() < 0).any():
        raise ValueError("pupils_suspended contains negative values")

    pct = df["pct_pupils_suspended"].dropna()
    if (pct < 0).any() or (pct > 1).any():
        raise ValueError("pct_pupils_suspended values outside [0, 1]")

    return True