Source code for bolster.data_sources.nisra.elective_waiting_times

"""NISRA Elective/Outpatient Waiting Times Module.

Provides access to Northern Ireland's elective and outpatient waiting times
statistics, covering inpatient/day case and outpatient referrals waiting by
weeks-waited band, specialty, and HSC Trust.

Data is published quarterly by the Department of Health NI and covers two
separate series:

- **Inpatient/Day Case Waiting Times** — patients waiting by management type
  (Day Case or Inpatient), weeks-waited band, specialty, and HSC Trust.
  Data from Q1 2007-08 (quarter ending June 2007) to present.

- **Outpatient Waiting Times** — referrals waiting by weeks-waited band,
  specialty, and HSC Trust. Data from Q1 2008-09 (quarter ending June 2008)
  to present.

Both series contain two sheets reflecting a system change:

- **Pre-encompass**: Legacy PAS data up to March 2025 (inpatient) / March 2025
  (outpatient). Inpatient pre-encompass includes additional derived aggregate
  columns (e.g. "> 26 weeks") that are excluded from the long-format output.
- **encompass**: Data from the new electronic patient record system, starting
  from South Eastern Trust in December 2023. Not directly comparable with
  pre-encompass data due to the system transition.

Waiting Bands (Inpatient/Day Case — weeks):
    0 - 6, >6 - 13, >13 - 21, >21 - 26, >26 - 52, >52 - 65, >65 - 78,
    >78 - 91, >91 - 104, >104

Waiting Bands (Outpatient — weeks):
    0 - 6, >6 - 9, >9 - 13 / >9 - 12*, >12 - 15, >15 - 18, >18 - 26,
    >26 - 39, >39 - 52, >52 - 65, >65 - 78, >78 - 91, >91 - 104, >104

    (*The >9-13 / >9-12 split is a historical artefact; both columns appear
    but only one is populated per row.)

HSC Trusts:
    Belfast, Northern, South Eastern, Southern, Western
    (DPC = Domiciliary/Primary Care, included in source data but typically
    excluded from trust-level analyses)

Data Sources:
    - Inpatient/Day Case: https://www.health-ni.gov.uk/articles/inpatient-waiting-times
    - Outpatient: https://www.health-ni.gov.uk/articles/outpatient-waiting-times

Update Frequency:
    Quarterly, published approximately 3-6 months after the quarter end.

Example:
    >>> from bolster.data_sources.nisra import elective_waiting_times as ewt
    >>> df = ewt.get_latest_elective_waiting_times()
    >>> sorted(df.columns.tolist())
    ['date', 'patients_waiting', 'programme_of_care', 'quarter_ending', 'specialty', 'trust', 'waiting_type', 'weeks_waited_band', 'year']

    >>> ewt.validate_elective_waiting_times(df)
    True

Publication Details:
    - Frequency: Quarterly (published ~3-6 months after quarter end)
    - Published by: Department of Health NI
    - Sources: https://www.health-ni.gov.uk/articles/inpatient-waiting-times
               https://www.health-ni.gov.uk/articles/outpatient-waiting-times
"""

import logging
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file, make_absolute_url


[docs]
logger = logging.getLogger(__name__)


# Landing page URLs

[docs]
DOH_BASE_URL = "https://www.health-ni.gov.uk"


[docs]
DOH_INPATIENT_PAGE = "https://www.health-ni.gov.uk/articles/inpatient-waiting-times"


[docs]
DOH_OUTPATIENT_PAGE = "https://www.health-ni.gov.uk/articles/outpatient-waiting-times"


# Sheet names (same in both inpatient and outpatient files)

[docs]
SHEET_PRE_ENCOMPASS = "Pre-encompass"


[docs]
SHEET_ENCOMPASS = "encompass"


# Expected HSC Trusts (excluding DPC variants)

[docs]
EXPECTED_TRUSTS = {"Belfast", "Northern", "South Eastern", "Southern", "Western"}


# ---- Inpatient/Day Case column definitions ----
# Core identifier columns present in both pre-encompass and encompass sheets

[docs]
IP_ID_COLS = ["Management", "Quarter Ending", "HSC Trust", "Specialty", "Programme Of Care"]


# Granular waiting-band columns present in BOTH inpatient sheets
# (pre-encompass has additional derived columns like "> 26 weeks" which we ignore)

[docs]
IP_BAND_COLS = [
    "0 - 6 weeks",
    "> 6 - 13 weeks",
    "> 13 - 21 weeks",
    "> 21 - 26 weeks",
    "> 26-52 weeks",
    "> 52 - 65 weeks",
    "> 65 - 78 weeks",
    "> 78 - 91 weeks",
    "> 91 - 104 weeks",
    "> 104 weeks",
]


# ---- Outpatient column definitions ----

[docs]
OP_ID_COLS = ["Quarter Ending", "HSC Trust", "Specialty", "Programme of Care"]


# Granular waiting-band columns (both pre-encompass and encompass have the same cols)
# The ">9 - 13 Wks" / ">9 - 12 Wks" split is a historical artefact — both present,
# only one populated per row. We include both and let the melt capture them.

[docs]
OP_BAND_COLS = [
    "0 - 6 Wks",
    ">6 - 9 Wks",
    ">9 - 13 Wks",
    ">9 - 12 Wks",
    ">12 - 15 Wks",
    ">15 - 18 Wks",
    ">18 - 26 Wks",
    ">26 - 39 Wks",
    ">39 - 52 Wks",
    ">52 - 65 Wks",
    ">65 - 78 Wks",
    ">78 - 91 Wks",
    ">91 - 104 Wks",
    ">104 Wks",
]


# Required columns in the final validated output

[docs]
REQUIRED_COLUMNS = {
    "date",
    "year",
    "quarter_ending",
    "trust",
    "specialty",
    "programme_of_care",
    "weeks_waited_band",
    "patients_waiting",
    "waiting_type",
}



def _get_latest_excel_url(landing_page: str, keyword: str) -> str:
    """Scrape a Department of Health landing page to find the latest Excel link.

    Follows the two-hop pattern used by other NISRA modules:
    1. Scrape the article landing page for publication links.
    2. Follow the most recent publication page to find the Excel download.

    Args:
        landing_page: URL of the article landing page.
        keyword: Keyword to match in the publication link href
            (e.g. "inpatient-and-day-case" or "outpatient").

    Returns:
        Absolute URL of the Excel file.

    Raises:
        NISRADataNotFoundError: If a publication or Excel link cannot be found.
    """
    logger.info(f"Fetching landing page: {landing_page}")
    try:
        resp = session.get(landing_page, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch landing page {landing_page}: {e}") from e

    soup = BeautifulSoup(resp.content, "html.parser")

    pub_url = None
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "publications" in href and keyword in href:
            pub_url = make_absolute_url(href, DOH_BASE_URL)
            break

    if pub_url is None:
        raise NISRADataNotFoundError(f"Could not find a publication link containing '{keyword}' on {landing_page}")

    logger.info(f"Fetching publication page: {pub_url}")
    try:
        pub_resp = session.get(pub_url, timeout=30)
        pub_resp.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page {pub_url}: {e}") from e

    pub_soup = BeautifulSoup(pub_resp.content, "html.parser")

    excel_url = None
    for a in pub_soup.find_all("a", href=True):
        href = a["href"]
        if ".xlsx" in href.lower():
            excel_url = make_absolute_url(href, DOH_BASE_URL)
            break

    if excel_url is None:
        raise NISRADataNotFoundError(f"Could not find an Excel (.xlsx) file on {pub_url}")

    logger.info(f"Found Excel URL: {excel_url}")
    return excel_url



[docs]
def get_elective_waiting_times_url() -> dict[str, str]:
    """Scrape the Department of Health pages to find the latest Excel file URLs.

    Returns:
        Dictionary with keys ``"inpatient"`` and ``"outpatient"``, each mapping
        to the absolute URL of the most recent quarterly Excel file.

    Raises:
        NISRADataNotFoundError: If either URL cannot be located.
    """
    inpatient_url = _get_latest_excel_url(DOH_INPATIENT_PAGE, "inpatient-and-day-case")
    outpatient_url = _get_latest_excel_url(DOH_OUTPATIENT_PAGE, "outpatient")
    return {"inpatient": inpatient_url, "outpatient": outpatient_url}



def _parse_inpatient_sheet(file_path: str | Path, sheet_name: str) -> pd.DataFrame:
    """Parse one sheet from an inpatient/day case Excel file into long format.

    Reads the given sheet, selects only the core identifier and granular band
    columns (ignoring derived aggregate columns), melts the band columns, and
    returns a tidy DataFrame.

    Args:
        file_path: Path to the inpatient Excel file.
        sheet_name: Name of the sheet (``"Pre-encompass"`` or ``"encompass"``).

    Returns:
        Long-format DataFrame with columns: management, quarter_ending, trust,
        specialty, programme_of_care, weeks_waited_band, patients_waiting.
    """
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    # Select only columns that exist (band cols may differ between sheets)
    available_band_cols = [c for c in IP_BAND_COLS if c in df.columns]
    select_cols = IP_ID_COLS + available_band_cols + ["Total"]
    df = df[[c for c in select_cols if c in df.columns]].copy()

    # Rename for consistency
    df = df.rename(
        columns={
            "Management": "management",
            "Quarter Ending": "quarter_ending",
            "HSC Trust": "trust",
            "Specialty": "specialty",
            "Programme Of Care": "programme_of_care",
            "Total": "total",
        }
    )

    # Melt band columns to long format
    id_cols = ["management", "quarter_ending", "trust", "specialty", "programme_of_care"]
    if "total" in df.columns:
        id_cols.append("total")

    return df.melt(
        id_vars=id_cols,
        value_vars=list(available_band_cols),
        var_name="weeks_waited_band",
        value_name="patients_waiting",
    )


def _parse_outpatient_sheet(file_path: str | Path, sheet_name: str) -> pd.DataFrame:
    """Parse one sheet from an outpatient Excel file into long format.

    Args:
        file_path: Path to the outpatient Excel file.
        sheet_name: Name of the sheet (``"Pre-encompass"`` or ``"encompass"``).

    Returns:
        Long-format DataFrame with columns: quarter_ending, trust, specialty,
        programme_of_care, weeks_waited_band, patients_waiting.
    """
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    available_band_cols = [c for c in OP_BAND_COLS if c in df.columns]
    select_cols = OP_ID_COLS + available_band_cols + ["Total Waiting"]
    df = df[[c for c in select_cols if c in df.columns]].copy()

    df = df.rename(
        columns={
            "Quarter Ending": "quarter_ending",
            "HSC Trust": "trust",
            "Specialty": "specialty",
            "Programme of Care": "programme_of_care",
            "Total Waiting": "total",
        }
    )

    id_cols = ["quarter_ending", "trust", "specialty", "programme_of_care"]
    if "total" in df.columns:
        id_cols.append("total")

    return df.melt(
        id_vars=id_cols,
        value_vars=list(available_band_cols),
        var_name="weeks_waited_band",
        value_name="patients_waiting",
    )


def _parse_file(file_path: str | Path, waiting_type: str) -> pd.DataFrame:
    """Parse both sheets of an elective waiting times Excel file.

    Reads the ``Pre-encompass`` and ``encompass`` sheets, concatenates them,
    adds a ``waiting_type`` label, converts the quarter_ending column to a
    proper date, and derives ``date`` and ``year`` columns.

    Args:
        file_path: Path to the downloaded Excel file.
        waiting_type: Label to populate the ``waiting_type`` column.
            Use ``"inpatient_day_case"`` for inpatient files and
            ``"outpatient"`` for outpatient files.

    Returns:
        Long-format DataFrame.
    """
    with pd.ExcelFile(file_path) as xl:
        sheet_names = xl.sheet_names

    parts = []
    for sheet in [SHEET_PRE_ENCOMPASS, SHEET_ENCOMPASS]:
        if sheet not in sheet_names:
            logger.warning(f"Sheet '{sheet}' not found in {file_path} — skipping")
            continue

        if waiting_type == "inpatient_day_case":
            chunk = _parse_inpatient_sheet(file_path, sheet)
        else:
            chunk = _parse_outpatient_sheet(file_path, sheet)

        chunk["data_source"] = sheet  # Pre-encompass vs encompass label
        parts.append(chunk)

    if not parts:
        raise NISRAValidationError(f"No data sheets found in {file_path}")

    df = pd.concat(parts, ignore_index=True)
    df["waiting_type"] = waiting_type

    # Normalise quarter_ending to datetime (mixed formats: datetime objects from
    # openpyxl for inpatient pre-encompass, string dates like "30-Jun-08" for
    # outpatient pre-encompass, and ISO-ish strings from the encompass sheet)
    df["quarter_ending"] = pd.to_datetime(df["quarter_ending"], errors="coerce", format="mixed")
    df = df.dropna(subset=["quarter_ending"])

    # Add date (= quarter_ending) and year
    df["date"] = df["quarter_ending"]
    df["year"] = df["date"].dt.year.astype(int)

    # Coerce patients_waiting to numeric
    df["patients_waiting"] = pd.to_numeric(df["patients_waiting"], errors="coerce")

    return df



[docs]
def parse_elective_waiting_times_file(file_path: str | Path) -> pd.DataFrame:
    """Parse a combined elective waiting times Excel file (inpatient or outpatient).

    Reads both ``Pre-encompass`` and ``encompass`` sheets from the file, melts
    the weekly waiting-band columns into long format, and returns a unified
    DataFrame.

    The parser auto-detects the file type (inpatient vs outpatient) by checking
    whether a ``Management`` column (present only in inpatient files) exists in
    the first data sheet.

    Args:
        file_path: Path to an Excel file downloaded from the Department of
            Health inpatient or outpatient waiting times publication pages.

    Returns:
        Long-format DataFrame with columns:

        - ``date`` (datetime): Quarter-end date (e.g. 2025-12-31)
        - ``year`` (int): Calendar year of the quarter end
        - ``quarter_ending`` (datetime): Same as ``date``
        - ``trust`` (str): HSC Trust name
        - ``specialty`` (str): Medical specialty
        - ``programme_of_care`` (str): Programme of care grouping
        - ``weeks_waited_band`` (str): Waiting band label (e.g. "0 - 6 weeks")
        - ``patients_waiting`` (float): Number of patients/referrals in that band
        - ``waiting_type`` (str): ``"inpatient_day_case"`` or ``"outpatient"``

        For inpatient files, ``management`` (``"Day Case"`` or ``"Inpatient"``)
        is also present.

    Raises:
        NISRAValidationError: If neither expected sheet is found in the file.
    """
    with pd.ExcelFile(file_path) as xl:
        sheet_names = xl.sheet_names

    target_sheet = SHEET_PRE_ENCOMPASS if SHEET_PRE_ENCOMPASS in sheet_names else SHEET_ENCOMPASS
    probe = pd.read_excel(file_path, sheet_name=target_sheet, nrows=0)

    waiting_type = "inpatient_day_case" if "Management" in probe.columns else "outpatient"
    return _parse_file(file_path, waiting_type)




[docs]
def get_latest_elective_waiting_times(force_refresh: bool = False) -> pd.DataFrame:
    """Download and return the latest elective waiting times data (both series).

    Fetches the most recent quarterly publication for both the inpatient/day
    case and outpatient series, parses each into long format, and returns a
    combined DataFrame.

    Args:
        force_refresh: If ``True``, bypass the on-disk cache and re-download
            the Excel files.

    Returns:
        Combined long-format DataFrame covering both inpatient/day case and
        outpatient waiting times.  See :func:`parse_elective_waiting_times_file`
        for the column schema.

    Raises:
        NISRADataNotFoundError: If either publication page or Excel file cannot
            be located.
        NISRAValidationError: If the downloaded data fails schema validation.
    """
    urls = get_elective_waiting_times_url()

    parts = []
    for key, url in urls.items():
        logger.info(f"Downloading {key} waiting times from {url}")
        file_path = download_file(url, force_refresh=force_refresh)
        # Determine waiting_type from key so we don't need to re-probe
        waiting_type = "inpatient_day_case" if key == "inpatient" else "outpatient"
        parts.append(_parse_file(file_path, waiting_type))

    df = pd.concat(parts, ignore_index=True)
    validate_elective_waiting_times(df)
    return df




[docs]
def validate_elective_waiting_times(df: pd.DataFrame) -> bool:
    """Validate that an elective waiting times DataFrame meets quality requirements.

    Checks that the DataFrame is non-empty, contains all required columns, and
    has no negative patient counts.

    Args:
        df: DataFrame as returned by :func:`get_latest_elective_waiting_times`
            or :func:`parse_elective_waiting_times_file`.

    Returns:
        ``True`` if all checks pass.

    Raises:
        NISRAValidationError: If the DataFrame is empty, missing required
            columns, or contains negative ``patients_waiting`` values.
    """
    if df is None or len(df) == 0:
        raise NISRAValidationError("Elective waiting times DataFrame is empty")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise NISRAValidationError(f"Missing required columns: {missing}")

    valid = df["patients_waiting"].dropna()
    if len(valid) > 0 and (valid < 0).any():
        n_neg = (valid < 0).sum()
        raise NISRAValidationError(f"patients_waiting contains {n_neg} negative value(s); min = {valid.min()}")

    return True