"""NISRA Elective/Outpatient Waiting Times Module.
Provides access to Northern Ireland's elective and outpatient waiting times
statistics, covering inpatient/day case and outpatient referrals waiting by
weeks-waited band, specialty, and HSC Trust.
Data is published quarterly by the Department of Health NI and covers two
separate series:
- **Inpatient/Day Case Waiting Times** — patients waiting by management type
(Day Case or Inpatient), weeks-waited band, specialty, and HSC Trust.
Data from Q1 2007-08 (quarter ending June 2007) to present.
- **Outpatient Waiting Times** — referrals waiting by weeks-waited band,
specialty, and HSC Trust. Data from Q1 2008-09 (quarter ending June 2008)
to present.
Both series contain two sheets reflecting a system change:
- **Pre-encompass**: Legacy PAS data up to March 2025 (inpatient) / March 2025
(outpatient). Inpatient pre-encompass includes additional derived aggregate
columns (e.g. "> 26 weeks") that are excluded from the long-format output.
- **encompass**: Data from the new electronic patient record system, starting
from South Eastern Trust in December 2023. Not directly comparable with
pre-encompass data due to the system transition.
Waiting Bands (Inpatient/Day Case — weeks):
0 - 6, >6 - 13, >13 - 21, >21 - 26, >26 - 52, >52 - 65, >65 - 78,
>78 - 91, >91 - 104, >104
Waiting Bands (Outpatient — weeks):
0 - 6, >6 - 9, >9 - 13 / >9 - 12*, >12 - 15, >15 - 18, >18 - 26,
>26 - 39, >39 - 52, >52 - 65, >65 - 78, >78 - 91, >91 - 104, >104
(*The >9-13 / >9-12 split is a historical artefact; both columns appear
but only one is populated per row.)
HSC Trusts:
Belfast, Northern, South Eastern, Southern, Western
(DPC = Domiciliary/Primary Care, included in source data but typically
excluded from trust-level analyses)
Data Sources:
- Inpatient/Day Case: https://www.health-ni.gov.uk/articles/inpatient-waiting-times
- Outpatient: https://www.health-ni.gov.uk/articles/outpatient-waiting-times
Update Frequency:
Quarterly, published approximately 3-6 months after the quarter end.
Example:
>>> from bolster.data_sources.nisra import elective_waiting_times as ewt
>>> df = ewt.get_latest_elective_waiting_times()
>>> sorted(df.columns.tolist())
['date', 'patients_waiting', 'programme_of_care', 'quarter_ending', 'specialty', 'trust', 'waiting_type', 'weeks_waited_band', 'year']
>>> ewt.validate_elective_waiting_times(df)
True
Publication Details:
- Frequency: Quarterly (published ~3-6 months after quarter end)
- Published by: Department of Health NI
- Sources: https://www.health-ni.gov.uk/articles/inpatient-waiting-times
https://www.health-ni.gov.uk/articles/outpatient-waiting-times
"""
import logging
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from bolster.utils.web import session
from ._base import NISRADataNotFoundError, NISRAValidationError, download_file, make_absolute_url
[docs]
logger = logging.getLogger(__name__)
# Landing page URLs
[docs]
DOH_BASE_URL = "https://www.health-ni.gov.uk"
[docs]
DOH_INPATIENT_PAGE = "https://www.health-ni.gov.uk/articles/inpatient-waiting-times"
[docs]
DOH_OUTPATIENT_PAGE = "https://www.health-ni.gov.uk/articles/outpatient-waiting-times"
# Sheet names (same in both inpatient and outpatient files)
[docs]
SHEET_PRE_ENCOMPASS = "Pre-encompass"
[docs]
SHEET_ENCOMPASS = "encompass"
# Expected HSC Trusts (excluding DPC variants)
[docs]
EXPECTED_TRUSTS = {"Belfast", "Northern", "South Eastern", "Southern", "Western"}
# ---- Inpatient/Day Case column definitions ----
# Core identifier columns present in both pre-encompass and encompass sheets
[docs]
IP_ID_COLS = ["Management", "Quarter Ending", "HSC Trust", "Specialty", "Programme Of Care"]
# Granular waiting-band columns present in BOTH inpatient sheets
# (pre-encompass has additional derived columns like "> 26 weeks" which we ignore)
[docs]
IP_BAND_COLS = [
"0 - 6 weeks",
"> 6 - 13 weeks",
"> 13 - 21 weeks",
"> 21 - 26 weeks",
"> 26-52 weeks",
"> 52 - 65 weeks",
"> 65 - 78 weeks",
"> 78 - 91 weeks",
"> 91 - 104 weeks",
"> 104 weeks",
]
# ---- Outpatient column definitions ----
[docs]
OP_ID_COLS = ["Quarter Ending", "HSC Trust", "Specialty", "Programme of Care"]
# Granular waiting-band columns (both pre-encompass and encompass have the same cols)
# The ">9 - 13 Wks" / ">9 - 12 Wks" split is a historical artefact — both present,
# only one populated per row. We include both and let the melt capture them.
[docs]
OP_BAND_COLS = [
"0 - 6 Wks",
">6 - 9 Wks",
">9 - 13 Wks",
">9 - 12 Wks",
">12 - 15 Wks",
">15 - 18 Wks",
">18 - 26 Wks",
">26 - 39 Wks",
">39 - 52 Wks",
">52 - 65 Wks",
">65 - 78 Wks",
">78 - 91 Wks",
">91 - 104 Wks",
">104 Wks",
]
# Required columns in the final validated output
[docs]
REQUIRED_COLUMNS = {
"date",
"year",
"quarter_ending",
"trust",
"specialty",
"programme_of_care",
"weeks_waited_band",
"patients_waiting",
"waiting_type",
}
def _get_latest_excel_url(landing_page: str, keyword: str) -> str:
"""Scrape a Department of Health landing page to find the latest Excel link.
Follows the two-hop pattern used by other NISRA modules:
1. Scrape the article landing page for publication links.
2. Follow the most recent publication page to find the Excel download.
Args:
landing_page: URL of the article landing page.
keyword: Keyword to match in the publication link href
(e.g. "inpatient-and-day-case" or "outpatient").
Returns:
Absolute URL of the Excel file.
Raises:
NISRADataNotFoundError: If a publication or Excel link cannot be found.
"""
logger.info(f"Fetching landing page: {landing_page}")
try:
resp = session.get(landing_page, timeout=30)
resp.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch landing page {landing_page}: {e}") from e
soup = BeautifulSoup(resp.content, "html.parser")
pub_url = None
for a in soup.find_all("a", href=True):
href = a["href"]
if "publications" in href and keyword in href:
pub_url = make_absolute_url(href, DOH_BASE_URL)
break
if pub_url is None:
raise NISRADataNotFoundError(f"Could not find a publication link containing '{keyword}' on {landing_page}")
logger.info(f"Fetching publication page: {pub_url}")
try:
pub_resp = session.get(pub_url, timeout=30)
pub_resp.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch publication page {pub_url}: {e}") from e
pub_soup = BeautifulSoup(pub_resp.content, "html.parser")
excel_url = None
for a in pub_soup.find_all("a", href=True):
href = a["href"]
if ".xlsx" in href.lower():
excel_url = make_absolute_url(href, DOH_BASE_URL)
break
if excel_url is None:
raise NISRADataNotFoundError(f"Could not find an Excel (.xlsx) file on {pub_url}")
logger.info(f"Found Excel URL: {excel_url}")
return excel_url
[docs]
def get_elective_waiting_times_url() -> dict[str, str]:
"""Scrape the Department of Health pages to find the latest Excel file URLs.
Returns:
Dictionary with keys ``"inpatient"`` and ``"outpatient"``, each mapping
to the absolute URL of the most recent quarterly Excel file.
Raises:
NISRADataNotFoundError: If either URL cannot be located.
"""
inpatient_url = _get_latest_excel_url(DOH_INPATIENT_PAGE, "inpatient-and-day-case")
outpatient_url = _get_latest_excel_url(DOH_OUTPATIENT_PAGE, "outpatient")
return {"inpatient": inpatient_url, "outpatient": outpatient_url}
def _parse_inpatient_sheet(file_path: str | Path, sheet_name: str) -> pd.DataFrame:
"""Parse one sheet from an inpatient/day case Excel file into long format.
Reads the given sheet, selects only the core identifier and granular band
columns (ignoring derived aggregate columns), melts the band columns, and
returns a tidy DataFrame.
Args:
file_path: Path to the inpatient Excel file.
sheet_name: Name of the sheet (``"Pre-encompass"`` or ``"encompass"``).
Returns:
Long-format DataFrame with columns: management, quarter_ending, trust,
specialty, programme_of_care, weeks_waited_band, patients_waiting.
"""
df = pd.read_excel(file_path, sheet_name=sheet_name)
# Select only columns that exist (band cols may differ between sheets)
available_band_cols = [c for c in IP_BAND_COLS if c in df.columns]
select_cols = IP_ID_COLS + available_band_cols + ["Total"]
df = df[[c for c in select_cols if c in df.columns]].copy()
# Rename for consistency
df = df.rename(
columns={
"Management": "management",
"Quarter Ending": "quarter_ending",
"HSC Trust": "trust",
"Specialty": "specialty",
"Programme Of Care": "programme_of_care",
"Total": "total",
}
)
# Melt band columns to long format
id_cols = ["management", "quarter_ending", "trust", "specialty", "programme_of_care"]
if "total" in df.columns:
id_cols.append("total")
return df.melt(
id_vars=id_cols,
value_vars=list(available_band_cols),
var_name="weeks_waited_band",
value_name="patients_waiting",
)
def _parse_outpatient_sheet(file_path: str | Path, sheet_name: str) -> pd.DataFrame:
"""Parse one sheet from an outpatient Excel file into long format.
Args:
file_path: Path to the outpatient Excel file.
sheet_name: Name of the sheet (``"Pre-encompass"`` or ``"encompass"``).
Returns:
Long-format DataFrame with columns: quarter_ending, trust, specialty,
programme_of_care, weeks_waited_band, patients_waiting.
"""
df = pd.read_excel(file_path, sheet_name=sheet_name)
available_band_cols = [c for c in OP_BAND_COLS if c in df.columns]
select_cols = OP_ID_COLS + available_band_cols + ["Total Waiting"]
df = df[[c for c in select_cols if c in df.columns]].copy()
df = df.rename(
columns={
"Quarter Ending": "quarter_ending",
"HSC Trust": "trust",
"Specialty": "specialty",
"Programme of Care": "programme_of_care",
"Total Waiting": "total",
}
)
id_cols = ["quarter_ending", "trust", "specialty", "programme_of_care"]
if "total" in df.columns:
id_cols.append("total")
return df.melt(
id_vars=id_cols,
value_vars=list(available_band_cols),
var_name="weeks_waited_band",
value_name="patients_waiting",
)
def _parse_file(file_path: str | Path, waiting_type: str) -> pd.DataFrame:
"""Parse both sheets of an elective waiting times Excel file.
Reads the ``Pre-encompass`` and ``encompass`` sheets, concatenates them,
adds a ``waiting_type`` label, converts the quarter_ending column to a
proper date, and derives ``date`` and ``year`` columns.
Args:
file_path: Path to the downloaded Excel file.
waiting_type: Label to populate the ``waiting_type`` column.
Use ``"inpatient_day_case"`` for inpatient files and
``"outpatient"`` for outpatient files.
Returns:
Long-format DataFrame.
"""
with pd.ExcelFile(file_path) as xl:
sheet_names = xl.sheet_names
parts = []
for sheet in [SHEET_PRE_ENCOMPASS, SHEET_ENCOMPASS]:
if sheet not in sheet_names:
logger.warning(f"Sheet '{sheet}' not found in {file_path} — skipping")
continue
if waiting_type == "inpatient_day_case":
chunk = _parse_inpatient_sheet(file_path, sheet)
else:
chunk = _parse_outpatient_sheet(file_path, sheet)
chunk["data_source"] = sheet # Pre-encompass vs encompass label
parts.append(chunk)
if not parts:
raise NISRAValidationError(f"No data sheets found in {file_path}")
df = pd.concat(parts, ignore_index=True)
df["waiting_type"] = waiting_type
# Normalise quarter_ending to datetime (mixed formats: datetime objects from
# openpyxl for inpatient pre-encompass, string dates like "30-Jun-08" for
# outpatient pre-encompass, and ISO-ish strings from the encompass sheet)
df["quarter_ending"] = pd.to_datetime(df["quarter_ending"], errors="coerce", format="mixed")
df = df.dropna(subset=["quarter_ending"])
# Add date (= quarter_ending) and year
df["date"] = df["quarter_ending"]
df["year"] = df["date"].dt.year.astype(int)
# Coerce patients_waiting to numeric
df["patients_waiting"] = pd.to_numeric(df["patients_waiting"], errors="coerce")
return df
[docs]
def parse_elective_waiting_times_file(file_path: str | Path) -> pd.DataFrame:
"""Parse a combined elective waiting times Excel file (inpatient or outpatient).
Reads both ``Pre-encompass`` and ``encompass`` sheets from the file, melts
the weekly waiting-band columns into long format, and returns a unified
DataFrame.
The parser auto-detects the file type (inpatient vs outpatient) by checking
whether a ``Management`` column (present only in inpatient files) exists in
the first data sheet.
Args:
file_path: Path to an Excel file downloaded from the Department of
Health inpatient or outpatient waiting times publication pages.
Returns:
Long-format DataFrame with columns:
- ``date`` (datetime): Quarter-end date (e.g. 2025-12-31)
- ``year`` (int): Calendar year of the quarter end
- ``quarter_ending`` (datetime): Same as ``date``
- ``trust`` (str): HSC Trust name
- ``specialty`` (str): Medical specialty
- ``programme_of_care`` (str): Programme of care grouping
- ``weeks_waited_band`` (str): Waiting band label (e.g. "0 - 6 weeks")
- ``patients_waiting`` (float): Number of patients/referrals in that band
- ``waiting_type`` (str): ``"inpatient_day_case"`` or ``"outpatient"``
For inpatient files, ``management`` (``"Day Case"`` or ``"Inpatient"``)
is also present.
Raises:
NISRAValidationError: If neither expected sheet is found in the file.
"""
with pd.ExcelFile(file_path) as xl:
sheet_names = xl.sheet_names
target_sheet = SHEET_PRE_ENCOMPASS if SHEET_PRE_ENCOMPASS in sheet_names else SHEET_ENCOMPASS
probe = pd.read_excel(file_path, sheet_name=target_sheet, nrows=0)
waiting_type = "inpatient_day_case" if "Management" in probe.columns else "outpatient"
return _parse_file(file_path, waiting_type)
[docs]
def get_latest_elective_waiting_times(force_refresh: bool = False) -> pd.DataFrame:
"""Download and return the latest elective waiting times data (both series).
Fetches the most recent quarterly publication for both the inpatient/day
case and outpatient series, parses each into long format, and returns a
combined DataFrame.
Args:
force_refresh: If ``True``, bypass the on-disk cache and re-download
the Excel files.
Returns:
Combined long-format DataFrame covering both inpatient/day case and
outpatient waiting times. See :func:`parse_elective_waiting_times_file`
for the column schema.
Raises:
NISRADataNotFoundError: If either publication page or Excel file cannot
be located.
NISRAValidationError: If the downloaded data fails schema validation.
"""
urls = get_elective_waiting_times_url()
parts = []
for key, url in urls.items():
logger.info(f"Downloading {key} waiting times from {url}")
file_path = download_file(url, force_refresh=force_refresh)
# Determine waiting_type from key so we don't need to re-probe
waiting_type = "inpatient_day_case" if key == "inpatient" else "outpatient"
parts.append(_parse_file(file_path, waiting_type))
df = pd.concat(parts, ignore_index=True)
validate_elective_waiting_times(df)
return df
[docs]
def validate_elective_waiting_times(df: pd.DataFrame) -> bool:
"""Validate that an elective waiting times DataFrame meets quality requirements.
Checks that the DataFrame is non-empty, contains all required columns, and
has no negative patient counts.
Args:
df: DataFrame as returned by :func:`get_latest_elective_waiting_times`
or :func:`parse_elective_waiting_times_file`.
Returns:
``True`` if all checks pass.
Raises:
NISRAValidationError: If the DataFrame is empty, missing required
columns, or contains negative ``patients_waiting`` values.
"""
if df is None or len(df) == 0:
raise NISRAValidationError("Elective waiting times DataFrame is empty")
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
raise NISRAValidationError(f"Missing required columns: {missing}")
valid = df["patients_waiting"].dropna()
if len(valid) > 0 and (valid < 0).any():
n_neg = (valid < 0).sum()
raise NISRAValidationError(f"patients_waiting contains {n_neg} negative value(s); min = {valid.min()}")
return True