"""NISRA Cancer Waiting Times Module.
This module provides access to Northern Ireland's cancer waiting times statistics,
measuring performance against key cancer treatment targets.
Cancer Waiting Time Targets:
- 14-day: Urgent breast cancer referrals seen within 14 days
- 31-day: Treatment started within 31 days of decision to treat
- 62-day: Treatment started within 62 days of urgent GP referral
Data Coverage:
- 31-day and 62-day by HSC Trust: April 2008 - Present (monthly)
- 31-day and 62-day by Tumour Site: December 2008 - Present (monthly)
- 14-day Breast (Historic by Trust): April 2008 - April 2025
- 14-day Breast (Regional): May 2025 - Present (new regional service)
- Breast Cancer Referrals: April 2016 - Present (monthly)
HSC Trusts:
- Belfast, Northern, South Eastern, Southern, Western
Tumour Sites:
- Brain/Central Nervous System, Breast Cancer, Gynaecological Cancers,
- Haematological Cancers, Head/Neck Cancer, Lower Gastrointestinal Cancer,
- Lung Cancer, Other, Skin Cancers, Upper Gastrointestinal Cancer,
- Urological Cancer
Data Source: Department of Health Northern Ireland provides cancer waiting times statistics
through their health publications at https://www.health-ni.gov.uk/articles/cancer-waiting-times.
The data tracks performance against cancer treatment targets across Health and Social Care Trusts
and by tumour site, providing comprehensive monitoring of cancer care pathways in Northern Ireland.
Update Frequency: Quarterly publications are released approximately 3 months after the end
of each quarter. Cancer waiting times statistics are published by the Department of Health
as part of their healthcare performance monitoring, with data updated four times per year
to track progress against key cancer treatment targets.
Example:
>>> from bolster.data_sources.nisra import cancer_waiting_times as cwt
>>> # Get latest 31-day waiting times by HSC Trust
>>> df = cwt.get_latest_31_day_by_trust()
>>> sorted(df.columns.tolist())
['date', 'month', 'over_target', 'performance_rate', 'total', 'trust', 'within_target', 'year']
>>> # Get 62-day waiting times by tumour site
>>> df_tumour = cwt.get_latest_62_day_by_tumour()
>>> 'tumour_site' in df_tumour.columns
True
Publication Details:
- Frequency: Quarterly (published ~3 months after quarter end)
- Published by: Department of Health / NISRA
- Source: https://www.health-ni.gov.uk/articles/cancer-waiting-times
"""
import logging
import re
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from bolster.utils.web import session
from ._base import NISRADataNotFoundError, NISRAValidationError, add_date_columns, download_file, make_absolute_url
[docs]
logger = logging.getLogger(__name__)
# Base URLs
[docs]
DOH_CANCER_PAGE = "https://www.health-ni.gov.uk/articles/cancer-waiting-times"
[docs]
DOH_BASE_URL = "https://www.health-ni.gov.uk"
# Sheet configurations
[docs]
SHEET_31_DAY_TRUST = "31 Day Wait by HSC Trust"
[docs]
SHEET_31_DAY_TUMOUR = "31 Day Wait by Tumour Site"
[docs]
SHEET_62_DAY_TRUST = "62 Day Wait by HSC Trust"
[docs]
SHEET_62_DAY_TUMOUR = "62 Day Wait by Tumour Site"
[docs]
SHEET_14_DAY_REGIONAL = "14 d Wait - Breast Regional"
[docs]
SHEET_14_DAY_HISTORIC = "14 d Wait - Breast Historic"
# Pre-Q3-2025-26 files used a single combined sheet instead of the historic/regional split
[docs]
SHEET_14_DAY_LEGACY = "14 Day Wait - Breast Cancer"
[docs]
SHEET_BREAST_REFERRALS = "Breast Cancer Referrals"
# Column schemas vary by publication era:
# - Pre-Q3 2025-26: 5 cols (no median/percentile); breast referrals 4 cols (total before urgent, no routine)
# - Q3 2025-26+: 7 cols (+ median_days, percentile_95_days); breast referrals 5 cols (routine, urgent, total)
_WAIT_COLS_5 = ["period_month", "group", "within_target", "over_target", "total"]
_WAIT_COLS_7 = ["period_month", "group", "within_target", "over_target", "total", "median_days", "percentile_95_days"]
[docs]
def get_latest_publication_url() -> tuple[str, str]:
"""Find the latest cancer waiting times publication URL.
Scrapes the Department of Health cancer waiting times page to find the
most recent quarterly publication.
Returns:
Tuple of (excel_file_url, quarter_string)
Raises:
NISRADataNotFoundError: If publication cannot be found
"""
logger.info(f"Fetching latest publication from {DOH_CANCER_PAGE}")
try:
response = session.get(DOH_CANCER_PAGE, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch cancer waiting times page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
# Find links to publications - look for "cancer waiting times" in link text
publication_links = []
for link in soup.find_all("a", href=True):
text = link.get_text(strip=True).lower()
href = link["href"]
if "cancer waiting times" in text and "publications" in href:
publication_links.append((link.get_text(strip=True), href))
if not publication_links:
raise NISRADataNotFoundError("Could not find any cancer waiting times publications")
# Get the first (most recent) publication
pub_text, pub_url = publication_links[0]
logger.info(f"Found publication: {pub_text}")
# Make absolute URL
pub_url = make_absolute_url(pub_url, DOH_BASE_URL)
# Extract quarter from publication text
quarter_match = re.search(
r"(January|February|March|April|May|June|July|August|September|October|November|December)"
r"[^0-9]*(\d{4})",
pub_text,
re.IGNORECASE,
)
quarter_str = quarter_match.group(0) if quarter_match else "Unknown"
# Now fetch the publication page to get the Excel file
try:
pub_response = session.get(pub_url, timeout=30)
pub_response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e
pub_soup = BeautifulSoup(pub_response.content, "html.parser")
# Find Excel download link (main data file, not ICD codes)
excel_url = None
for link in pub_soup.find_all("a", href=True):
href = link["href"]
text = link.get_text(strip=True).lower()
if ".xlsx" in href.lower() and "icd" not in href.lower() and "icd" not in text:
excel_url = href
break
if not excel_url:
raise NISRADataNotFoundError("Could not find Excel file in publication page")
# Make absolute URL
excel_url = make_absolute_url(excel_url, DOH_BASE_URL)
logger.info(f"Found Excel file: {excel_url}")
return excel_url, quarter_str
def _parse_wait_sheet(file_path: str | Path, sheet_name: str, group_col: str) -> pd.DataFrame:
"""Parse a wait-time sheet, handling 5-column (pre-Q3 2025-26) and 7-column (Q3 2025-26+) formats.
Args:
file_path: Path to the Excel file
sheet_name: Name of the sheet to parse
group_col: Output column name for the grouping dimension (e.g. 'trust' or 'tumour_site')
Returns:
DataFrame with columns: period_month, group_col, within_target, over_target, total
(median_days and percentile_95_days are dropped — not yet used downstream)
"""
df = pd.read_excel(file_path, sheet_name=sheet_name)
n_cols = len(df.columns)
if n_cols == 7:
df.columns = _WAIT_COLS_7
elif n_cols == 5:
df.columns = _WAIT_COLS_5
else:
raise NISRAValidationError(f"Unexpected column count ({n_cols}) in sheet '{sheet_name}' of {file_path}")
return df.rename(columns={"group": group_col})
[docs]
def parse_31_day_by_trust(file_path: str | Path) -> pd.DataFrame:
"""Parse 31-day waiting times by HSC Trust.
Handles both the 5-column format (pre-Q3 2025-26) and the 7-column format
(Q3 2025-26+, which added Median and 95th Percentile columns).
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, trust, within_target,
over_target, total, performance_rate
"""
df = _parse_wait_sheet(file_path, SHEET_31_DAY_TRUST, "trust")
df = add_date_columns(df, "period_month")
df["performance_rate"] = df["within_target"] / df["total"]
return df[["date", "year", "month", "trust", "within_target", "over_target", "total", "performance_rate"]]
[docs]
def parse_31_day_by_tumour(file_path: str | Path) -> pd.DataFrame:
"""Parse 31-day waiting times by Tumour Site.
Handles both the 5-column format (pre-Q3 2025-26) and the 7-column format
(Q3 2025-26+, which added Median and 95th Percentile columns).
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, tumour_site, within_target,
over_target, total, performance_rate
"""
df = _parse_wait_sheet(file_path, SHEET_31_DAY_TUMOUR, "tumour_site")
df = add_date_columns(df, "period_month")
df["performance_rate"] = df["within_target"] / df["total"]
return df[["date", "year", "month", "tumour_site", "within_target", "over_target", "total", "performance_rate"]]
[docs]
def parse_62_day_by_trust(file_path: str | Path) -> pd.DataFrame:
"""Parse 62-day waiting times by HSC Trust.
Handles both the 5-column format (pre-Q3 2025-26) and the 7-column format
(Q3 2025-26+, which added Median and 95th Percentile columns).
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, trust, within_target,
over_target, total, performance_rate
Note:
62-day data may contain fractional patient counts due to shared care
arrangements between trusts.
"""
df = _parse_wait_sheet(file_path, SHEET_62_DAY_TRUST, "trust")
df = add_date_columns(df, "period_month")
df["performance_rate"] = df["within_target"] / df["total"]
return df[["date", "year", "month", "trust", "within_target", "over_target", "total", "performance_rate"]]
[docs]
def parse_62_day_by_tumour(file_path: str | Path) -> pd.DataFrame:
"""Parse 62-day waiting times by Tumour Site.
Handles both the 5-column format (pre-Q3 2025-26) and the 7-column format
(Q3 2025-26+, which added Median and 95th Percentile columns).
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, tumour_site, within_target,
over_target, total, performance_rate
"""
df = _parse_wait_sheet(file_path, SHEET_62_DAY_TUMOUR, "tumour_site")
df = add_date_columns(df, "period_month")
df["performance_rate"] = df["within_target"] / df["total"]
return df[["date", "year", "month", "tumour_site", "within_target", "over_target", "total", "performance_rate"]]
[docs]
def parse_14_day_breast(file_path: str | Path) -> pd.DataFrame:
"""Parse 14-day breast cancer waiting times.
Handles three publication formats:
- Pre-Q4 2024-25: single sheet 'SHEET_14_DAY_LEGACY' (5 cols)
- Q4 2024-25 to Q2 2025-26: split into historic (5 cols) + regional (5 cols)
- Q3 2025-26+: split into historic (5 cols) + regional (7 cols, + median/95th pct)
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, trust, within_target,
over_target, total, performance_rate
Note:
From May 2025, breast cancer services became regional. Historic data
(pre-May 2025) is by individual Trust. Regional data shows NI-wide figures.
"""
with pd.ExcelFile(file_path) as xl:
sheet_names = xl.sheet_names
if SHEET_14_DAY_HISTORIC in sheet_names:
# New split format (Q4 2024-25 onwards)
df_historic = _parse_wait_sheet(file_path, SHEET_14_DAY_HISTORIC, "trust")
df_historic = add_date_columns(df_historic, "period_month")
parts = [df_historic]
if SHEET_14_DAY_REGIONAL in sheet_names:
df_regional = _parse_wait_sheet(file_path, SHEET_14_DAY_REGIONAL, "trust")
df_regional = add_date_columns(df_regional, "period_month")
parts.append(df_regional)
df = pd.concat(parts, ignore_index=True)
else:
# Legacy single-sheet format (pre-Q4 2024-25)
df = _parse_wait_sheet(file_path, SHEET_14_DAY_LEGACY, "trust")
df = add_date_columns(df, "period_month")
df["performance_rate"] = df["within_target"] / df["total"]
return df[["date", "year", "month", "trust", "within_target", "over_target", "total", "performance_rate"]]
[docs]
def parse_breast_referrals(file_path: str | Path) -> pd.DataFrame:
"""Parse breast cancer referrals data.
Handles two formats:
- Pre-Q3 2025-26: 4 columns (referral_month, trust, total_referrals, urgent_referrals)
- Q3 2025-26+: 5 columns (referral_month, trust, routine_referrals, urgent_referrals, total_referrals)
Args:
file_path: Path to the Excel file
Returns:
DataFrame with columns: date, year, month, trust, total_referrals,
urgent_referrals, urgent_rate
"""
df = pd.read_excel(file_path, sheet_name=SHEET_BREAST_REFERRALS)
n_cols = len(df.columns)
if n_cols == 5:
# Q3 2025-26+: routine, urgent, total
df.columns = ["referral_month", "trust", "routine_referrals", "urgent_referrals", "total_referrals"]
elif n_cols == 4:
# Pre-Q3 2025-26: total before urgent, no routine column
df.columns = ["referral_month", "trust", "total_referrals", "urgent_referrals"]
else:
raise NISRAValidationError(f"Unexpected column count ({n_cols}) in breast referrals sheet of {file_path}")
df = add_date_columns(df, "referral_month")
df["urgent_rate"] = df["urgent_referrals"] / df["total_referrals"]
return df[["date", "year", "month", "trust", "total_referrals", "urgent_referrals", "urgent_rate"]]
# High-level functions with automatic download
[docs]
def get_latest_31_day_by_trust(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest 31-day waiting times by HSC Trust.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with 31-day performance by Trust
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_31_day_by_trust(file_path)
[docs]
def get_latest_31_day_by_tumour(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest 31-day waiting times by Tumour Site.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with 31-day performance by tumour site
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_31_day_by_tumour(file_path)
[docs]
def get_latest_62_day_by_trust(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest 62-day waiting times by HSC Trust.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with 62-day performance by Trust
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_62_day_by_trust(file_path)
[docs]
def get_latest_62_day_by_tumour(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest 62-day waiting times by Tumour Site.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with 62-day performance by tumour site
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_62_day_by_tumour(file_path)
[docs]
def get_latest_14_day_breast(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest 14-day breast cancer waiting times.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with 14-day breast cancer performance
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_14_day_breast(file_path)
[docs]
def get_latest_breast_referrals(force_refresh: bool = False) -> pd.DataFrame:
"""Get latest breast cancer referrals data.
Args:
force_refresh: Force re-download even if cached
Returns:
DataFrame with breast cancer referrals
"""
excel_url, _ = get_latest_publication_url()
file_path = download_file(excel_url, force_refresh=force_refresh)
return parse_breast_referrals(file_path)
# Helper/analysis functions
[docs]
def get_data_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Filter data for a specific year.
Args:
df: DataFrame with 'year' column
year: Year to filter for
Returns:
Filtered DataFrame
"""
return df[df["year"] == year].reset_index(drop=True)
[docs]
def get_tumour_site_ranking(df: pd.DataFrame, year: int = None) -> pd.DataFrame:
"""Rank tumour sites by performance.
Args:
df: DataFrame with tumour site data
year: Optional year to filter (default: all years)
Returns:
DataFrame ranked by performance (worst to best)
"""
if year:
df = df[df["year"] == year]
ranking = (
df.groupby("tumour_site")
.agg(
total_patients=("total", "sum"),
within_target=("within_target", "sum"),
)
.reset_index()
)
ranking["performance_rate"] = ranking["within_target"] / ranking["total_patients"]
ranking = ranking.sort_values("performance_rate", ascending=True)
ranking["rank"] = range(1, len(ranking) + 1)
return ranking