"""NISRA Monthly Stillbirth Registrations Data Source.
Provides access to monthly stillbirth registration statistics for Northern Ireland.
A stillbirth is defined as a baby born after 24 weeks of pregnancy that did not
show any signs of life.
Data covers registrations by month from 2006 to present, updated monthly.
Data Source:
**Mother Page**: https://www.nisra.gov.uk/publications/monthly-stillbirths
The monthly Excel file contains a single "Stillbirths" sheet with counts
by month of registration (rows) and year (columns), covering 2006 to present.
Update Frequency: Monthly (published ~6 weeks after reference month)
Geographic Coverage: Northern Ireland (resident stillbirths)
Example:
>>> from bolster.data_sources.nisra import stillbirths
>>> df = stillbirths.get_latest_stillbirths()
>>> sorted(df.columns.tolist())
['date', 'month', 'stillbirths', 'year']
>>> # Total stillbirths in 2024
>>> total_2024 = df[df['year'] == 2024]['stillbirths'].sum()
>>> bool(total_2024 >= 0)
True
"""
import logging
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from bolster.utils.web import session
from ._base import (
NISRADataNotFoundError,
NISRAValidationError,
download_file,
safe_int,
)
[docs]
logger = logging.getLogger(__name__)
[docs]
STILLBIRTHS_PUBLICATION_URL = "https://www.nisra.gov.uk/publications/monthly-stillbirths"
[docs]
MONTH_ORDER = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
[docs]
def get_latest_stillbirths_publication_url() -> str:
"""Scrape NISRA stillbirths publication page to find the latest Excel file.
Returns:
URL of the latest monthly stillbirths Excel file
Raises:
NISRADataNotFoundError: If publication or file not found
"""
try:
response = session.get(STILLBIRTHS_PUBLICATION_URL, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch stillbirths publication page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
excel_url = None
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
link_text = a_tag.get_text(strip=True).lower()
if "monthly stillbirths" in link_text and href.endswith(".xlsx"):
excel_url = f"https://www.nisra.gov.uk{href}" if href.startswith("/") else href
logger.info(f"Found stillbirths file: {excel_url}")
break
if not excel_url:
raise NISRADataNotFoundError("Could not find Monthly Stillbirths Excel file on publication page")
return excel_url
[docs]
def parse_stillbirths_file(file_path: str | Path) -> pd.DataFrame:
"""Parse NISRA monthly stillbirths Excel file into long-format DataFrame.
The file has a single "Stillbirths" sheet with months as rows and years
as columns (wide format). This function melts it into long format.
Args:
file_path: Path to the downloaded stillbirths Excel file
Returns:
DataFrame with columns:
- date: Timestamp (first day of registration month)
- year: int
- month: str (e.g. "January")
- stillbirths: int
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
raw = pd.read_excel(file_path, sheet_name="Stillbirths", header=None)
except Exception as e:
raise NISRAValidationError(f"Failed to read stillbirths file: {e}") from e
# Locate the header row: first row where column 0 is "January" or similar month
# or row where year integers appear across the row
header_row_idx = None
data_start_idx = None
for i, row in raw.iterrows():
cell = str(row.iloc[0]).strip()
# Header row has "Month of Registration" or similar in col 0, years in other cols
# Data rows start with month names
if cell in MONTH_ORDER:
data_start_idx = i
header_row_idx = i - 1
break
if header_row_idx is None or data_start_idx is None:
raise NISRAValidationError("Could not locate data rows in stillbirths sheet")
# Extract year headers from header row
header = raw.iloc[header_row_idx]
years = []
year_col_indices = []
for col_idx, val in enumerate(header):
if col_idx == 0:
continue
if val is None or (isinstance(val, float) and pd.isna(val)):
continue
# Year values may have notes appended like "2025\n[Note 1]"
year_str = str(val).strip().split("\n")[0].strip()
try:
year = int(float(year_str))
years.append(year)
year_col_indices.append(col_idx)
except (ValueError, TypeError):
continue
if not years:
raise NISRAValidationError("Could not extract year headers from stillbirths sheet")
# Extract monthly data rows (skip Total row and notes)
records = []
for i in range(data_start_idx, len(raw)):
row = raw.iloc[i]
month = str(row.iloc[0]).strip()
if month not in MONTH_ORDER:
break # hit Total row or notes
for year, col_idx in zip(years, year_col_indices, strict=False):
val = row.iloc[col_idx]
count = safe_int(val)
if count is None:
continue # not yet published (future months shown as "-")
records.append({"year": year, "month": month, "stillbirths": count})
if not records:
raise NISRAValidationError("No data rows found in stillbirths sheet")
df = pd.DataFrame(records)
df["date"] = pd.to_datetime(df["year"].astype(str) + " " + df["month"], format="%Y %B")
df = df[["date", "year", "month", "stillbirths"]]
df = df.sort_values("date").reset_index(drop=True)
logger.info(f"Parsed stillbirths: {len(df)} rows, {df['year'].min()}-{df['year'].max()}")
return df
[docs]
def get_latest_stillbirths(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest monthly stillbirth registrations for Northern Ireland.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: Timestamp (first of registration month)
- year: int
- month: str
- stillbirths: int
Raises:
NISRADataNotFoundError: If latest publication cannot be found
NISRAValidationError: If file structure is unexpected
Example:
>>> df = get_latest_stillbirths()
>>> sorted(df.columns.tolist())
['date', 'month', 'stillbirths', 'year']
>>> annual = df.groupby('year')['stillbirths'].sum()
>>> len(annual) > 0
True
"""
excel_url = get_latest_stillbirths_publication_url()
logger.info(f"Downloading stillbirths data from: {excel_url}")
file_path = download_file(excel_url, cache_ttl_hours=24 * 30, force_refresh=force_refresh)
return parse_stillbirths_file(file_path)
[docs]
def validate_stillbirths_data(df: pd.DataFrame) -> bool:
"""Validate stillbirths DataFrame for basic integrity.
Args:
df: DataFrame from get_latest_stillbirths()
Returns:
True if validation passes
Raises:
NISRAValidationError: If validation fails
"""
required_cols = {"date", "year", "month", "stillbirths"}
missing = required_cols - set(df.columns)
if missing:
raise NISRAValidationError(f"Missing required columns: {missing}")
if df.empty:
raise NISRAValidationError("DataFrame is empty")
if (df["stillbirths"] < 0).any():
raise NISRAValidationError("Negative stillbirth counts found")
# Annual totals should be within plausible range for NI (~40-130 per year)
annual = df.groupby("year")["stillbirths"].sum()
if (annual > 200).any():
raise NISRAValidationError(f"Annual stillbirths implausibly high: {annual[annual > 200].to_dict()}")
return True
[docs]
def get_stillbirths_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Filter stillbirths data to a specific year.
Args:
df: DataFrame from get_latest_stillbirths()
year: Year to filter
Returns:
Filtered DataFrame
Example:
>>> df = get_latest_stillbirths()
>>> df_2024 = get_stillbirths_by_year(df, 2024)
>>> 'stillbirths' in df_2024.columns
True
"""
return df[df["year"] == year].reset_index(drop=True)
[docs]
def get_stillbirth_rate(
stillbirths_df: pd.DataFrame,
births_df: pd.DataFrame,
) -> pd.DataFrame:
"""Calculate monthly stillbirth rate per 1,000 total births (live + still).
Args:
stillbirths_df: DataFrame from get_latest_stillbirths()
births_df: DataFrame from births.get_latest_births(event_type='registration')
Returns:
DataFrame with columns: date, year, month, stillbirths, live_births,
total_births, stillbirth_rate
Example:
>>> from bolster.data_sources.nisra import stillbirths, births
>>> sb = stillbirths.get_latest_stillbirths()
>>> lb = births.get_latest_births(event_type='registration')
>>> rate = stillbirths.get_stillbirth_rate(sb, lb)
"""
# births_df has 'tests_conducted' or 'births_persons' depending on event_type
births_col = next(
(c for c in births_df.columns if "persons" in c.lower() or "births" in c.lower()),
None,
)
if births_col is None:
raise NISRAValidationError("Could not identify births count column in births DataFrame")
# births_df has 'month' and a 'sex' column; filter to Persons and align on date
if "sex" in births_df.columns:
births_df = births_df[births_df["sex"] == "Persons"].copy()
date_col = "date" if "date" in births_df.columns else "month"
live = births_df[[date_col, births_col]].rename(columns={date_col: "date", births_col: "live_births"})
merged = stillbirths_df.merge(live, on="date", how="inner")
merged["total_births"] = merged["live_births"] + merged["stillbirths"]
merged["stillbirth_rate"] = ((merged["stillbirths"] / merged["total_births"]) * 1000).round(2)
return merged
[docs]
def get_annual_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate annual totals and trends for stillbirths.
Args:
df: DataFrame from get_latest_stillbirths()
Returns:
DataFrame with columns: year, total_stillbirths, yoy_change, yoy_pct_change
Example:
>>> df = get_latest_stillbirths()
>>> summary = get_annual_summary(df)
>>> sorted(summary.columns.tolist())
['total_stillbirths', 'year', 'yoy_change', 'yoy_pct_change']
"""
annual = df.groupby("year")["stillbirths"].sum().reset_index()
annual = annual.rename(columns={"stillbirths": "total_stillbirths"})
annual["yoy_change"] = annual["total_stillbirths"].diff()
annual["yoy_pct_change"] = annual["total_stillbirths"].pct_change().mul(100).round(1)
return annual