Source code for bolster.data_sources.nisra.registrar_general

"""NISRA Registrar General Quarterly Tables Data Source.

Provides access to quarterly vital statistics for Northern Ireland, including:
- Quarterly births and stillbirths (from Q1 2009)
- Quarterly deaths, marriages, and civil partnerships
- LGD-level breakdowns for the current quarter
- Birth and death rates per 1,000 population

These quarterly tables provide higher-level aggregated statistics compared to the
monthly data, with additional metrics like stillbirths, infant deaths, and rates.

Data Source:
    **Publications page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/registrar-general-quarterly-report

    The quarterly tables are published approximately 6 weeks after the end of each quarter.
    Historical data is available from Q1 2009.

Update Frequency: Quarterly (February, May, August, November)
Geographic Coverage: Northern Ireland (with LGD breakdowns)

Example:
    >>> from bolster.data_sources.nisra import registrar_general
    >>> data = registrar_general.get_quarterly_vital_statistics()
    >>> sorted(data.keys())
    ['births', 'deaths', 'lgd']
    >>> births = registrar_general.get_quarterly_births()
    >>> 'total_births' in births.columns
    True
    >>> lgd_df = registrar_general.get_lgd_statistics()
    >>> 'lgd' in lgd_df.columns
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd
from openpyxl import load_workbook

from bolster.utils.web import session

from ._base import (
    NISRADataNotFoundError,
    NISRAValidationError,
    download_file,
    safe_float,
    safe_int,
)

[docs] logger = logging.getLogger(__name__)
# Base URL for registrar general statistics
[docs] REGISTRAR_GENERAL_BASE_URL = ( "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/registrar-general-quarterly-report" )
# List of 11 Local Government Districts in Northern Ireland
[docs] NI_LGDS = [ "Antrim and Newtownabbey", "Ards and North Down", "Armagh City, Banbridge and Craigavon", "Belfast", "Causeway Coast and Glens", "Derry City and Strabane", "Fermanagh and Omagh", "Lisburn and Castlereagh", "Mid and East Antrim", "Mid Ulster", "Newry, Mourne and Down", ]
[docs] def get_latest_publication_url() -> tuple[str, str, int, int]: """Scrape NISRA to find the latest Registrar General Quarterly Tables file. Navigates the publication structure: 1. Scrapes the Registrar General Quarterly Report page 2. Finds the latest quarterly tables publication link 3. Extracts the Excel file URL Returns: Tuple of (excel_url, publication_page_url, year, quarter) Raises: NISRADataNotFoundError: If publication or file not found """ from bs4 import BeautifulSoup mother_page = REGISTRAR_GENERAL_BASE_URL try: response = session.get(mother_page, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch Registrar General page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find the latest quarterly tables publication link # Pattern: "Registrar General Quarterly Tables, Quarter X YYYY" pub_link = None year = None quarter = None for link in soup.find_all("a", href=True): link_text = link.get_text(strip=True) # Match pattern like "Registrar General Quarterly Tables, Quarter 3 2025" # or "Registrar General Quarterly Tables Quarter 3 2025" if "Quarterly Tables" in link_text and "Quarter" in link_text: href = link["href"] if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" # Extract quarter and year from link text match = re.search(r"Quarter\s*(\d)\s*(\d{4})", link_text) if match: quarter = int(match.group(1)) year = int(match.group(2)) pub_link = href logger.info(f"Found Registrar General Quarterly Tables: Q{quarter} {year}") break if not pub_link: raise NISRADataNotFoundError("Could not find Registrar General Quarterly Tables publication") # Scrape the publication page for Excel file try: pub_response = session.get(pub_link, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Find Excel file link # Pattern: "Quarter 3 2025 Tables.xlsx" or similar excel_url = None for link in pub_soup.find_all("a", href=True): href = link["href"] link_text = link.get_text(strip=True) if href.endswith(".xlsx") and "Tables" in href: if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" excel_url = href logger.info(f"Found Excel file: {href}") break if not excel_url: raise NISRADataNotFoundError("Could not find Excel file on publication page") return excel_url, pub_link, year, quarter
[docs] def parse_quarterly_births(sheet) -> pd.DataFrame: """Parse Table 1a - Quarterly Births from the Excel sheet. The births table contains quarterly data from Q1 2009 with: - Total births, birth rate, stillbirths - Births outside marriage (count and %) - Teenage births (count and %) - Births to mothers 30+ (count and %) Args: sheet: openpyxl worksheet object for Table 1a Returns: DataFrame with quarterly births data """ records = [] # Find header row - look for "Registration Year" or similar header_row = None for row_idx, row in enumerate(sheet.iter_rows(min_row=1, max_row=20, values_only=True), 1): row_str = [str(cell).lower() if cell else "" for cell in row] # Look for registration year in the row if any("registration year" in cell for cell in row_str): header_row = row_idx break if not header_row: logger.warning("Could not find header row in births table") return pd.DataFrame() logger.debug(f"Found births header row at row {header_row}") # Parse data rows - columns are fixed positions based on actual file structure # Col 0: Registration Year # Col 1: Registration Quarter # Col 2: Total Live Births # Col 3: Live Births Rate # Col 4: Births Outside Marriage # Col 5: % Births Outside Marriage # Col 6: Births To Teenage Mothers # Col 7: % Births To Teenage Mothers # Col 8: Births to Mothers Aged 30+ # Col 9: % Births to Mothers Aged 30+ # Col 10: Stillbirths for row in sheet.iter_rows(min_row=header_row + 1, max_row=200, values_only=True): year = safe_int(row[0]) quarter = safe_int(row[1]) if year is None or quarter is None: continue # Skip if year is before 2009 if year < 2009: continue total_births = safe_int(row[2]) if total_births is None: continue record = { "year": year, "quarter": quarter, "total_births": total_births, "birth_rate": safe_float(row[3]), "births_outside_marriage": safe_int(row[4]), "pct_outside_marriage": safe_float(row[5]), "teenage_births": safe_int(row[6]), "pct_teenage": safe_float(row[7]), "births_30_plus": safe_int(row[8]), "pct_30_plus": safe_float(row[9]), "stillbirths": safe_int(row[10]) if len(row) > 10 else None, } records.append(record) df = pd.DataFrame(records) if len(df) > 0: # Ensure year and quarter are integers df["year"] = df["year"].astype(int) df["quarter"] = df["quarter"].astype(int) # Create date column (first day of quarter) df["date"] = pd.to_datetime( df.apply( lambda r: f"{int(r['year'])}-{(int(r['quarter']) - 1) * 3 + 1:02d}-01", axis=1, ) ) df = df.sort_values(["year", "quarter"]).reset_index(drop=True) logger.info(f"Parsed {len(df)} quarterly birth records") return df
[docs] def parse_quarterly_deaths(sheet) -> pd.DataFrame: """Parse Table 1b - Quarterly Deaths from the Excel sheet. The deaths table contains quarterly data with: - Total deaths and death rate - Marriages and civil partnerships - Infant deaths Args: sheet: openpyxl worksheet object for Table 1b Returns: DataFrame with quarterly deaths/marriages data """ records = [] # Find header row - look for "Registration Year" or similar header_row = None for row_idx, row in enumerate(sheet.iter_rows(min_row=1, max_row=20, values_only=True), 1): row_str = [str(cell).lower() if cell else "" for cell in row] if any("registration year" in cell for cell in row_str): header_row = row_idx break if not header_row: logger.warning("Could not find header row in deaths table") return pd.DataFrame() logger.debug(f"Found deaths header row at row {header_row}") # Parse data rows - columns are fixed positions based on actual file structure # Col 0: Registration Year # Col 1: Registration Quarter # Col 2: Number of Deaths # Col 3: Death Rate # Col 4: Infant Deaths # Col 5: Total Marriages # Col 6: Civil Marriages (% of total) # Col 7: Same-Sex Marriage (% of total) # Col 8: Civil Partnerships for row in sheet.iter_rows(min_row=header_row + 1, max_row=200, values_only=True): year = safe_int(row[0]) quarter = safe_int(row[1]) if year is None or quarter is None: continue if year < 2009: continue deaths = safe_int(row[2]) if deaths is None: continue record = { "year": year, "quarter": quarter, "deaths": deaths, "death_rate": safe_float(row[3]), "infant_deaths": safe_int(row[4]), "marriages": safe_int(row[5]), "civil_marriage_pct": safe_float(row[6]), "same_sex_pct": safe_float(row[7]) if row[7] != "-" else None, "civil_partnerships": safe_int(row[8]) if len(row) > 8 else None, } records.append(record) df = pd.DataFrame(records) if len(df) > 0: # Ensure year and quarter are integers df["year"] = df["year"].astype(int) df["quarter"] = df["quarter"].astype(int) df["date"] = pd.to_datetime( df.apply( lambda r: f"{int(r['year'])}-{(int(r['quarter']) - 1) * 3 + 1:02d}-01", axis=1, ) ) df = df.sort_values(["year", "quarter"]).reset_index(drop=True) logger.info(f"Parsed {len(df)} quarterly deaths/marriages records") return df
[docs] def parse_lgd_statistics(sheet) -> pd.DataFrame: """Parse Table 3b - LGD-level statistics from the Excel sheet. The LGD table contains current quarter statistics by Local Government District: - Population estimate - Births and birth rate - Deaths and death rate - Marriages Args: sheet: openpyxl worksheet object for Table 3b Returns: DataFrame with LGD-level statistics """ records = [] # Find header row - look for "Area" column (first column header) header_row = None for row_idx, row in enumerate(sheet.iter_rows(min_row=1, max_row=20, values_only=True), 1): row_str = [str(cell).lower() if cell else "" for cell in row] # The header row has "area" as first column and "mid-year population" in second if "area" in row_str or any("population" in cell for cell in row_str): header_row = row_idx break if not header_row: logger.warning("Could not find header row in LGD table") return pd.DataFrame() logger.debug(f"Found LGD header row at row {header_row}") # Parse data rows - columns are fixed positions based on actual file structure # Col 0: Area (LGD name) # Col 1: Mid-Year population estimate # Col 2: Total Births # Col 3: Birth Rate # Col 4: Number of births outside marriage # Col 5: % of births outside marriage # Col 6: Stillbirths # Col 7: All Deaths # Col 8: Death Rate # Col 9: Deaths of children under 1 Year # Col 10: Deaths due to Cancer # Col 11: Deaths due to Covid-19 # Col 12: Deaths due to Ischaemic Heart Disease # Col 13: Marriages for row in sheet.iter_rows(min_row=header_row + 1, max_row=30, values_only=True): lgd_name = str(row[0]).strip() if row[0] else "" # Check if this is a valid LGD row is_lgd = any(lgd.lower() in lgd_name.lower() or lgd_name.lower() in lgd.lower() for lgd in NI_LGDS) if not is_lgd: continue record = { "lgd": lgd_name, "population": safe_int(row[1]), "births": safe_int(row[2]), "birth_rate": safe_float(row[3]), "births_outside_marriage": safe_int(row[4]), "pct_outside_marriage": safe_float(row[5]), "stillbirths": safe_int(row[6]), "deaths": safe_int(row[7]), "death_rate": safe_float(row[8]), "infant_deaths": safe_int(row[9]), "cancer_deaths": safe_int(row[10]), "covid_deaths": safe_int(row[11]), "heart_disease_deaths": safe_int(row[12]), "marriages": safe_int(row[13]) if len(row) > 13 else None, } records.append(record) df = pd.DataFrame(records) # Remove any rows with empty LGD names if not df.empty and "lgd" in df.columns: df = df[df["lgd"].str.strip() != ""] df = df.reset_index(drop=True) logger.info(f"Parsed {len(df)} LGD records") return df
[docs] def parse_quarterly_tables(file_path: str | Path) -> dict[str, pd.DataFrame]: """Parse the Registrar General Quarterly Tables Excel file. The file contains multiple tables: - Table 1a: Quarterly births and stillbirths - Table 1b: Quarterly deaths, marriages, civil partnerships - Table 3b: Current quarter by LGD Args: file_path: Path to the quarterly tables Excel file Returns: Dict with keys 'births', 'deaths', 'lgd' containing DataFrames """ file_path = Path(file_path) wb = load_workbook(file_path, data_only=True) results = {} # Parse Table 1a (births) - look for exact "Table 1a" match births_sheet = None for name in wb.sheetnames: # Must start with "Table" to avoid matching "Figure 1a" if name.lower().startswith("table") and "1a" in name.lower(): births_sheet = wb[name] logger.debug(f"Found births sheet: {name}") break if births_sheet: results["births"] = parse_quarterly_births(births_sheet) else: logger.warning("Could not find births table (Table 1a)") results["births"] = pd.DataFrame() # Parse Table 1b (deaths/marriages) - look for exact "Table 1b" match deaths_sheet = None for name in wb.sheetnames: if name.lower().startswith("table") and "1b" in name.lower(): deaths_sheet = wb[name] logger.debug(f"Found deaths sheet: {name}") break if deaths_sheet: results["deaths"] = parse_quarterly_deaths(deaths_sheet) else: logger.warning("Could not find deaths table (Table 1b)") results["deaths"] = pd.DataFrame() # Parse Table 3b (LGD statistics) - look for exact "Table 3b" match lgd_sheet = None for name in wb.sheetnames: if name.lower().startswith("table") and "3b" in name.lower(): lgd_sheet = wb[name] logger.debug(f"Found LGD sheet: {name}") break if lgd_sheet: results["lgd"] = parse_lgd_statistics(lgd_sheet) else: logger.warning("Could not find LGD table (Table 3b)") results["lgd"] = pd.DataFrame() return results
[docs] def get_quarterly_vital_statistics( force_refresh: bool = False, ) -> dict[str, pd.DataFrame]: """Get all quarterly vital statistics from the Registrar General Tables. Automatically discovers and downloads the most recent quarterly tables publication from NISRA. Args: force_refresh: If True, bypass cache and download fresh data Returns: Dict with keys: - 'births': DataFrame with quarterly births data - 'deaths': DataFrame with quarterly deaths/marriages data - 'lgd': DataFrame with LGD-level breakdowns Raises: NISRADataNotFoundError: If latest publication cannot be found Example: >>> data = get_quarterly_vital_statistics() >>> sorted(data.keys()) ['births', 'deaths', 'lgd'] """ excel_url, pub_url, year, quarter = get_latest_publication_url() logger.info(f"Downloading Q{quarter} {year} quarterly tables from: {excel_url}") # Cache for 90 days (quarterly data) cache_ttl_hours = 90 * 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh) return parse_quarterly_tables(file_path)
[docs] def get_quarterly_births(force_refresh: bool = False) -> pd.DataFrame: """Get quarterly births data. Convenience function to get only the births table. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - year: int (2009+) - quarter: int (1-4) - date: datetime (first day of quarter) - total_births: int - birth_rate: float (per 1,000 population) - stillbirths: int (if available) Example: >>> births = get_quarterly_births() >>> 'total_births' in births.columns True """ data = get_quarterly_vital_statistics(force_refresh=force_refresh) return data["births"]
[docs] def get_quarterly_deaths(force_refresh: bool = False) -> pd.DataFrame: """Get quarterly deaths and marriages data. Convenience function to get only the deaths table. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - year: int (2009+) - quarter: int (1-4) - date: datetime - deaths: int - death_rate: float (per 1,000 population) - marriages: int - civil_partnerships: int Example: >>> deaths = get_quarterly_deaths() >>> 'deaths' in deaths.columns True """ data = get_quarterly_vital_statistics(force_refresh=force_refresh) return data["deaths"]
[docs] def get_lgd_statistics(force_refresh: bool = False) -> pd.DataFrame: """Get current quarter LGD-level statistics. Convenience function to get the LGD breakdown table. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - lgd: str (Local Government District name) - population: int (mid-year estimate) - births: int - birth_rate: float - deaths: int - death_rate: float - marriages: int Example: >>> lgd = get_lgd_statistics() >>> 'lgd' in lgd.columns True """ data = get_quarterly_vital_statistics(force_refresh=force_refresh) return data["lgd"]
[docs] def validate_against_monthly_births( quarterly_df: pd.DataFrame, monthly_df: pd.DataFrame | None = None, ) -> pd.DataFrame: """Compare quarterly births totals against aggregated monthly births. Cross-validates quarterly data against monthly data to verify consistency. Some differences are expected due to timing of registrations. Args: quarterly_df: Quarterly births DataFrame from get_quarterly_births() monthly_df: Monthly births DataFrame from births module (auto-loaded if None) Returns: DataFrame with columns: - year: int - quarter: int - quarterly_total: int - monthly_sum: int - difference: int - pct_diff: float (percentage difference) Example: >>> births_q = get_quarterly_births() >>> validation = validate_against_monthly_births(births_q) >>> 'quarterly_total' in validation.columns True """ if monthly_df is None: from . import births monthly_data = births.get_latest_births(event_type="registration") # Get Persons total for each month monthly_df = monthly_data[monthly_data["sex"] == "Persons"].copy() monthly_df["year"] = monthly_df["month"].dt.year monthly_df["month_num"] = monthly_df["month"].dt.month monthly_df["quarter"] = ((monthly_df["month_num"] - 1) // 3) + 1 # Aggregate monthly to quarterly monthly_quarterly = monthly_df.groupby(["year", "quarter"]).agg(monthly_sum=("births", "sum")).reset_index() # Merge with quarterly data comparison = quarterly_df[["year", "quarter", "total_births"]].merge( monthly_quarterly, on=["year", "quarter"], how="inner", ) comparison = comparison.rename(columns={"total_births": "quarterly_total"}) comparison["difference"] = comparison["quarterly_total"] - comparison["monthly_sum"] comparison["pct_diff"] = (comparison["difference"] / comparison["monthly_sum"] * 100).round(2) return comparison.sort_values(["year", "quarter"])
[docs] def validate_against_monthly_marriages( quarterly_df: pd.DataFrame, monthly_df: pd.DataFrame | None = None, ) -> pd.DataFrame: """Compare quarterly marriages totals against aggregated monthly marriages. Cross-validates quarterly data against monthly data to verify consistency. Args: quarterly_df: Quarterly deaths DataFrame from get_quarterly_deaths() monthly_df: Monthly marriages DataFrame from marriages module Returns: DataFrame with comparison columns Example: >>> deaths_q = get_quarterly_deaths() >>> validation = validate_against_monthly_marriages(deaths_q) >>> 'quarterly_total' in validation.columns True """ if monthly_df is None: from . import marriages monthly_df = marriages.get_latest_marriages() monthly_df["quarter"] = ((monthly_df["date"].dt.month - 1) // 3) + 1 # Aggregate monthly to quarterly monthly_quarterly = monthly_df.groupby(["year", "quarter"]).agg(monthly_sum=("marriages", "sum")).reset_index() # Merge with quarterly data comparison = quarterly_df[["year", "quarter", "marriages"]].merge( monthly_quarterly, on=["year", "quarter"], how="inner", ) comparison = comparison.rename(columns={"marriages": "quarterly_total"}) comparison["difference"] = comparison["quarterly_total"] - comparison["monthly_sum"] comparison["pct_diff"] = (comparison["difference"] / comparison["monthly_sum"] * 100).round(2) return comparison.sort_values(["year", "quarter"])
[docs] def get_validation_report( force_refresh: bool = False, ) -> dict[str, pd.DataFrame]: """Run all cross-validations and return comprehensive report. Compares quarterly data against monthly data sources to verify consistency. Args: force_refresh: If True, bypass cache and download fresh data Returns: Dict with keys: - 'births_validation': Quarterly vs monthly births comparison - 'marriages_validation': Quarterly vs monthly marriages comparison - 'summary': Overall validation statistics Example: >>> report = get_validation_report() >>> 'summary' in report True """ data = get_quarterly_vital_statistics(force_refresh=force_refresh) results = {} # Births validation if not data["births"].empty: try: results["births_validation"] = validate_against_monthly_births(data["births"]) except Exception as e: logger.warning(f"Could not validate births: {e}") results["births_validation"] = pd.DataFrame() # Marriages validation if not data["deaths"].empty and "marriages" in data["deaths"].columns: try: results["marriages_validation"] = validate_against_monthly_marriages(data["deaths"]) except Exception as e: logger.warning(f"Could not validate marriages: {e}") results["marriages_validation"] = pd.DataFrame() # Create summary summary_data = [] for key, validation_df in results.items(): if not validation_df.empty and "pct_diff" in validation_df.columns: avg_diff = validation_df["pct_diff"].abs().mean() max_diff = validation_df["pct_diff"].abs().max() quarters_validated = len(validation_df) summary_data.append( { "validation": key, "quarters_compared": quarters_validated, "avg_pct_diff": round(avg_diff, 2), "max_pct_diff": round(max_diff, 2), "within_2pct": (validation_df["pct_diff"].abs() <= 2).sum(), } ) results["summary"] = pd.DataFrame(summary_data) return results
[docs] def validate_data(df: pd.DataFrame, data_type: str = "births") -> bool: """Validate quarterly data for consistency and reasonable values. Args: df: DataFrame to validate data_type: Type of data ('births', 'deaths', or 'lgd') Returns: True if validation passes Raises: NISRAValidationError: If validation fails """ if df.empty: raise NISRAValidationError(f"Empty DataFrame for {data_type}") if data_type == "births": required_cols = ["year", "quarter", "total_births"] for col in required_cols: if col not in df.columns: raise NISRAValidationError(f"Missing required column: {col}") # Check no negative values if (df["total_births"] < 0).any(): raise NISRAValidationError("Negative birth counts found") # Check reasonable range (NI typically has 4,000-6,000 births per quarter) if df["total_births"].max() > 10000: raise NISRAValidationError("Unreasonably high quarterly birth count") # Check we have historical data if df["year"].min() > 2010: raise NISRAValidationError(f"Expected data from 2009, earliest is {df['year'].min()}") elif data_type == "deaths": required_cols = ["year", "quarter", "deaths"] for col in required_cols: if col not in df.columns: raise NISRAValidationError(f"Missing required column: {col}") if (df["deaths"] < 0).any(): raise NISRAValidationError("Negative death counts found") elif data_type == "lgd": if len(df) < 11: raise NISRAValidationError(f"Expected 11 LGDs, found {len(df)}") logger.info(f"Validation passed for {data_type} data") return True