# Source code for bolster.data_sources.nisra.marriages

"""NISRA Marriage and Civil Partnership Registrations Data Source.

Provides access to monthly marriage and civil partnership registration data for Northern Ireland.

Data includes:
- Monthly marriage registrations from 2006 to present
- Monthly civil partnership registrations from 2006 to present
- Total registrations by month and year
- Historical time series for trend analysis

Registrations represent when the event was registered, not when the ceremony occurred.
The data is published monthly with provisional figures for the current year and final figures for
previous years.

Data Source:
    **Marriages Mother Page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/marriages
    **Civil Partnerships Page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/civil-partnerships

    These pages list all relevant statistics publications in reverse chronological order
    (newest first). The module automatically scrapes these pages to find the latest
    publications, then downloads the Excel files.

Update Frequency: Monthly (published around 11th of the following month)
Geographic Coverage: Northern Ireland
Reference Date: Month of registration

Example:
    >>> from bolster.data_sources.nisra import marriages
    >>> # Get latest marriage registrations
    >>> df = marriages.get_latest_marriages()
    >>> sorted(df.columns.tolist())
    ['date', 'marriages', 'month', 'year']

    >>> # Get latest civil partnership registrations
    >>> cp_df = marriages.get_latest_civil_partnerships()
    >>> sorted(cp_df.columns.tolist())
    ['civil_partnerships', 'date', 'month', 'year']

    >>> # Filter for a specific year
    >>> df_2024 = df[df['year'] == 2024]
    >>> len(df_2024) > 0
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Landing ("mother") pages on the NISRA site that list the marriage and civil
# partnership statistics publications. The publication-URL scrapers in this
# module start from these pages.
MARRIAGES_BASE_URL = "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/marriages"
CIVIL_PARTNERSHIPS_BASE_URL = "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/civil-partnerships"
def get_latest_marriages_publication_url() -> tuple[str, str]:
    """Scrape the NISRA marriages mother page to find the latest monthly marriages file.

    Navigates the publication structure:
    1. Scrapes the mother page for the first "Monthly Marriages" publication link
    2. Follows that link to the publication detail page
    3. Finds the marriages Excel file on that page

    Returns:
        Tuple of (excel_file_url, publication_date). ``publication_date`` is the
        "<Month> <Year>" text found in the publication link, or "Unknown" when the
        link text carries no such date.

    Raises:
        NISRADataNotFoundError: If either page cannot be fetched, or the
            publication link or Excel file cannot be found
    """
    from urllib.parse import urljoin, urlsplit

    from bs4 import BeautifulSoup

    mother_page = MARRIAGES_BASE_URL

    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch marriages mother page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find latest "Monthly Marriages" publication
    # Pattern: "Monthly Marriages - November 2025" or similar
    pub_link = None
    pub_date = None

    for link in soup.find_all("a", href=True):
        link_text = link.get_text(strip=True)
        href = link["href"]

        if "Monthly Marriages" not in link_text or "publications" not in href:
            continue

        # Extract month/year from link text if available
        date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text)
        if date_match:
            pub_date = f"{date_match.group(1)} {date_match.group(2)}"

        # urljoin resolves root-relative, page-relative and protocol-relative
        # hrefs against the page they were found on; absolute hrefs pass through.
        pub_link = urljoin(mother_page, href)
        logger.info(f"Found Monthly Marriages publication: {link_text}")
        # Take first match (page lists publications newest first)
        break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find Monthly Marriages publication on mother page")

    # Scrape the publication page for Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Find marriages Excel file
    # Pattern: "Monthly Marriages November 2025.xlsx" or similar
    excel_url = None

    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        # Test the extension on the URL path only, case-insensitively, so a
        # query string or an upper-case ".XLSX" does not hide the file.
        is_xlsx = urlsplit(href).path.lower().endswith(".xlsx")
        if "Marriages" in href and is_xlsx:
            excel_url = urljoin(pub_link, href)
            logger.info(f"Found marriages Excel file: {excel_url}")
            break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find marriages Excel file on publication page")

    return excel_url, pub_date or "Unknown"
def parse_marriages_file(file_path: str | Path) -> pd.DataFrame:
    """Parse a NISRA monthly marriages Excel file into long format.

    The "Marriages" sheet holds a wide-format table:
    - Rows: Months (January-December) plus a "Total" row
    - Columns: Years (2006-present)
    - Values: Number of marriage registrations

    The table is reshaped to one row per (year, month).

    Args:
        file_path: Path to the marriages Excel file

    Returns:
        DataFrame sorted by date with columns:
            - date: datetime (first day of month)
            - year: int (year of registration)
            - month: str (month name)
            - marriages: numeric count; NaN where the cell was "-", empty or
              otherwise non-numeric

    Raises:
        NISRAValidationError: If the file cannot be read, the month or year
            columns cannot be identified, a row label is not a month name,
            or the table contains no monthly rows
    """
    file_path = Path(file_path)

    try:
        # Skip the three title rows; the fourth row becomes the header.
        df_raw = pd.read_excel(
            file_path,
            sheet_name="Marriages",
            skiprows=3,  # Skip "All Marriages" title rows
            nrows=13,  # Read months + total row (total is filtered out below)
        )
    except Exception as e:
        raise NISRAValidationError(f"Failed to read marriages file: {e}") from e

    if df_raw.shape[1] == 0:
        raise NISRAValidationError("Could not find month column in marriages data")

    # First column should be month names; otherwise search the headers for it.
    month_col = df_raw.columns[0]
    if month_col != "Month of \nRegistration":
        month_col = next(
            (col for col in df_raw.columns if "Month" in str(col) or "Registration" in str(col)),
            None,
        )
        if month_col is None:
            raise NISRAValidationError("Could not find month column in marriages data")

    df_raw = df_raw.rename(columns={month_col: "month"})

    # Filter out the "Total" row
    df_raw = df_raw[df_raw["month"] != "Total"].copy()

    # Only melt columns whose header carries a 4-digit year, so stray columns
    # (e.g. unnamed/blank ones) cannot break the integer conversion below.
    year_cols = [col for col in df_raw.columns if col != "month" and re.search(r"\d{4}", str(col))]
    if not year_cols:
        raise NISRAValidationError("Could not find any year columns in marriages data")

    # Convert to long format
    df_long = df_raw.melt(
        id_vars=["month"],
        value_vars=year_cols,
        var_name="year",
        value_name="marriages",
    )

    if df_long.empty:
        raise NISRAValidationError("Marriages data contains no monthly rows")

    # Clean year column - extract just the year number
    # Handle cases like "2025\n[Note 1]\n[Note 2]"
    df_long["year"] = df_long["year"].astype(str).str.extract(r"(\d{4})")[0].astype(int)

    # Coercion turns "-" placeholders, blanks and any other non-numeric cell into NaN.
    df_long["marriages"] = pd.to_numeric(df_long["marriages"], errors="coerce")

    month_map = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }

    df_long["month_num"] = df_long["month"].map(month_map)

    if df_long["month_num"].isna().any():
        raise NISRAValidationError(
            f"Unrecognized month names: {df_long[df_long['month_num'].isna()]['month'].unique()}"
        )

    # Create datetime (first day of each month)
    df_long["date"] = pd.to_datetime({"year": df_long["year"], "month": df_long["month_num"], "day": 1})

    # Select and reorder final columns, then sort chronologically
    result = df_long[["date", "year", "month", "marriages"]].copy()
    result = result.sort_values("date").reset_index(drop=True)

    # Log summary
    total_records = len(result)
    missing_records = result["marriages"].isna().sum()
    date_range = f"{result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')}"

    logger.info(f"Parsed {total_records} monthly marriage records ({date_range})")
    if missing_records > 0:
        logger.info(f"  {missing_records} records have missing data")

    return result
def get_latest_marriages(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the most recent monthly marriage registrations workbook.

    The latest publication is discovered on the NISRA website, downloaded
    (through the download cache) and parsed.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame as produced by ``parse_marriages_file``, with columns
        ``date``, ``year``, ``month`` and ``marriages``.

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_marriages()
        >>> sorted(df.columns.tolist())
        ['date', 'marriages', 'month', 'year']

        >>> df_2024 = df[df['year'] == 2024]
        >>> total_2024 = df_2024['marriages'].sum()
        >>> bool(total_2024 > 0)
        True
    """
    # Step 1: locate the newest workbook on the NISRA site.
    excel_url, pub_date = get_latest_marriages_publication_url()
    logger.info(f"Downloading marriages data ({pub_date}) from: {excel_url}")

    # Step 2: download it, keeping the cached copy for 30 days (expressed in hours).
    thirty_days_in_hours = 30 * 24
    workbook_path = download_file(
        excel_url,
        cache_ttl_hours=thirty_days_in_hours,
        force_refresh=force_refresh,
    )

    # Step 3: reshape the workbook into the long-format frame.
    return parse_marriages_file(workbook_path)
def validate_marriages_temporal_continuity(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Check that every year in the marriage data has a plausible number of months.

    For each distinct year, the number of non-missing ``marriages`` values must be
    between 1 and 12 inclusive. Partial years are accepted.

    Args:
        df: DataFrame from parse_marriages_file or get_latest_marriages

    Returns:
        True if validation passes

    Raises:
        NISRAValidationError: If a year has no data at all, or more than 12
            non-missing monthly values
    """
    years = df["year"]
    counts = df["marriages"]

    for year in years.unique():
        # Number of months in this year that actually carry a value
        month_count = counts[years == year].notna().sum()

        if month_count == 0:
            raise NISRAValidationError(f"Year {year} has no data")

        if month_count > 12:
            raise NISRAValidationError(f"Year {year} has {month_count} months (expected max 12)")

    logger.info("Validation passed: Temporal continuity check")
    return True
def get_marriages_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Return only the rows of the marriage data that belong to one year.

    Args:
        df: DataFrame from get_latest_marriages()
        year: Year to filter

    Returns:
        Rows whose ``year`` equals ``year``, re-indexed from 0

    Example:
        >>> df = get_latest_marriages()
        >>> df_2024 = get_marriages_by_year(df, 2024)
        >>> total = df_2024['marriages'].sum()
        >>> bool(total > 0)
        True
    """
    in_year = df["year"] == year
    selected = df[in_year]
    return selected.reset_index(drop=True)
def get_marriages_summary_by_year(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate annual marriage totals and statistics.

    Missing (NaN) monthly values are ignored by all three statistics.

    Args:
        df: DataFrame from get_latest_marriages()

    Returns:
        DataFrame with one row per year and columns:
            - year: int
            - total_marriages: sum of the monthly values for the year
            - months_reported: int (number of months with data)
            - avg_per_month: float (mean of the reported months, rounded to 1 dp)

    Example:
        >>> df = get_latest_marriages()
        >>> summary = get_marriages_summary_by_year(df)
        >>> sorted(summary.columns.tolist())
        ['avg_per_month', 'months_reported', 'total_marriages', 'year']
    """
    # Built-in reducer names instead of Python lambdas; "count" counts
    # non-missing values, i.e. the months that actually reported data.
    summary = (
        df.groupby("year")
        .agg(
            total_marriages=("marriages", "sum"),
            months_reported=("marriages", "count"),
            avg_per_month=("marriages", "mean"),
        )
        .reset_index()
    )

    # Round average
    summary["avg_per_month"] = summary["avg_per_month"].round(1)

    return summary
# ============================================================================ # Civil Partnership Functions # ============================================================================
def get_latest_civil_partnerships_publication_url() -> tuple[str, str]:
    """Scrape the NISRA civil partnerships page to find the latest monthly file.

    Follows the first link whose href contains "monthly-civil-partnerships" to the
    publication page, then picks the first civil partnerships ``.xlsx`` link there.

    Returns:
        Tuple of (excel_file_url, publication_date). ``publication_date`` is the
        "<Month> <Year>" found at the end of the Excel file name, or "Unknown"
        when the file name carries no such date.

    Raises:
        NISRADataNotFoundError: If either page cannot be fetched, or the
            publication link or Excel file cannot be found
    """
    from urllib.parse import unquote, urljoin, urlsplit

    from bs4 import BeautifulSoup

    mother_page = CIVIL_PARTNERSHIPS_BASE_URL

    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch civil partnerships page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find "Monthly Civil Partnerships" publication link
    pub_link = None
    pub_date = None

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "monthly-civil-partnerships" not in href.lower():
            continue

        link_text = link.get_text(strip=True)
        # urljoin resolves root-relative, page-relative and protocol-relative
        # hrefs against the page they were found on; absolute hrefs pass through.
        pub_link = urljoin(mother_page, href)
        logger.info(f"Found Monthly Civil Partnerships publication: {link_text}")
        break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find Monthly Civil Partnerships publication")

    # Scrape the publication page for Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Find civil partnerships Excel file
    excel_url = None

    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        path = urlsplit(href).path
        # Extension is tested on the URL path, case-insensitively, so a query
        # string or ".XLSX" does not hide the file.
        if not ("Civil" in href and "Partnership" in href and path.lower().endswith(".xlsx")):
            continue

        # Extract date from filename if possible.
        # Pattern: "Monthly Civil Partnerships December 2025.xlsx". In a URL the
        # spaces are usually percent-encoded or replaced by "-"/"_", so decode
        # the file name and accept those separators as well as whitespace.
        filename = unquote(path).rsplit("/", 1)[-1]
        stem = filename[: -len(".xlsx")]
        date_match = re.search(r"([A-Z][a-z]+)[\s_-]+(\d{4})$", stem)
        if date_match:
            pub_date = f"{date_match.group(1)} {date_match.group(2)}"

        excel_url = urljoin(pub_link, href)
        logger.info(f"Found civil partnerships Excel file: {excel_url}")
        break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find civil partnerships Excel file on publication page")

    return excel_url, pub_date or "Unknown"
def parse_civil_partnerships_file(file_path: str | Path) -> pd.DataFrame:
    """Parse a NISRA monthly civil partnerships Excel file into long format.

    The "Civil Partnerships" sheet holds a wide-format table:
    - Rows: Months (January-December)
    - Columns: Years (2006-present)
    - Values: Number of civil partnership registrations

    Cells that are empty, "-" or not convertible to a number are skipped, so
    the result only contains (year, month) pairs that have a count.

    Args:
        file_path: Path to the civil partnerships Excel file

    Returns:
        DataFrame sorted by date with columns:
            - date: datetime (first day of month)
            - year: int (year of registration)
            - month: str (month name)
            - civil_partnerships: int (number of civil partnership registrations)

    Raises:
        NISRAValidationError: If the file cannot be read, the sheet is empty,
            or no civil partnership counts can be extracted from it
    """
    file_path = Path(file_path)

    try:
        # Sheet layout:
        #   Row 0: Title
        #   Row 1: "This sheet contains..."
        #   Row 2: "All Civil Partnerships"
        #   Row 3: Headers (Month of Registration, 2006, 2007, ...)
        #   Row 4+: Data (January, February, ...)
        df_raw = pd.read_excel(
            file_path,
            sheet_name="Civil Partnerships",
            header=None,
            skiprows=3,  # Skip to header row
            nrows=13,  # Read header + 12 months
        )
    except Exception as e:
        raise NISRAValidationError(f"Failed to read civil partnerships file: {e}") from e

    if df_raw.empty:
        raise NISRAValidationError("Civil partnerships sheet contains no data")

    # First row is the header
    headers = df_raw.iloc[0].tolist()
    df_raw = df_raw.iloc[1:].reset_index(drop=True)
    df_raw.columns = headers

    # Find the month column by header text, falling back to the first column
    month_col = next(
        (col for col in df_raw.columns if "Month" in str(col) or "Registration" in str(col)),
        None,
    )
    if month_col is None:
        month_col = df_raw.columns[0]

    df_raw = df_raw.rename(columns={month_col: "month"})

    month_map = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }

    # Filter out Total row and any non-month rows
    df_raw = df_raw[df_raw["month"].isin(list(month_map))].copy()

    # Identify year columns: any header (other than the month column) that
    # contains a 4-digit number, e.g. 2006 or "2025\n[Note 1]".
    year_cols = []
    for col in df_raw.columns:
        if col == "month":
            continue
        year_match = re.search(r"(\d{4})", str(col))
        if year_match:
            year_cols.append((col, int(year_match.group(1))))

    # Build long-format data, one record per populated (month, year) cell
    records = []
    for _, row in df_raw.iterrows():
        month_name = row["month"]
        month_num = month_map[month_name]

        for col, year in year_cols:
            val = row[col]
            if pd.isna(val) or val == "-":
                continue
            try:
                civil_partnerships = int(float(val))
            except (ValueError, TypeError, OverflowError):
                # Non-numeric cell (e.g. a footnote marker): skip it
                continue
            records.append(
                {
                    "year": year,
                    "month": month_name,
                    "month_num": month_num,
                    "civil_partnerships": civil_partnerships,
                }
            )

    if not records:
        # Without this guard the column lookups below fail with a bare KeyError.
        raise NISRAValidationError("No civil partnership counts found in civil partnerships data")

    df = pd.DataFrame(records)

    # Create datetime column
    df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})

    # Select and reorder columns, then sort chronologically
    result = df[["date", "year", "month", "civil_partnerships"]].copy()
    result = result.sort_values("date").reset_index(drop=True)

    logger.info(
        f"Parsed {len(result)} monthly civil partnership records "
        f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
    )

    return result
def get_latest_civil_partnerships(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the most recent monthly civil partnership registrations workbook.

    The latest publication is discovered on the NISRA website, downloaded
    (through the download cache) and parsed.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame as produced by ``parse_civil_partnerships_file``, with columns
        ``date``, ``year``, ``month`` and ``civil_partnerships``.

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_civil_partnerships()
        >>> sorted(df.columns.tolist())
        ['civil_partnerships', 'date', 'month', 'year']

        >>> df_2024 = df[df['year'] == 2024]
        >>> total = df_2024['civil_partnerships'].sum()
        >>> bool(total >= 0)
        True
    """
    # Step 1: locate the newest workbook on the NISRA site.
    excel_url, pub_date = get_latest_civil_partnerships_publication_url()
    logger.info(f"Downloading civil partnerships data ({pub_date}) from: {excel_url}")

    # Step 2: download it, keeping the cached copy for 30 days (expressed in hours).
    thirty_days_in_hours = 30 * 24
    workbook_path = download_file(
        excel_url,
        cache_ttl_hours=thirty_days_in_hours,
        force_refresh=force_refresh,
    )

    # Step 3: reshape the workbook into the long-format frame.
    return parse_civil_partnerships_file(workbook_path)
def get_civil_partnerships_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Return only the rows of the civil partnership data that belong to one year.

    Args:
        df: DataFrame from get_latest_civil_partnerships()
        year: Year to filter

    Returns:
        Rows whose ``year`` equals ``year``, re-indexed from 0
    """
    in_year = df["year"] == year
    selected = df[in_year]
    return selected.reset_index(drop=True)
def get_civil_partnerships_summary_by_year(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise civil partnership registrations per calendar year.

    Args:
        df: DataFrame from get_latest_civil_partnerships()

    Returns:
        DataFrame with one row per year and columns:
            - year: int
            - total_civil_partnerships: sum of the monthly values
            - months_reported: int (number of non-missing monthly values)
            - avg_per_month: float (mean monthly value, rounded to 1 dp)
    """

    def _reported(values: pd.Series) -> int:
        # Count the months that actually carry a value
        return values.notna().sum()

    per_year = df.groupby("year")["civil_partnerships"]
    summary = per_year.agg(
        total_civil_partnerships="sum",
        months_reported=_reported,
        avg_per_month="mean",
    ).reset_index()

    # Only the average is rounded; totals and counts are left untouched.
    return summary.round({"avg_per_month": 1})