"""NISRA Marriage and Civil Partnership Registrations Data Source.
Provides access to monthly marriage and civil partnership registration data for Northern Ireland.
Data includes:
- Monthly marriage registrations from 2006 to present
- Monthly civil partnership registrations from 2006 to present
- Total registrations by month and year
- Historical time series for trend analysis
Registrations represent when the event was registered, not when the ceremony occurred.
The data is published monthly with provisional figures for the current year and final figures for
previous years.
Data Source:
**Marriages Mother Page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/marriages
**Civil Partnerships Page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/civil-partnerships
These pages list all relevant statistics publications in reverse chronological order
(newest first). The module automatically scrapes these pages to find the latest
publications, then downloads the Excel files.
Update Frequency: Monthly (published around 11th of the following month)
Geographic Coverage: Northern Ireland
Reference Date: Month of registration
Example:
>>> from bolster.data_sources.nisra import marriages
>>> # Get latest marriage registrations
>>> df = marriages.get_latest_marriages()
>>> sorted(df.columns.tolist())
['date', 'marriages', 'month', 'year']
>>> # Get latest civil partnership registrations
>>> cp_df = marriages.get_latest_civil_partnerships()
>>> sorted(cp_df.columns.tolist())
['civil_partnerships', 'date', 'month', 'year']
>>> # Filter for a specific year
>>> df_2024 = df[df['year'] == 2024]
>>> len(df_2024) > 0
True
"""
import logging
import re
from pathlib import Path
import pandas as pd
from bolster.utils.web import session
from ._base import NISRADataNotFoundError, NISRAValidationError, download_file
[docs]
# Module-level logger, named after this module; used by the scraping and parsing functions below.
logger = logging.getLogger(__name__)
# Base URLs for marriage and civil partnership statistics
[docs]
# Marriages "mother page": fetched by get_latest_marriages_publication_url() to locate the newest publication.
MARRIAGES_BASE_URL = "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/marriages"
[docs]
# Civil partnerships landing page: fetched by get_latest_civil_partnerships_publication_url().
CIVIL_PARTNERSHIPS_BASE_URL = "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/civil-partnerships"
[docs]
def get_latest_marriages_publication_url() -> tuple[str, str]:
    """Scrape the NISRA marriages mother page to find the latest monthly marriages file.

    Navigates the publication structure:

    1. Fetches the mother page and takes the first link whose text contains
       "Monthly Marriages" and whose href contains "publications" (the page is
       expected to list publications newest first).
    2. Follows that link to the publication detail page.
    3. Takes the first link on that page whose href contains "Marriages" and
       points at an ``.xlsx`` file.

    Hrefs are resolved against the page they were found on, so absolute,
    root-relative, page-relative and protocol-relative links all work.

    Returns:
        Tuple of (excel_file_url, publication_date). ``publication_date`` is the
        "<Month> <Year>" text parsed from the publication link text, or
        "Unknown" when the link text carries no such date.

    Raises:
        NISRADataNotFoundError: If either page cannot be fetched, or if the
            publication link or the Excel file cannot be found.
    """
    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    mother_page = MARRIAGES_BASE_URL
    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch marriages mother page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find latest "Monthly Marriages" publication
    # Pattern: "Monthly Marriages - November 2025" or similar
    pub_link = None
    pub_date = None
    for link in soup.find_all("a", href=True):
        link_text = link.get_text(strip=True)
        if "Monthly Marriages" not in link_text or "publications" not in link["href"]:
            continue

        # urljoin leaves absolute URLs untouched and resolves every relative form
        pub_link = urljoin(mother_page, link["href"])

        # Extract month/year from link text if available
        date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text)
        if date_match:
            pub_date = f"{date_match.group(1)} {date_match.group(2)}"

        # Take first match (should be newest due to reverse chronological order)
        logger.info(f"Found Monthly Marriages publication: {link_text}")
        break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find Monthly Marriages publication on mother page")

    # Scrape the publication page for Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Find marriages Excel file
    # Pattern: "Monthly Marriages November 2025.xlsx" or similar
    excel_url = None
    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        # Ignore any query string / fragment and letter case when checking the extension
        path_part = href.split("?", 1)[0].split("#", 1)[0]
        if "Marriages" in href and path_part.lower().endswith(".xlsx"):
            excel_url = urljoin(pub_link, href)
            logger.info(f"Found marriages Excel file: {excel_url}")
            break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find marriages Excel file on publication page")

    return excel_url, pub_date or "Unknown"
[docs]
def parse_marriages_file(file_path: str | Path) -> pd.DataFrame:
    """Parse NISRA monthly marriages Excel file.

    The marriages file contains a "Marriages" sheet with a wide-format table:

    - Rows: Months (January-December) followed by a "Total" row
    - Columns: Years (2006-present); headers may carry footnote markers such as
      "2025\\n[Note 1]"
    - Values: Number of marriage registrations

    The table is reshaped to one row per (year, month). Columns whose header
    contains no 4-digit year (for example blank trailing columns) are ignored.

    Args:
        file_path: Path to the marriages Excel file

    Returns:
        DataFrame sorted by date with columns:

        - date: datetime (first day of month)
        - year: int (year of registration)
        - month: str (month name)
        - marriages: numeric count; missing or non-numeric cells (e.g. "-")
          become NaN

    Raises:
        NISRAValidationError: If the file cannot be read, or the month column,
            year columns or month rows cannot be identified
    """
    file_path = Path(file_path)

    try:
        df_raw = pd.read_excel(
            file_path,
            sheet_name="Marriages",
            skiprows=3,  # Skip the title rows above the header row
            nrows=13,  # 12 months + the "Total" row (filtered out below)
        )
    except Exception as e:
        raise NISRAValidationError(f"Failed to read marriages file: {e}") from e

    # The month column is normally the first one ("Month of \nRegistration");
    # scanning left to right finds it there or wherever it has moved to.
    month_col = next(
        (col for col in df_raw.columns if "Month" in str(col) or "Registration" in str(col)),
        None,
    )
    if month_col is None:
        raise NISRAValidationError("Could not find month column in marriages data")

    # Keep only columns that actually name a year, so stray columns cannot
    # break the integer conversion of the year further down.
    year_cols = [col for col in df_raw.columns if col != month_col and re.search(r"\d{4}", str(col))]
    if not year_cols:
        raise NISRAValidationError("Could not find any year columns in marriages data")

    df_raw = df_raw[[month_col, *year_cols]].rename(columns={month_col: "month"})

    # Tolerate stray whitespace around the row labels, then drop the "Total" row
    df_raw["month"] = df_raw["month"].map(lambda label: label.strip() if isinstance(label, str) else label)
    df_raw = df_raw[df_raw["month"] != "Total"].copy()

    # Convert to long format
    df_long = df_raw.melt(
        id_vars=["month"],
        var_name="year",
        value_name="marriages",
    )
    if df_long.empty:
        raise NISRAValidationError("No monthly rows found in marriages data")

    # Clean year column - extract just the year number
    # Handle cases like "2025\n[Note 1]\n[Note 2]"
    df_long["year"] = df_long["year"].astype(str).str.extract(r"(\d{4})")[0].astype(int)

    # errors="coerce" turns placeholders such as "-" and empty cells into NaN
    df_long["marriages"] = pd.to_numeric(df_long["marriages"], errors="coerce")

    month_map = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }
    df_long["month_num"] = df_long["month"].map(month_map)

    unrecognized = df_long.loc[df_long["month_num"].isna(), "month"].unique()
    if len(unrecognized) > 0:
        raise NISRAValidationError(f"Unrecognized month names: {unrecognized}")

    # Create datetime (first day of each month)
    df_long["date"] = pd.to_datetime({"year": df_long["year"], "month": df_long["month_num"], "day": 1})

    # Select and reorder final columns, then sort chronologically
    result = df_long[["date", "year", "month", "marriages"]].copy()
    result = result.sort_values("date").reset_index(drop=True)

    # Log summary
    total_records = len(result)
    missing_records = result["marriages"].isna().sum()
    date_range = f"{result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')}"
    logger.info(f"Parsed {total_records} monthly marriage records ({date_range})")
    if missing_records > 0:
        logger.info(f" {missing_records} records have missing data")

    return result
[docs]
def get_latest_marriages(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the most recent monthly marriage registrations.

    Looks up the newest publication via get_latest_marriages_publication_url(),
    downloads its Excel file through download_file() with a 30-day cache TTL,
    and hands the file to parse_marriages_file().

    Args:
        force_refresh: Passed through to download_file(); if True, bypass cache
            and download fresh data

    Returns:
        DataFrame with columns:

        - date: datetime (first day of month)
        - year: int (year of registration)
        - month: str (month name)
        - marriages: number of marriage registrations

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_marriages()
        >>> sorted(df.columns.tolist())
        ['date', 'marriages', 'month', 'year']
        >>> df_2024 = df[df['year'] == 2024]
        >>> total_2024 = df_2024['marriages'].sum()
        >>> bool(total_2024 > 0)
        True
    """
    # 30 days, expressed in hours for download_file()
    ttl_hours = 30 * 24

    source_url, published = get_latest_marriages_publication_url()
    logger.info(f"Downloading marriages data ({published}) from: {source_url}")

    local_copy = download_file(source_url, cache_ttl_hours=ttl_hours, force_refresh=force_refresh)
    return parse_marriages_file(local_copy)
[docs]
def validate_marriages_temporal_continuity(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Check that every year in the data has a plausible number of reported months.

    For each distinct value of ``year`` the non-missing ``marriages`` values are
    counted; a year passes when that count is between 1 and 12 inclusive, so a
    partially reported (e.g. current) year is accepted.

    Args:
        df: DataFrame from parse_marriages_file or get_latest_marriages

    Returns:
        True if validation passes

    Raises:
        NISRAValidationError: If a year has no reported months or more than 12
    """
    years = df["year"]
    for year in years.unique():
        reported = df.loc[years == year, "marriages"].notna().sum()

        # A year with nothing reported at all is treated as an error
        if reported == 0:
            raise NISRAValidationError(f"Year {year} has no data")

        # More than 12 reported months points at duplicated rows
        if reported > 12:
            raise NISRAValidationError(f"Year {year} has {reported} months (expected max 12)")

    logger.info("Validation passed: Temporal continuity check")
    return True
[docs]
def get_marriages_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Select the rows of ``df`` whose ``year`` column equals ``year``.

    Args:
        df: DataFrame from get_latest_marriages()
        year: Year to filter

    Returns:
        Filtered DataFrame, re-indexed from 0 (empty if the year is absent)

    Example:
        >>> df = get_latest_marriages()
        >>> df_2024 = get_marriages_by_year(df, 2024)
        >>> total = df_2024['marriages'].sum()
        >>> bool(total > 0)
        True
    """
    in_year = df["year"] == year
    selected = df.loc[in_year]
    return selected.reset_index(drop=True)
[docs]
def get_marriages_summary_by_year(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate annual marriage totals and statistics.

    Missing (NaN) months are ignored by all three statistics, so a partially
    reported year is summarised over the months that have data.

    Args:
        df: DataFrame from get_latest_marriages()

    Returns:
        DataFrame with one row per year and columns:

        - year: int
        - total_marriages: sum for the year (same dtype as ``marriages``, i.e.
          float when the input contains missing months)
        - months_reported: int (number of months with data)
        - avg_per_month: float (average marriages per reported month, rounded
          to 1 decimal place)

    Example:
        >>> df = get_latest_marriages()
        >>> summary = get_marriages_summary_by_year(df)
        >>> sorted(summary.columns.tolist())
        ['avg_per_month', 'months_reported', 'total_marriages', 'year']
    """
    # Built-in aggregation names are used instead of lambdas: "count" counts
    # non-missing values, matching the previous notna().sum().
    summary = (
        df.groupby("year")
        .agg(
            total_marriages=("marriages", "sum"),
            months_reported=("marriages", "count"),
            avg_per_month=("marriages", "mean"),
        )
        .reset_index()
    )

    # Round average
    summary["avg_per_month"] = summary["avg_per_month"].round(1)
    return summary
# ============================================================================
# Civil Partnership Functions
# ============================================================================
[docs]
def get_latest_civil_partnerships_publication_url() -> tuple[str, str]:
    """Scrape NISRA civil partnerships page to find the latest monthly civil partnerships file.

    Steps:

    1. Fetches the civil partnerships page and takes the first link whose href
       contains "monthly-civil-partnerships" (case-insensitive).
    2. Follows that link to the publication page.
    3. Takes the first link there whose href contains "Civil" and "Partnership"
       and points at an ``.xlsx`` file.

    Hrefs are resolved against the page they were found on. The publication
    date is read from the end of the (percent-decoded) file name, e.g.
    "Monthly%20Civil%20Partnerships%20December%202025.xlsx" -> "December 2025".

    Returns:
        Tuple of (excel_file_url, publication_date); ``publication_date`` is
        "Unknown" when the file name does not end in "<Month> <Year>".

    Raises:
        NISRADataNotFoundError: If either page cannot be fetched, or if the
            publication link or the Excel file cannot be found.
    """
    from urllib.parse import unquote, urljoin

    from bs4 import BeautifulSoup

    mother_page = CIVIL_PARTNERSHIPS_BASE_URL
    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch civil partnerships page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find "Monthly Civil Partnerships" publication link
    pub_link = None
    pub_date = None
    for link in soup.find_all("a", href=True):
        if "monthly-civil-partnerships" in link["href"].lower():
            link_text = link.get_text(strip=True)
            # urljoin leaves absolute URLs untouched and resolves every relative form
            pub_link = urljoin(mother_page, link["href"])
            logger.info(f"Found Monthly Civil Partnerships publication: {link_text}")
            break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find Monthly Civil Partnerships publication")

    # Scrape the publication page for Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Find civil partnerships Excel file
    excel_url = None
    extension = ".xlsx"
    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        # Ignore any query string / fragment and letter case when checking the extension
        path_part = href.split("?", 1)[0].split("#", 1)[0]
        if "Civil" in href and "Partnership" in href and path_part.lower().endswith(extension):
            excel_url = urljoin(pub_link, href)

            # Extract date from filename if possible
            # Pattern: "Monthly Civil Partnerships December 2025.xlsx"; hrefs are
            # usually percent-encoded, so decode before looking for the separator.
            stem = unquote(path_part)[: -len(extension)]
            date_match = re.search(r"([A-Z][a-z]+)[\s_+-]+(\d{4})$", stem)
            if date_match:
                pub_date = f"{date_match.group(1)} {date_match.group(2)}"

            logger.info(f"Found civil partnerships Excel file: {excel_url}")
            break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find civil partnerships Excel file on publication page")

    return excel_url, pub_date or "Unknown"
[docs]
def parse_civil_partnerships_file(file_path: str | Path) -> pd.DataFrame:
    """Parse NISRA monthly civil partnerships Excel file.

    The civil partnerships file contains a "Civil Partnerships" sheet with a
    wide-format table:

    - Rows: Months (January-December)
    - Columns: Years (2006-present)
    - Values: Number of civil partnership registrations

    Only cells that hold a usable number are kept: empty cells and placeholders
    such as "-" produce no output row. Rows whose label is not a month name
    (e.g. "Total") and columns whose header has no 4-digit year are ignored.

    Args:
        file_path: Path to the civil partnerships Excel file

    Returns:
        DataFrame sorted by date with columns:

        - date: datetime (first day of month)
        - year: int (year of registration)
        - month: str (month name)
        - civil_partnerships: int (number of civil partnership registrations)

    Raises:
        NISRAValidationError: If the file cannot be read or contains no usable
            month/year values
    """
    file_path = Path(file_path)

    month_map = {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }

    try:
        # Read the Civil Partnerships sheet
        # Row 0: Title
        # Row 1: "This sheet contains..."
        # Row 2: "All Civil Partnerships"
        # Row 3: Headers (Month of Registration, 2006, 2007, ...)
        # Row 4+: Data (January, February, ...)
        df_raw = pd.read_excel(
            file_path,
            sheet_name="Civil Partnerships",
            header=None,
            skiprows=3,  # Skip to header row
            nrows=13,  # Read header + 12 months
        )
    except Exception as e:
        raise NISRAValidationError(f"Failed to read civil partnerships file: {e}") from e

    if df_raw.empty:
        raise NISRAValidationError("Civil partnerships sheet contains no data")

    # First row is the header; everything below it is data. Columns are handled
    # by position from here on so blank or repeated header cells cannot confuse
    # label-based lookups.
    headers = df_raw.iloc[0].tolist()
    body = df_raw.iloc[1:].reset_index(drop=True)

    # Find the month column, falling back to the first column
    month_idx = next(
        (idx for idx, label in enumerate(headers) if "Month" in str(label) or "Registration" in str(label)),
        0,
    )

    # Identify year columns as (column position, year) pairs
    year_cols = []
    for idx, label in enumerate(headers):
        if idx == month_idx:
            continue
        year_match = re.search(r"(\d{4})", str(label))
        if year_match:
            year_cols.append((idx, int(year_match.group(1))))

    # Build long-format data
    records = []
    for row_idx, raw_label in enumerate(body.iloc[:, month_idx].tolist()):
        month_name = raw_label.strip() if isinstance(raw_label, str) else raw_label
        if not isinstance(month_name, str) or month_name not in month_map:
            continue  # Total row, footnotes, blank rows

        for col_idx, year in year_cols:
            val = body.iat[row_idx, col_idx]
            if pd.isna(val):
                continue
            try:
                civil_partnerships = int(float(val))
            except (ValueError, TypeError, OverflowError):
                continue  # placeholder such as "-" rather than a count

            records.append(
                {
                    "year": year,
                    "month": month_name,
                    "month_num": month_map[month_name],
                    "civil_partnerships": civil_partnerships,
                }
            )

    if not records:
        raise NISRAValidationError("No civil partnership records found in file")

    df = pd.DataFrame(records)

    # Create datetime column
    df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})

    # Select and reorder columns, then sort chronologically
    result = df[["date", "year", "month", "civil_partnerships"]].copy()
    result = result.sort_values("date").reset_index(drop=True)

    logger.info(
        f"Parsed {len(result)} monthly civil partnership records "
        f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
    )
    return result
[docs]
def get_latest_civil_partnerships(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the most recent monthly civil partnership registrations.

    Looks up the newest publication via
    get_latest_civil_partnerships_publication_url(), downloads its Excel file
    through download_file() with a 30-day cache TTL, and hands the file to
    parse_civil_partnerships_file().

    Args:
        force_refresh: Passed through to download_file(); if True, bypass cache
            and download fresh data

    Returns:
        DataFrame with columns:

        - date: datetime (first day of month)
        - year: int (year of registration)
        - month: str (month name)
        - civil_partnerships: int (number of civil partnership registrations)

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_civil_partnerships()
        >>> sorted(df.columns.tolist())
        ['civil_partnerships', 'date', 'month', 'year']
        >>> df_2024 = df[df['year'] == 2024]
        >>> total = df_2024['civil_partnerships'].sum()
        >>> bool(total >= 0)
        True
    """
    # 30 days, expressed in hours for download_file()
    ttl_hours = 30 * 24

    source_url, published = get_latest_civil_partnerships_publication_url()
    logger.info(f"Downloading civil partnerships data ({published}) from: {source_url}")

    local_copy = download_file(source_url, cache_ttl_hours=ttl_hours, force_refresh=force_refresh)
    return parse_civil_partnerships_file(local_copy)
[docs]
def get_civil_partnerships_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Select the rows of ``df`` whose ``year`` column equals ``year``.

    Args:
        df: DataFrame from get_latest_civil_partnerships()
        year: Year to filter

    Returns:
        Filtered DataFrame, re-indexed from 0 (empty if the year is absent)
    """
    in_year = df["year"] == year
    selected = df.loc[in_year]
    return selected.reset_index(drop=True)
[docs]
def get_civil_partnerships_summary_by_year(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate annual civil partnership totals and statistics.

    Args:
        df: DataFrame from get_latest_civil_partnerships()

    Returns:
        DataFrame with one row per year and columns:

        - year: int
        - total_civil_partnerships: sum for the year (same dtype as the
          ``civil_partnerships`` column)
        - months_reported: int (number of months with a non-missing value)
        - avg_per_month: float (mean per reported month, rounded to 1 decimal
          place)
    """
    # "count" counts non-missing values, which is what notna().sum() computed,
    # without a Python-level lambda per group.
    summary = (
        df.groupby("year")
        .agg(
            total_civil_partnerships=("civil_partnerships", "sum"),
            months_reported=("civil_partnerships", "count"),
            avg_per_month=("civil_partnerships", "mean"),
        )
        .reset_index()
    )

    summary["avg_per_month"] = summary["avg_per_month"].round(1)
    return summary