Source code for bolster.data_sources.nisra.births

"""NISRA Monthly Birth Registrations Data Source.

Provides access to monthly birth registration statistics for Northern Ireland with breakdowns by:
- Sex (Persons, Male, Female)
- Event type (Registration date vs Birth/Occurrence date)

Births data are based on residence of mother at time of birth. Data includes both:
- Births by month of registration: When the birth was officially registered
- Births by month of occurrence: When the birth actually occurred

Most births are registered within 42 days in Northern Ireland, so registration data lags
occurrence data slightly.

Data Source:
    **Mother Page**: https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/births

    This page lists all births statistics publications in reverse chronological order
    (newest first). The module automatically scrapes this page to find the latest
    "Monthly Births" publication, then downloads the Excel file from that publication's
    detail page.

    The monthly files contain complete time series from 2006 to present, updated monthly.
    This ensures the module always retrieves the most recent data without hardcoding dates.

Update Frequency: Monthly (published 12th of following month at 9:30 AM)
Geographic Coverage: Northern Ireland (based on mother's residence)

Example:
    >>> from bolster.data_sources.nisra import births
    >>> # Get latest births by registration date
    >>> df = births.get_latest_births(event_type='registration')
    >>> sorted(df.columns.tolist())
    ['births', 'month', 'sex']
    >>> sorted(df['sex'].unique().tolist())
    ['Female', 'Male', 'Persons']

    >>> # Get latest births by occurrence date
    >>> df = births.get_latest_births(event_type='occurrence')
    >>> len(df) > 0
    True
"""

import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Literal

import pandas as pd
from openpyxl import load_workbook

from bolster.utils.web import session

from ._base import (
    NISRADataNotFoundError,
    NISRAValidationError,
    download_file,
    safe_int,
)


[docs]
logger = logging.getLogger(__name__)


# Base URL for births statistics

[docs]
BIRTHS_BASE_URL = "https://www.nisra.gov.uk/statistics/births-deaths-and-marriages/births"




[docs]
def get_latest_births_publication_url() -> str:
    """Scrape NISRA births mother page to find the latest monthly births file.

    Navigates the publication structure:
    1. Scrapes mother page for "Monthly Births" publication
    2. Follows link to publication detail page
    3. Finds latest Excel file

    Returns:
        URL of the latest monthly births Excel file

    Raises:
        NISRADataNotFoundError: If publication or file not found
    """
    from bs4 import BeautifulSoup

    mother_page = BIRTHS_BASE_URL

    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch births mother page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find "Monthly Births" publication link
    pub_link = None
    for link in soup.find_all("a", href=True):
        link_text = link.get_text(strip=True)
        if "Monthly Births" in link_text and "publications" in link["href"]:
            href = link["href"]
            if href.startswith("/"):
                href = f"https://www.nisra.gov.uk{href}"
            pub_link = href
            logger.info(f"Found Monthly Births publication: {link_text}")
            break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find 'Monthly Births' publication on mother page")

    # Scrape the publication page for Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    # Find Excel file link
    excel_url = None
    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        if href.endswith(".xlsx") and "Monthly" in href and "Births" in href:
            if href.startswith("/"):
                href = f"https://www.nisra.gov.uk{href}"
            excel_url = href

            # Extract month/year from filename for logging
            # Pattern: "Monthly Births November 2025.xlsx"
            match = re.search(r"Monthly\s+Births\s+([A-Za-z]+)\s+(\d{4})", href)
            if match:
                month = match.group(1)
                year = match.group(2)
                logger.info(f"Found latest births file: {month} {year}")
            break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find Excel file on publication page")

    return excel_url




[docs]
def parse_births_file(
    file_path: str | Path,
    event_type: Literal["registration", "occurrence", "both"] = "both",
) -> pd.DataFrame | dict[str, pd.DataFrame]:
    """Parse NISRA monthly births Excel file into long-format DataFrames.

    The births file contains two main sheets:
    - Births_Month of Registration: Births by when registered
    - Births_Month of Birth: Births by when occurred

    Each sheet has three stacked tables (Persons, Males, Females) in wide format
    (months as rows, years as columns). This function converts to long format.

    Args:
        file_path: Path to the births Excel file
        event_type: Which event type to parse:
            - "registration": Births by registration date only
            - "occurrence": Births by birth/occurrence date only
            - "both": Return dict with both types

    Returns:
        If event_type is "both": Dict with keys "registration" and "occurrence".
        Otherwise: Single DataFrame with columns:

        - month: datetime (first day of month)
        - sex: str (Persons, Male, Female)
        - births: int (number of births)

    Raises:
        NISRAValidationError: If file structure is unexpected
    """
    file_path = Path(file_path)
    wb = load_workbook(file_path, data_only=True)

    results = {}

    # Sheet mapping
    sheet_mapping = {
        "registration": "Births_Month of Registration",
        "occurrence": "Births_Month of Birth",
    }

    for event_key, sheet_name in sheet_mapping.items():
        if event_type != "both" and event_type != event_key:
            continue

        if sheet_name not in wb.sheetnames:
            raise NISRAValidationError(f"Expected sheet '{sheet_name}' not found in file")

        sheet = wb[sheet_name]

        # Parse the three stacked tables
        dfs = []

        # Table positions (adjusted based on actual data)
        # Persons: rows 4-17
        # Males: rows 19-32
        # Females: rows 34-47
        sex_tables = [
            ("Persons", 4, 17),
            ("Male", 19, 32),
            ("Female", 34, 47),
        ]

        for sex, header_row, end_row in sex_tables:
            df = _parse_births_table(sheet, header_row, end_row, sex, event_key)
            dfs.append(df)

        # Combine all sex breakdowns
        combined = pd.concat(dfs, ignore_index=True)

        # Sort by month and sex
        combined = combined.sort_values(["month", "sex"])

        results[event_key] = combined

    # Return based on event_type
    if event_type == "both":
        return results
    return results[event_type]



def _parse_births_table(
    sheet,
    header_row: int,
    end_row: int,
    sex: str,
    event_type: str,
) -> pd.DataFrame:
    """Parse a single births table (one sex) from the Excel sheet.

    Args:
        sheet: openpyxl worksheet
        header_row: Row number with year headers (1-indexed)
        end_row: Last row of data (1-indexed)
        sex: Sex category (Persons, Male, Female)
        event_type: registration or occurrence

    Returns:
        Long-format DataFrame with columns: month, sex, births
    """
    # Read header row to get years
    header_cells = list(sheet.iter_rows(min_row=header_row, max_row=header_row, values_only=True))[0]

    # First column is "Month of Registration" or "Month of Birth"
    # Remaining columns are years
    years = []
    for cell in header_cells[1:]:  # Skip first column
        if cell is not None:
            # Clean year (might have notes like "2025\n[Note 1]")
            year_str = str(cell).split("\n")[0].strip()
            try:
                year = int(year_str)
                years.append(year)
            except ValueError:
                break  # Stop at first non-year column

    if not years:
        raise NISRAValidationError(f"Could not parse years from header row {header_row}")

    # Parse data rows (months)
    month_names = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]

    data_rows = []

    for row in sheet.iter_rows(min_row=header_row + 1, max_row=end_row, values_only=True):
        month_label = str(row[0]).strip() if row[0] else None

        # Skip "Total" row
        if month_label == "Total":
            continue

        # Match against expected month names
        if month_label not in month_names:
            continue

        month_index = month_names.index(month_label) + 1  # 1-12

        # Extract birth counts for each year
        for i, year in enumerate(years):
            value = row[i + 1]  # +1 to skip month label column

            # Handle missing data (dash or None)
            if value == "-" or value is None:
                continue

            births = safe_int(value)

            if births is None:
                continue

            # Create month datetime (first day of month)
            try:
                month_date = datetime(year, month_index, 1)
            except ValueError:
                continue

            data_rows.append(
                {
                    "month": month_date,
                    "sex": sex,
                    "births": births,
                }
            )

    return pd.DataFrame(data_rows)



[docs]
def get_latest_births(
    event_type: Literal["registration", "occurrence", "both"] = "both",
    force_refresh: bool = False,
) -> pd.DataFrame | dict[str, pd.DataFrame]:
    """Get the latest monthly births data.

    Automatically discovers and downloads the most recent monthly births publication
    from the NISRA website.

    Args:
        event_type: Which event type to retrieve:
            - "registration": Births by month registered
            - "occurrence": Births by month of birth (occurrence)
            - "both": Return dict with both types
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        If event_type is "both": Dict with keys "registration" and "occurrence".
        Otherwise: Single DataFrame with columns:

        - month: datetime (first day of month)
        - sex: str (Persons, Male, Female)
        - births: int

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> # Get births by registration date
        >>> df = get_latest_births(event_type='registration')
        >>> sorted(df.columns.tolist())
        ['births', 'month', 'sex']

        >>> # Get both types
        >>> data = get_latest_births(event_type='both')
        >>> sorted(data.keys())
        ['occurrence', 'registration']
    """
    # Discover latest publication
    excel_url = get_latest_births_publication_url()

    logger.info(f"Downloading latest births data from: {excel_url}")

    # Cache for 30 days (monthly data)
    cache_ttl_hours = 30 * 24
    file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)

    # Parse the file
    return parse_births_file(file_path, event_type=event_type)




[docs]
def validate_births_totals(df: pd.DataFrame) -> bool:
    """Validate that Male + Female births equal Persons births for each month.

    Args:
        df: DataFrame from parse_births_file or get_latest_births

    Returns:
        True if validation passes

    Raises:
        NISRAValidationError: If validation fails
    """
    months = df["month"].unique()

    for month in months:
        month_data = df[df["month"] == month]

        persons = month_data[month_data["sex"] == "Persons"]["births"].sum()
        male = month_data[month_data["sex"] == "Male"]["births"].sum()
        female = month_data[month_data["sex"] == "Female"]["births"].sum()

        if persons != male + female:
            raise NISRAValidationError(
                f"Month {month.date()}: Persons ({persons}) != Male ({male}) + Female ({female})"
            )

    logger.info(f"Validation passed: Male + Female = Persons for {len(months)} months")
    return True