Source code for bolster.data_sources.nisra.baby_names

"""NISRA Baby Names Northern Ireland Data Source.

Provides access to baby name statistics for Northern Ireland from the Northern Ireland
Statistics and Research Agency (NISRA), including:
- Full historical list of all first forenames given to babies registered in NI (1997–present)
- Annual rank and count for every name, by sex (Boys/Girls)

The module uses the Full Name List file which contains all registered names with their
rank and count for each year from 1997 to the most recent publication year.

Data Source:
    **Statistics Page**: https://www.nisra.gov.uk/statistics/births/baby-names

    The statistics page lists all Baby Names publications in reverse chronological order
    (newest first). The module automatically scrapes this page to find the latest
    Baby Names publication, then downloads the Full Names List Excel file from that
    publication's detail page.

    The full names list files contain complete time series from 1997 to the most recent
    year, updated annually in April.

Update Frequency: Annual (published April each year)
Geographic Coverage: Northern Ireland (births registered in NI)

Example:
    >>> from bolster.data_sources.nisra import baby_names
    >>> df = baby_names.get_baby_names()
    >>> sorted(df.columns.tolist())
    ['count', 'name', 'rank', 'sex', 'year']
    >>> sorted(df['sex'].unique().tolist())
    ['Boys', 'Girls']
    >>> df['year'].min() >= 1997
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook

from bolster.utils.web import session

from ._base import (
    NISRADataNotFoundError,
    NISRAValidationError,
    download_file,
    safe_int,
)

[docs] logger = logging.getLogger(__name__)
# Statistics page listing all baby names publications
[docs] BABY_NAMES_STATS_URL = "https://www.nisra.gov.uk/statistics/births/baby-names"
[docs] NISRA_BASE_URL = "https://www.nisra.gov.uk"
# Header row in the Full Name List Tables (1-indexed) _FULL_LIST_HEADER_ROW = 6 # First data row (1-indexed) _FULL_LIST_DATA_START_ROW = 7 # Columns per year block (Name, Number of Babies, Rank) _COLS_PER_YEAR = 3
[docs] def get_baby_names_publication_url() -> str: """Scrape NISRA to find the latest Baby Names Full Name List Excel URL. Navigates the publication structure: 1. Scrapes the baby names statistics page for the latest publication link 2. Follows link to the publication detail page 3. Finds the Full Names List Excel file link Returns: URL of the latest Full Name List Excel file Raises: NISRADataNotFoundError: If publication or file not found """ try: response = session.get(BABY_NAMES_STATS_URL, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch baby names statistics page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find the most recent "Baby Names YYYY" publication link (first match = latest) pub_link = None pub_year = None for link in soup.find_all("a", href=True): href = link["href"] text = link.get_text(strip=True) # Match links like /publications/baby-names-2025 if re.search(r"/publications/baby-names-\d{4}", href): # Extract year from href m = re.search(r"baby-names-(\d{4})", href) year = int(m.group(1)) if m else 0 if pub_year is None or year > pub_year: pub_year = year pub_link = href logger.info(f"Found Baby Names publication: {text.strip()!r} ({href})") if not pub_link: raise NISRADataNotFoundError("Could not find Baby Names publication link on statistics page") # Make absolute URL if pub_link.startswith("/"): pub_link = f"{NISRA_BASE_URL}{pub_link}" # Fetch the publication detail page try: pub_response = session.get(pub_link, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch publication page {pub_link}: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Find the "Full Names List" Excel file link # Link text is like "Full Names List, 1997 to 2025 TablesMicrosoft Excel ..." excel_url = None for link in pub_soup.find_all("a", href=True): href = link["href"] text = link.get_text(strip=True) if href.lower().endswith(".xlsx") and "full" in text.lower() and "name" in text.lower(): excel_url = href logger.info(f"Found Full Name List file: {text!r} -> {href}") break # Fallback: any xlsx link with "Full_Name_List" in the path if not excel_url: for link in pub_soup.find_all("a", href=True): href = link["href"] if href.lower().endswith(".xlsx") and "full_name_list" in href.lower(): excel_url = href logger.info(f"Found Full Name List file (fallback): {href}") break if not excel_url: raise NISRADataNotFoundError(f"Could not find Full Name List Excel file on publication page: {pub_link}") # Make absolute URL if excel_url.startswith("/"): excel_url = f"{NISRA_BASE_URL}{excel_url}" return excel_url
[docs] def parse_baby_names_file(file_path: str | Path) -> pd.DataFrame: """Parse NISRA Full Name List Excel file into long-format DataFrame. The Full Name List Excel file contains two sheets: - Table 1: Boys' names (1997 to present), wide format with 3 columns per year (Name, Number of Babies, Rank), 29+ year blocks across the row - Table 2: Girls' names, same structure as Table 1 Names with suppressed counts (shown as '..') are excluded (names with fewer than 3 occurrences are suppressed for disclosure control). Args: file_path: Path to the Full Name List Excel file Returns: Long-format DataFrame with columns: - year: int — registration year - name: str — first forename (title case as registered) - sex: str — "Boys" or "Girls" - rank: int — rank within that sex and year (1 = most popular) - count: int — number of babies registered with that name Raises: NISRAValidationError: If the file structure is unexpected or no data parsed """ file_path = Path(file_path) wb = load_workbook(file_path, data_only=True, read_only=True) sheet_sex_map = { "Table 1": "Boys", "Table 2": "Girls", } all_records: list[dict] = [] for sheet_name, sex in sheet_sex_map.items(): if sheet_name not in wb.sheetnames: raise NISRAValidationError(f"Expected sheet '{sheet_name}' not found. Available: {wb.sheetnames}") ws = wb[sheet_name] records = _parse_full_name_list_sheet(ws, sex) all_records.extend(records) logger.info(f"Parsed {len(records)} records from {sheet_name} ({sex})") if not all_records: raise NISRAValidationError("No records parsed from file — check file structure") df = pd.DataFrame(all_records) df["year"] = df["year"].astype(int) df["rank"] = df["rank"].astype(int) df["count"] = df["count"].astype(int) df = df.sort_values(["year", "sex", "rank"]).reset_index(drop=True) logger.info(f"Parsed {len(df)} total records covering {df['year'].min()}{df['year'].max()}") return df
def _parse_full_name_list_sheet(ws, sex: str) -> list[dict]: """Parse a single sex table (Table 1 or Table 2) from the Full Name List workbook. The sheet layout is: - Rows 1–5: Cover notes / metadata - Row 6: Headers — "YYYY Name", "Number of Babies", "Rank", repeated for each year - Row 7+: Data — one name per row position, repeated for each year block Each year occupies exactly 3 columns. Years run left-to-right in chronological order. Names within each year block are sorted by rank (ascending). Args: ws: openpyxl worksheet (read_only) sex: "Boys" or "Girls" Returns: List of record dicts with keys: year, name, sex, rank, count """ # Read header row to extract years headers = list(ws.iter_rows(min_row=_FULL_LIST_HEADER_ROW, max_row=_FULL_LIST_HEADER_ROW, values_only=True))[0] # Build list of (col_index, year) for each year block year_cols: list[tuple[int, int]] = [] for col_idx in range(0, len(headers), _COLS_PER_YEAR): cell_val = headers[col_idx] if cell_val is None: break # Header format: "1997 Name", "1998 Name", etc. m = re.match(r"(\d{4})\s+Name", str(cell_val).strip()) if m: year = int(m.group(1)) year_cols.append((col_idx, year)) if not year_cols: raise NISRAValidationError(f"Could not parse year columns from header row in sheet for {sex}") logger.debug(f"{sex}: Found {len(year_cols)} year columns, {year_cols[0][1]}{year_cols[-1][1]}") records: list[dict] = [] # Read all data rows at once for row in ws.iter_rows(min_row=_FULL_LIST_DATA_START_ROW, values_only=True): # Skip completely empty rows if all(v is None for v in row): continue for col_idx, year in year_cols: # Each year block: [Name, Number of Babies, Rank] if col_idx + 2 >= len(row): continue name = row[col_idx] count_val = row[col_idx + 1] rank_val = row[col_idx + 2] # Skip suppressed ('..' placeholder) or missing values if name is None or str(name).strip() in ("", "..", "-", "Contents"): continue if count_val is None or str(count_val).strip() in ("", "..", "-"): continue if rank_val is None or str(rank_val).strip() in ("", "..", "-"): continue count = safe_int(count_val) rank = safe_int(rank_val) if count is None or rank is None: continue records.append( { "year": year, "name": str(name).strip(), "sex": sex, "rank": rank, "count": count, } ) return records
[docs] def get_baby_names(force_refresh: bool = False) -> pd.DataFrame: """Get the full historical Baby Names series for Northern Ireland (1997–present). Automatically discovers and downloads the most recent Full Name List publication from the NISRA website, which contains the complete historical series from 1997. Args: force_refresh: If True, bypass cache and download fresh data Returns: Long-format DataFrame with columns: - year: int — registration year (1997–present) - name: str — first forename as registered - sex: str — "Boys" or "Girls" - rank: int — rank within sex and year (1 = most popular) - count: int — number of babies with that name Raises: NISRADataNotFoundError: If the latest publication cannot be found NISRAValidationError: If the file structure is unexpected Example: >>> df = get_baby_names() >>> sorted(df.columns.tolist()) ['count', 'name', 'rank', 'sex', 'year'] >>> df['year'].min() >= 1997 True >>> sorted(df['sex'].unique().tolist()) ['Boys', 'Girls'] >>> df[df['year'] == df['year'].max()].nsmallest(1, 'rank')['name'].iloc[0] is not None True """ excel_url = get_baby_names_publication_url() logger.info(f"Downloading baby names data from: {excel_url}") # Cache for 180 days — published once per year in April file_path = download_file(excel_url, cache_ttl_hours=180 * 24, force_refresh=force_refresh) return parse_baby_names_file(file_path)
[docs] def validate_baby_names(df: pd.DataFrame) -> bool: """Validate a baby names DataFrame for structural and data integrity. Checks: - Required columns are present - No null values in any column - Both sexes present ("Boys" and "Girls") - Year range starts at or before 1999 (data should go back to 1997) - Rank starts at 1 for at least one year/sex combination - All counts are positive (> 0) - No negative ranks or counts Args: df: DataFrame to validate (from parse_baby_names_file or get_baby_names) Returns: True if validation passes Raises: NISRAValidationError: If any validation check fails, with descriptive message Example: >>> import pandas as pd >>> valid_df = pd.DataFrame({ ... 'year': [2020, 2020], 'name': ['Noah', 'Jack'], ... 'sex': ['Boys', 'Boys'], 'rank': [1, 2], 'count': [100, 90] ... }) >>> validate_baby_names(valid_df) True """ required_columns = {"year", "name", "sex", "rank", "count"} # Check empty DataFrame if df is None or df.empty: raise NISRAValidationError("DataFrame is empty") # Check required columns missing_cols = required_columns - set(df.columns) if missing_cols: raise NISRAValidationError(f"Missing required columns: {missing_cols}") # Check for null values null_counts = df[list(required_columns)].isnull().sum() cols_with_nulls = null_counts[null_counts > 0] if not cols_with_nulls.empty: raise NISRAValidationError(f"Columns with null values: {cols_with_nulls.to_dict()}") # Check both sexes present actual_sexes = set(df["sex"].unique()) if "Boys" not in actual_sexes: raise NISRAValidationError(f"Missing 'Boys' sex category. Found: {actual_sexes}") if "Girls" not in actual_sexes: raise NISRAValidationError(f"Missing 'Girls' sex category. Found: {actual_sexes}") # Check year range min_year = df["year"].min() if min_year > 1999: raise NISRAValidationError( f"Year range starts at {min_year} — expected data back to at least 1999 (ideally 1997)" ) # Check no non-positive counts (before checking rank range) bad_counts = (df["count"] <= 0).sum() if bad_counts > 0: raise NISRAValidationError(f"{bad_counts} rows have non-positive count values") # Check no non-positive ranks (before checking min_rank == 1) bad_ranks = (df["rank"] <= 0).sum() if bad_ranks > 0: raise NISRAValidationError(f"{bad_ranks} rows have non-positive rank values") # Check rank starts at 1 min_rank = df["rank"].min() if min_rank != 1: raise NISRAValidationError(f"Minimum rank is {min_rank} — expected 1") logger.info( f"Validation passed: {len(df):,} records, years {df['year'].min()}{df['year'].max()}, " f"sexes: {sorted(actual_sexes)}" ) return True