# Source code for bolster.data_sources.nisra.population

"""NISRA Mid-Year Population Estimates Data Source.

Provides access to mid-year population estimates for Northern Ireland with breakdowns by:
- Geography (Northern Ireland, Parliamentary Constituencies, Health and Social Care Trusts)
- Sex (All persons, Males, Females)
- Age (5-year age bands: 00-04, 05-09, ..., 85-89, 90+)
- Year (1971-present for NI overall, 2021-present for sub-geographies)

Mid-year estimates are referenced to June 30th of each year.

Data Source:
    **Mother Page**: https://www.nisra.gov.uk/statistics/population/mid-year-population-estimates

    This page lists the mid-year population estimates publications in reverse chronological
    order (newest first). The module automatically scrapes this page to find the latest
    "Mid-Year Population Estimates for Northern Ireland" publication, then downloads
    the age bands Excel file from that publication's detail page.

    The files contain complete time series data in a pre-processed "Flat" format, making
    this one of the most analysis-ready NISRA datasets.

Update Frequency: Annual (published ~6 months after reference date)
Geographic Coverage: Northern Ireland
Reference Date: June 30th of each year

Example:
    >>> from bolster.data_sources.nisra import population
    >>> # Get latest population estimates for all geographies
    >>> df = population.get_latest_population()
    >>> 'population' in df.columns
    True

    >>> # Get only Northern Ireland overall
    >>> ni_df = population.get_latest_population(area='Northern Ireland')
    >>> len(ni_df) > 0
    True
"""

import logging
import re
from pathlib import Path
from typing import Literal

import pandas as pd

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file


[docs]
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# Base URL for population statistics: the NISRA page that
# get_latest_population_publication_url() fetches and scans for the newest
# "Mid-Year Population Estimates" publication link.

[docs]
POPULATION_BASE_URL = "https://www.nisra.gov.uk/statistics/population/mid-year-population-estimates"




[docs]
def get_latest_population_publication_url() -> tuple[str, int]:
    """Scrape NISRA population mother page to find the latest MYE age bands file.

    Navigates the publication structure:
    1. Fetches ``POPULATION_BASE_URL`` and takes the first link whose text matches
       "<year> Mid-Year Population Estimates for Northern Ireland" and whose href
       contains "publications"
    2. Follows that link to the publication detail page
    3. Takes the first link whose text contains "population by sex and age bands"
       and whose URL path ends in ".xlsx" (case-insensitive)

    Every href is resolved against the URL of the page it was found on, so
    root-relative, page-relative and protocol-relative links all become absolute
    URLs before they are requested or returned.

    Returns:
        Tuple of (excel_file_url, year)

    Raises:
        NISRADataNotFoundError: If either page cannot be fetched, or the
            publication link or Excel file link is not found
    """
    from urllib.parse import urljoin, urlparse

    from bs4 import BeautifulSoup

    mother_page = POPULATION_BASE_URL

    try:
        response = session.get(mother_page, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch population mother page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Title format changed in 2024 (dropped the "Small Geographical Areas" suffix),
    # so only the common leading part of the title is matched.
    title_pattern = re.compile(
        r"(\d{4})\s+Mid-Year Population Estimates for Northern Ireland",
        re.IGNORECASE,
    )

    pub_link = None
    pub_year = None

    for link in soup.find_all("a", href=True):
        href = link["href"]
        match = title_pattern.search(link.get_text(strip=True))

        if match and "publications" in href:
            # Take first match (should be newest due to reverse chronological order)
            pub_year = int(match.group(1))
            pub_link = urljoin(mother_page, href)
            logger.info("Found %s Mid-Year Population Estimates publication", pub_year)
            break

    if not pub_link:
        raise NISRADataNotFoundError("Could not find Mid-Year Population Estimates publication on mother page")

    # Scrape the publication page for age bands Excel file
    try:
        pub_response = session.get(pub_link, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    excel_url = None

    for link in pub_soup.find_all("a", href=True):
        href = link["href"]
        link_text = link.get_text(strip=True).lower()

        # Match on link text rather than filename — NISRA filename conventions vary by year
        # (e.g. MYE22-AGE-BANDS.xlsx vs MYE23_AGE_BANDS_NI_LGD.xlsx).
        # The extension is checked on the URL path only, case-insensitively, so a
        # query string/fragment or an upper-case ".XLSX" does not hide the file.
        is_xlsx = urlparse(href).path.lower().endswith(".xlsx")
        if "population by sex and age bands" in link_text and is_xlsx:
            excel_url = urljoin(pub_link, href)
            logger.info("Found age bands file for %s", pub_year)
            break

    if not excel_url:
        raise NISRADataNotFoundError("Could not find age bands Excel file on publication page")

    return excel_url, pub_year




[docs]
def parse_population_file(
    file_path: str | Path,
    area: Literal[
        "all",
        "Northern Ireland",
        "Parliamentary Constituencies (2024)",
        "Health and Social Care Trusts",
        "Parliamentary Constituencies (2008)",
    ]
    | None = "all",
) -> pd.DataFrame:
    """Load the "Flat" sheet of a NISRA mid-year population workbook.

    The sheet is read as-is, checked for the expected columns, its ``MYE``
    column is renamed to ``population``, and the rows are optionally narrowed
    to a single geographic grouping before being sorted.

    Args:
        file_path: Path to the population Excel file
        area: Geographic grouping to keep. ``"all"`` (or any falsy value such as
            ``None``) keeps every row; otherwise one of:
            - "Northern Ireland"
            - "Parliamentary Constituencies (2024)"
            - "Health and Social Care Trusts"
            - "Parliamentary Constituencies (2008)"

    Returns:
        DataFrame sorted by area, year, sex and age_5 with a fresh 0..n-1 index.
        It contains at least these columns:
            - area, area_code, area_name
            - year
            - sex
            - age_5, age_band, age_broad
            - population (renamed from ``MYE``)

    Raises:
        NISRAValidationError: If the workbook cannot be read, an expected column
            is missing, or the requested area has no rows
        ValueError: If ``area`` is not one of the supported names
    """
    workbook = Path(file_path)

    try:
        flat = pd.read_excel(workbook, sheet_name="Flat")
    except Exception as e:
        raise NISRAValidationError(f"Failed to read population file: {e}") from e

    # Every one of these columns must be present in the sheet.
    expected_cols = {"area", "area_code", "area_name", "year", "sex", "age_5", "age_band", "age_broad", "MYE"}
    missing = expected_cols - set(flat.columns)
    if missing:
        raise NISRAValidationError(f"Missing expected columns: {missing}")

    flat = flat.rename(columns={"MYE": "population"})

    if area and area != "all":
        # The sheet's "area" column carries numbered labels; translate the
        # caller-facing name into the label used in the file.
        sheet_labels = {
            "Northern Ireland": "1. Northern Ireland",
            "Parliamentary Constituencies (2024)": "2. Parliamentary Constituencies (2024)",
            "Health and Social Care Trusts": "3. Health and Social Care Trusts",
            "Parliamentary Constituencies (2008)": "4. Parliamentary Constituencies (2008)",
        }

        label = sheet_labels.get(area)
        if not label:
            raise ValueError(f"Invalid area: {area}. Choose from: {list(sheet_labels.keys())}")

        matches_area = flat["area"] == label
        flat = flat[matches_area].copy()

        if flat.empty:
            raise NISRAValidationError(f"No data found for area: {area}")

    ordered = flat.sort_values(["area", "year", "sex", "age_5"])
    return ordered.reset_index(drop=True)




[docs]
def get_latest_population(
    area: Literal[
        "all",
        "Northern Ireland",
        "Parliamentary Constituencies (2024)",
        "Health and Social Care Trusts",
        "Parliamentary Constituencies (2008)",
    ]
    | None = "all",
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Discover, download and parse the newest mid-year population estimates.

    Chains three steps: ``get_latest_population_publication_url()`` locates the
    Excel file, ``download_file()`` fetches it (with a 180-day cache TTL), and
    ``parse_population_file()`` turns it into a DataFrame.

    Args:
        area: Which geographic area(s) to return (default: "all"); passed
            through to ``parse_population_file``
        force_refresh: Passed through to ``download_file``; if True, bypass
            cache and download fresh data

    Returns:
        DataFrame as produced by ``parse_population_file``, with columns:
            - area, area_code, area_name: Geographic identifiers
            - year: Reference year
            - sex: "All persons", "Males", or "Females"
            - age_5: 5-year age band
            - age_band, age_broad: Alternative age groupings
            - population: Mid-year estimate

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_population()
        >>> 'population' in df.columns
        True

        >>> ni_df = get_latest_population(area='Northern Ireland')
        >>> sorted(ni_df.columns.tolist())
        ['age_5', 'age_band', 'age_broad', 'area', 'area_code', 'area_name', 'population', 'sex', 'year']
    """
    excel_url, year = get_latest_population_publication_url()
    logger.info(f"Downloading {year} mid-year population estimates from: {excel_url}")

    # Annual data with infrequent updates: keep the cached copy for 180 days.
    days_cached = 180
    local_copy = download_file(
        excel_url,
        cache_ttl_hours=days_cached * 24,
        force_refresh=force_refresh,
    )

    return parse_population_file(local_copy, area=area)




[docs]
def validate_population_totals(df: pd.DataFrame) -> bool:
    """Check that "Males" plus "Females" equals "All persons" in every group.

    Rows are grouped by (area_name, year, age_5). Within each group the
    ``population`` values are summed per sex and the "All persons" total is
    compared with the combined "Males" and "Females" totals.

    Args:
        df: DataFrame from parse_population_file or get_latest_population

    Returns:
        True if every group is consistent

    Raises:
        NISRAValidationError: On the first group whose totals disagree
    """
    grouped = df.groupby(["area_name", "year", "age_5"])

    for key, rows in grouped:
        area_name, year, age_band = key
        sex_labels = rows["sex"]
        counts = rows["population"]

        all_persons = counts[sex_labels == "All persons"].sum()
        males = counts[sex_labels == "Males"].sum()
        females = counts[sex_labels == "Females"].sum()

        if all_persons != males + females:
            raise NISRAValidationError(
                f"{area_name} {year} {age_band}: All persons ({all_persons}) != Males ({males}) + Females ({females})"
            )

    logger.info(f"Validation passed: Males + Females = All persons for {len(grouped)} groups")
    return True




[docs]
def get_population_by_year(
    df: pd.DataFrame,
    year: int,
    sex: Literal["All persons", "Males", "Females"] | None = "All persons",
) -> pd.DataFrame:
    """Filter population data for a specific year and optional sex.

    Args:
        df: DataFrame from get_latest_population()
        year: Year to filter
        sex: Sex category to filter (default: "All persons")

    Returns:
        Filtered DataFrame

    Example:
        >>> df = get_latest_population(area='Northern Ireland')
        >>> pop_2024 = get_population_by_year(df, 2024)
        >>> total = pop_2024['population'].sum()
        >>> bool(total > 0)
        True
    """
    filtered = df[df["year"] == year].copy()

    if sex:
        filtered = filtered[filtered["sex"] == sex]

    return filtered.reset_index(drop=True)




[docs]
def get_population_pyramid_data(
    df: pd.DataFrame,
    year: int,
    area_name: str | None = "NORTHERN IRELAND",
) -> pd.DataFrame:
    """Prepare data for population pyramid visualization.

    Returns males and females by age band for a specific year and area,
    formatted for easy pyramid plotting.

    Args:
        df: DataFrame from get_latest_population()
        year: Year to visualize
        area_name: Area name to filter (default: "NORTHERN IRELAND")

    Returns:
        DataFrame with columns:
            - age_5: Age band
            - males: Male population (positive values)
            - females: Female population (negative values for pyramid)

    Example:
        >>> df = get_latest_population(area='Northern Ireland')
        >>> pyramid = get_population_pyramid_data(df, 2024)
        >>> sorted(pyramid.columns.tolist())
        ['age_5', 'females', 'males']
    """
    filtered = df[(df["year"] == year) & (df["area_name"] == area_name)].copy()

    # Get males and females separately and aggregate by age_5
    # (file has multiple rows per age_5 due to different age_band groupings)
    males = filtered[filtered["sex"] == "Males"].groupby("age_5")["population"].sum().reset_index()
    males = males.rename(columns={"population": "males"})

    females = filtered[filtered["sex"] == "Females"].groupby("age_5")["population"].sum().reset_index()
    females = females.rename(columns={"population": "females"})
    # Make females negative for pyramid visualization
    females["females"] = -females["females"]

    # Merge
    pyramid = males.merge(females, on="age_5", how="outer")

    # Sort by age band
    return pyramid.sort_values("age_5").reset_index(drop=True)