Source code for bolster.data_sources.nisra.population_projections

"""NISRA Population Projections for Northern Ireland.

Provides access to official NISRA population projections with demographic breakdowns
by year, age, sex, and projection variant.

NI-level projections (2024-based, 2024-2074) are served via the PxStat API.
LGD sub-area projections are not yet available via PxStat and remain Excel-based.

Data Source:
    **PxStat API** (NI-level, used by this module):
        https://ws-data.nisra.gov.uk/public/api.restful/PxStat.Data.Cube_API.ReadDataset/{MATRIX}/CSV/1.0/en

    Matrix codes:
        - ``PPMY02T01``: NI projections by single year of age (0-90+) and sex — principal + variants
        - ``PPMY02T02``: NI projections by 5-year age bands and sex — principal only
        - ``PPMY02T03``: Variant projections (high/low fertility, life expectancy, migration)

    **Original publication pages** (for reference and LGD projections):
        - Principal: https://www.nisra.gov.uk/publications/2024-based-population-projections-northern-ireland
        - Variants: https://www.nisra.gov.uk/publications/2024-based-population-projections-northern-ireland-variant-projections
        - LGD sub-areas: https://www.nisra.gov.uk/publications/2022-based-population-projections-areas-within-northern-ireland

Update Frequency: Biennial (NI-level)
Geographic Coverage: Northern Ireland overall (LGD projections not yet in PxStat)
Projection Horizon: 2024-2074 (NI-level via API)

Example:
    >>> from bolster.data_sources.nisra import population_projections
    >>> df = population_projections.get_latest_projections()
    >>> 'population' in df.columns
    True
    >>> df_decade = population_projections.get_latest_projections(
    ...     start_year=2025,
    ...     end_year=2035
    ... )
    >>> len(df_decade) > 0
    True
"""

import logging

import pandas as pd

from ._base import NISRAValidationError
from .pxstat import PxStatError, read_dataset  # noqa: F401 — re-exported for callers


[docs]
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


# PxStat matrix codes
# Each code is passed to ``read_dataset`` to select which dataset is fetched.
# ``get_latest_projections`` uses _MATRIX_SYA when ``age_groups == "single"``
# and _MATRIX_5YR otherwise; ``get_variant_projections`` uses _MATRIX_VARIANTS.
_MATRIX_SYA = "PPMY02T01"  # single year of age, principal projection
_MATRIX_5YR = "PPMY02T02"  # 5-year age bands, principal projection
_MATRIX_VARIANTS = "PPMY02T03"  # variant projections



[docs]
def get_latest_projections(
    start_year: int | None = None,
    end_year: int | None = None,
    age_groups: str = "5yr",
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Retrieve NI population projections (principal projection).

    Fetches the matching PxStat matrix, renames its columns to the module's
    snake_case schema, tags every row with the projection base year and
    optionally restricts the result to an inclusive range of years.

    Args:
        start_year: First projection year to include (default: first available).
            ``None`` disables the lower bound.
        end_year: Last projection year to include (default: last available).
            ``None`` disables the upper bound.
        age_groups: Age breakdown format:
            - ``'5yr'``: 5-year age bands (default) — smaller result set
            - ``'single'``: Single year of age (0-90+) — larger result set
            Any other value falls back to ``'5yr'`` and logs a warning.
        force_refresh: Ignored — kept for API compatibility. The PxStat API
            always returns current data.

    Returns:
        DataFrame with columns:
            ``year``, ``age_group``, ``sex``, ``population``, ``base_year``,
            sorted by ``year``, ``age_group`` and ``sex`` with a fresh index.

    Raises:
        NISRAValidationError: If no rows remain after filtering (including the
            case where the API returned no rows at all).
        PxStatError: If the API request fails.

    Example:
        >>> df = get_latest_projections()
        >>> 'population' in df.columns
        True
    """
    if force_refresh:
        logger.debug("force_refresh is ignored for PxStat-backed modules")

    # Surface typos such as "5year" instead of silently returning 5-year bands.
    if age_groups not in ("5yr", "single"):
        logger.warning("Unrecognised age_groups=%r; falling back to '5yr'", age_groups)

    if age_groups == "single":
        matrix, age_col = _MATRIX_SYA, "Single year of age"
    else:
        matrix, age_col = _MATRIX_5YR, "Five year age bands"

    df = read_dataset(matrix)

    # ``rename`` returns a new frame, so adding ``base_year`` below does not
    # touch the frame returned by ``read_dataset``.
    result = df[["Year", age_col, "Sex Label", "VALUE"]].rename(
        columns={"Year": "year", age_col: "age_group", "Sex Label": "sex", "VALUE": "population"}
    )
    result["base_year"] = 2024  # the projections served by this module are 2024-based

    # Compare against None rather than relying on truthiness, so that every
    # explicitly supplied bound (including 0) is honoured.
    if start_year is not None:
        result = result[result["year"] >= start_year]
    if end_year is not None:
        result = result[result["year"] <= end_year]

    if result.empty:
        raise NISRAValidationError("Population projections data is empty")

    return result.sort_values(["year", "age_group", "sex"]).reset_index(drop=True)




[docs]
def get_variant_projections(
    variant: str | None = None,
    start_year: int | None = None,
    end_year: int | None = None,
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Retrieve NI population projections including variant scenarios.

    Fetches the variant-projection matrix from PxStat, renames its columns to
    the module's snake_case schema and applies the optional variant and year
    filters.

    Args:
        variant: Filter to a specific variant label (partial match, case-insensitive).
            E.g. ``'high fertility'``, ``'low fertility'``, ``'high life expectancy'``.
            The text is matched literally as a substring, not as a regular
            expression. If None (or empty), all variants are returned.
        start_year: First projection year to include. ``None`` disables the
            lower bound.
        end_year: Last projection year to include. ``None`` disables the
            upper bound.
        force_refresh: Ignored — kept for API compatibility.

    Returns:
        DataFrame with columns:
            ``year``, ``age_group``, ``sex``, ``variant``, ``population``,
            sorted by ``variant``, ``year``, ``age_group`` and ``sex`` with a
            fresh index. The frame may be empty if no rows match the filters.
    """
    if force_refresh:
        logger.debug("force_refresh is ignored for PxStat-backed modules")

    df = read_dataset(_MATRIX_VARIANTS)

    result = df[["Year", "Single year of age", "Sex Label", "Variant Label", "VALUE"]].rename(
        columns={
            "Year": "year",
            "Single year of age": "age_group",
            "Sex Label": "sex",
            "Variant Label": "variant",
            "VALUE": "population",
        }
    )

    if variant:
        labels = result["variant"].str.lower()
        # regex=False: the documented contract is a plain partial match, so
        # characters such as "(" or "+" in the search text must be literal.
        # na=False: rows with a missing label are excluded instead of
        # producing an NA mask that cannot be used for boolean indexing.
        result = result[labels.str.contains(variant.lower(), regex=False, na=False)]

    # Compare against None rather than relying on truthiness, so that every
    # explicitly supplied bound (including 0) is honoured.
    if start_year is not None:
        result = result[result["year"] >= start_year]
    if end_year is not None:
        result = result[result["year"] <= end_year]

    return result.sort_values(["variant", "year", "age_group", "sex"]).reset_index(drop=True)




[docs]
def validate_projections(df: pd.DataFrame) -> bool:
    """Run basic integrity checks on a projections DataFrame.

    The checks run in order and stop at the first failure: required columns
    are present, the frame has rows, and no ``population`` value is below zero.

    Args:
        df: DataFrame from :func:`get_latest_projections`.

    Returns:
        True if every check passes.

    Raises:
        NISRAValidationError: If a required column is missing, the frame is
            empty, or a negative population value is present.
    """
    expected_columns = {"year", "age_group", "sex", "population"}
    absent = expected_columns - set(df.columns)

    # The column check comes first so the later ``df["population"]`` lookup is safe.
    if absent:
        raise NISRAValidationError(f"Missing required columns: {absent}")

    if df.empty:
        raise NISRAValidationError("DataFrame is empty")

    has_negative = df["population"].lt(0).any()
    if has_negative:
        raise NISRAValidationError("Negative population values found")

    return True