Source code for bolster.data_sources.nisra.wellbeing

"""NISRA Individual Wellbeing Module.

This module provides access to Northern Ireland's individual wellbeing statistics,
measuring subjective wellbeing across the population aged 16 and over.

The report covers four main areas of individual wellbeing:
- Personal Wellbeing (ONS4 measures): Life Satisfaction, Worthwhile, Happiness, Anxiety
- Loneliness: Frequency of feeling lonely
- Self-efficacy: Belief in one's capabilities
- Locus of Control: Perceived control over life events

Data Source: Northern Ireland Statistics and Research Agency provides individual wellbeing
statistics through their Wellbeing section at https://www.nisra.gov.uk/statistics/wellbeing/individual-wellbeing-northern-ireland.
The data measures subjective wellbeing using the ONS4 personal wellbeing questions alongside
additional measures of loneliness, self-efficacy, and locus of control for adults in Northern Ireland.

Update Frequency: Annual publications released in January each year, covering the financial
year period from April to March. The wellbeing statistics provide the official measure of
subjective wellbeing for Northern Ireland, with data updated once per year as part of NISRA's
social statistics programme.

Data Coverage:
    - Personal Wellbeing (ONS4): 2014/15 - Present (annual, mean scores 0-10)
    - Loneliness: 2017/18 - Present (annual, proportions)
    - Self-efficacy: 2014/15 - Present (annual, mean scores 5-25)
    - Locus of Control: Available in recent years

Demographics available:
    - Sex, Age Group, Marital Status, Sexual Orientation
    - Religion, Dependant status, Health status, Employment status

Examples:
    >>> from bolster.data_sources.nisra import wellbeing
    >>> df = wellbeing.get_latest_personal_wellbeing()
    >>> 'life_satisfaction' in df.columns
    True
    >>> df_lonely = wellbeing.get_latest_loneliness()
    >>> 'lonely_some_of_time' in df_lonely.columns
    True
    >>> summary = wellbeing.get_wellbeing_summary()
    >>> 'life_satisfaction' in summary.columns
    True

Publication Details:
    - Frequency: Annual (January publication)
    - Reference period: Financial year (April - March)
    - Published by: NISRA / The Executive Office
    - Contact: pfganalytics@executiveoffice-ni.gov.uk
    - Population: Adults aged 16+ in Northern Ireland
"""

import logging
import re
from pathlib import Path

import pandas as pd

from ._base import NISRADataNotFoundError, download_file


[docs]
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


# NISRA landing page for the Individual Wellbeing statistics. The publication
# pages themselves live on the Executive Office site (EXEC_OFFICE_* below).

[docs]
WELLBEING_BASE_URL = "https://www.nisra.gov.uk/statistics/wellbeing/individual-wellbeing-northern-ireland"


[docs]
# Executive Office topic page that is scraped for links to the wellbeing reports.
EXEC_OFFICE_TOPIC_URL = "https://www.executiveoffice-ni.gov.uk/topics/individual-wellbeing-northern-ireland"


[docs]
# Executive Office site root, used to make relative publication links absolute.
EXEC_OFFICE_BASE_URL = "https://www.executiveoffice-ni.gov.uk"




[docs]
def get_latest_wellbeing_publication_url() -> tuple[str, str]:
    """Get the URL of the latest Individual Wellbeing publication and its year.

    Scrapes the Executive Office topic page, keeps every link whose text
    contains "Report" and a financial year such as "2024/25", and returns the
    one with the highest starting year.

    Returns:
        Tuple of (publication_url, year_string) e.g. ("https://...", "2024/25").
        Relative links are resolved against the Executive Office site root.

    Raises:
        NISRADataNotFoundError: If the topic page cannot be fetched, or if it
            contains no matching publication link.

    Example:
        >>> url, year = get_latest_wellbeing_publication_url()
        >>> url.startswith('https://')
        True
    """
    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    from bolster.utils.web import session

    logger.info("Fetching latest Individual Wellbeing publication URL...")

    try:
        response = session.get(EXEC_OFFICE_TOPIC_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Any transport/HTTP failure is reported as "data not found" to callers.
        raise NISRADataNotFoundError(f"Failed to fetch wellbeing page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Link text looks like "Individual Wellbeing in Northern Ireland Report 2024/25".
    year_pattern = re.compile(r"(\d{4})/(\d{2})")

    publications = []
    for link in soup.find_all("a", href=True):
        link_text = link.get_text(strip=True)
        if "Report" not in link_text:
            continue

        match = year_pattern.search(link_text)
        if match is None:
            continue

        start_year, end_suffix = match.groups()
        href = link["href"]

        # Absolute links pass through untouched. Everything else is resolved
        # with urljoin so that hrefs without a leading "/" still produce a
        # valid URL (plain concatenation would yield "...gov.ukpath").
        if href.startswith(("http://", "https://")):
            pub_url = href
        else:
            pub_url = urljoin(EXEC_OFFICE_BASE_URL, href)

        publications.append((int(start_year), f"{start_year}/{end_suffix}", pub_url))

    if not publications:
        raise NISRADataNotFoundError("Could not find latest Individual Wellbeing publication")

    # max() returns the first entry with the highest start year, i.e. the same
    # link a stable descending sort would put first.
    _, year_str, pub_url = max(publications, key=lambda pub: pub[0])

    logger.info(f"Found latest wellbeing publication: {year_str} at {pub_url}")
    return pub_url, year_str




[docs]
def get_wellbeing_file_url(year_str: str) -> str:
    """Build the download URL of the data tables workbook for a financial year.

    The URL is assembled from a fixed pattern; nothing is fetched or checked.

    Args:
        year_str: Financial year string (e.g., "2024/25")

    Returns:
        URL to the Excel data tables file

    Raises:
        ValueError: If the text before the first "/" is not an integer.

    Example:
        >>> url = get_wellbeing_file_url("2024/25")
        >>> url.startswith('https://')
        True
    """
    # The file name embeds the financial year without its slash: "2024/25" -> "202425".
    compact_year = year_str.replace("/", "")

    # Files sit in a "<year>-01" folder, where <year> is two years after the
    # start of the financial year (2024/25 data -> January 2026).
    first_year = int(year_str.partition("/")[0])
    upload_year = first_year + 2

    workbook_name = f"individual-wellbeing-ni-{compact_year}-data-tables.xlsx"
    file_url = f"https://www.executiveoffice-ni.gov.uk/sites/default/files/{upload_year}-01/{workbook_name}"

    logger.info(f"Constructed wellbeing file URL: {file_url}")
    return file_url




[docs]
def parse_personal_wellbeing(file_path: str | Path) -> pd.DataFrame:
    """Parse personal wellbeing (ONS4) measures from the Excel file.

    Extracts Life Satisfaction, Worthwhile, Happiness, and Anxiety mean scores
    from the time series data.

    Args:
        file_path: Path to the wellbeing data tables Excel file

    Returns:
        DataFrame with columns:
            - year: str (financial year, e.g., "2024/25")
            - life_satisfaction: float (mean score 0-10)
            - worthwhile: float (mean score 0-10)
            - happiness: float (mean score 0-10)
            - anxiety: float (mean score 0-10, lower is better)

    Example:
        >>> _, year = get_latest_wellbeing_publication_url()
        >>> path = download_file(get_wellbeing_file_url(year), cache_ttl_hours=90*24)
        >>> df = parse_personal_wellbeing(path)
        >>> 'life_satisfaction' in df.columns
        True
    """
    logger.info(f"Parsing personal wellbeing from {file_path}")

    # Sheet names for ONS4 measures
    sheet_configs = {
        "life_satisfaction": "Life_Satisfaction_Avg",
        "worthwhile": "Worthwhile_Avg",
        "happiness": "Happiness_Avg",
        "anxiety": "Anxiety_Avg ",  # Note: trailing space in sheet name
    }

    results = {}

    for metric, sheet_name in sheet_configs.items():
        try:
            # Read the sheet
            df_raw = pd.read_excel(
                file_path,
                sheet_name=sheet_name,
                header=None,
                skiprows=4,  # Skip to header row
                nrows=15,  # Enough rows for the time series
            )

            # Extract year and estimate columns (columns 1 and 2)
            data = []
            for _, row in df_raw.iterrows():
                year_val = row.iloc[1]
                estimate = row.iloc[2]

                # Check if this is a valid year row
                if isinstance(year_val, str) and "/" in year_val and pd.notna(estimate):
                    try:
                        data.append({"year": year_val, metric: float(estimate)})
                    except (ValueError, TypeError):
                        continue

            results[metric] = pd.DataFrame(data)

        except Exception as e:
            logger.warning(f"Failed to parse {metric} from {sheet_name}: {e}")
            continue

    # Merge all metrics on year
    if not results:
        raise NISRADataNotFoundError("Could not parse any personal wellbeing metrics")

    df = None
    for _metric, df_metric in results.items():
        df = df_metric if df is None else df.merge(df_metric, on="year", how="outer")

    # Sort by year
    df = df.sort_values("year").reset_index(drop=True)

    logger.info(f"Parsed {len(df)} years of personal wellbeing data")
    return df




[docs]
def parse_loneliness(file_path: str | Path) -> pd.DataFrame:
    """Parse loneliness data from the Excel file.

    Extracts the proportion of people who feel lonely at least some of the time.

    Args:
        file_path: Path to the wellbeing data tables Excel file

    Returns:
        DataFrame with columns:
            - year: str (financial year, e.g., "2024/25")
            - lonely_some_of_time: float (proportion, e.g., 0.179 = 17.9%)
            - confidence_interval: str (e.g., "+/- 1.1")

    Example:
        >>> _, year = get_latest_wellbeing_publication_url()
        >>> path = download_file(get_wellbeing_file_url(year), cache_ttl_hours=90*24)
        >>> df = parse_loneliness(path)
        >>> 'lonely_some_of_time' in df.columns
        True
    """
    logger.info(f"Parsing loneliness from {file_path}")

    df_raw = pd.read_excel(
        file_path,
        sheet_name="Loneliness - some of the time",
        header=None,
        skiprows=4,  # Skip to header row
        nrows=12,  # Time series rows
    )

    data = []
    for _, row in df_raw.iterrows():
        year_val = row.iloc[1]
        estimate = row.iloc[2]
        ci = row.iloc[3]

        if isinstance(year_val, str) and "/" in year_val and pd.notna(estimate):
            try:
                data.append(
                    {
                        "year": year_val,
                        "lonely_some_of_time": float(estimate),
                        "confidence_interval": str(ci) if pd.notna(ci) else None,
                    }
                )
            except (ValueError, TypeError):
                continue

    df = pd.DataFrame(data)
    df = df.sort_values("year").reset_index(drop=True)

    logger.info(f"Parsed {len(df)} years of loneliness data")
    return df




[docs]
def parse_self_efficacy(file_path: str | Path) -> pd.DataFrame:
    """Parse self-efficacy data from the Excel file.

    Self-efficacy measures a person's belief in their capabilities to influence
    events in their lives. Scores range from 5 to 25.

    Args:
        file_path: Path to the wellbeing data tables Excel file

    Returns:
        DataFrame with columns:
            - year: str (financial year, e.g., "2024/25")
            - self_efficacy_mean: float (mean score 5-25)
            - confidence_interval: str (e.g., "+/- 0.1")

    Example:
        >>> _, year = get_latest_wellbeing_publication_url()
        >>> path = download_file(get_wellbeing_file_url(year), cache_ttl_hours=90*24)
        >>> df = parse_self_efficacy(path)
        >>> 'self_efficacy_mean' in df.columns
        True
    """
    logger.info(f"Parsing self-efficacy from {file_path}")

    df_raw = pd.read_excel(
        file_path,
        sheet_name="Self-efficacy_avg",
        header=None,
        skiprows=3,  # Skip to header row
        nrows=15,  # Time series rows
    )

    data = []
    for _, row in df_raw.iterrows():
        year_val = row.iloc[1]
        estimate = row.iloc[2]
        ci = row.iloc[3]

        if isinstance(year_val, str) and "/" in year_val and pd.notna(estimate):
            try:
                data.append(
                    {
                        "year": year_val,
                        "self_efficacy_mean": float(estimate),
                        "confidence_interval": str(ci) if pd.notna(ci) else None,
                    }
                )
            except (ValueError, TypeError):
                continue

    df = pd.DataFrame(data)
    df = df.sort_values("year").reset_index(drop=True)

    logger.info(f"Parsed {len(df)} years of self-efficacy data")
    return df




[docs]
def get_latest_personal_wellbeing(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the ONS4 personal wellbeing time series.

    Looks up the latest Individual Wellbeing publication, downloads its data
    tables workbook and parses the four ONS personal wellbeing measures:
    Life Satisfaction, Worthwhile, Happiness, and Anxiety.

    Args:
        force_refresh: Force re-download even if cached

    Returns:
        DataFrame with columns:
            - year: str (financial year)
            - life_satisfaction: float (mean 0-10, higher is better)
            - worthwhile: float (mean 0-10, higher is better)
            - happiness: float (mean 0-10, higher is better)
            - anxiety: float (mean 0-10, lower is better)

    Example:
        >>> df = get_latest_personal_wellbeing()
        >>> 'life_satisfaction' in df.columns
        True
    """
    # Only the financial year is needed here; the publication page URL is unused.
    _page_url, financial_year = get_latest_wellbeing_publication_url()

    # 90 * 24 hours = a 90-day cache TTL for the downloaded workbook.
    workbook = download_file(
        get_wellbeing_file_url(financial_year),
        cache_ttl_hours=90 * 24,
        force_refresh=force_refresh,
    )

    return parse_personal_wellbeing(workbook)




[docs]
def get_latest_loneliness(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the loneliness time series.

    Looks up the latest Individual Wellbeing publication, downloads its data
    tables workbook and parses the loneliness statistics (proportion feeling
    lonely at least some of the time).

    Args:
        force_refresh: Force re-download even if cached

    Returns:
        DataFrame with columns:
            - year: str (financial year)
            - lonely_some_of_time: float (proportion)
            - confidence_interval: str

    Example:
        >>> df = get_latest_loneliness()
        >>> 'lonely_some_of_time' in df.columns
        True
    """
    # Only the financial year is needed here; the publication page URL is unused.
    _page_url, financial_year = get_latest_wellbeing_publication_url()

    # 90 * 24 hours = a 90-day cache TTL for the downloaded workbook.
    workbook = download_file(
        get_wellbeing_file_url(financial_year),
        cache_ttl_hours=90 * 24,
        force_refresh=force_refresh,
    )

    return parse_loneliness(workbook)




[docs]
def get_latest_self_efficacy(force_refresh: bool = False) -> pd.DataFrame:
    """Fetch and parse the self-efficacy time series.

    Looks up the latest Individual Wellbeing publication, downloads its data
    tables workbook and parses the self-efficacy statistics (mean scores 5-25).

    Args:
        force_refresh: Force re-download even if cached

    Returns:
        DataFrame with columns:
            - year: str (financial year)
            - self_efficacy_mean: float (mean 5-25)
            - confidence_interval: str

    Example:
        >>> df = get_latest_self_efficacy()
        >>> 'self_efficacy_mean' in df.columns
        True
    """
    # Only the financial year is needed here; the publication page URL is unused.
    _page_url, financial_year = get_latest_wellbeing_publication_url()

    # 90 * 24 hours = a 90-day cache TTL for the downloaded workbook.
    workbook = download_file(
        get_wellbeing_file_url(financial_year),
        cache_ttl_hours=90 * 24,
        force_refresh=force_refresh,
    )

    return parse_self_efficacy(workbook)




[docs]
def get_wellbeing_summary(force_refresh: bool = False) -> pd.DataFrame:
    """Build a one-row summary of every wellbeing measure for the latest year.

    The latest year is the last row of the personal wellbeing (ONS4) series.
    Loneliness and self-efficacy values are added only when their own series
    contains that same year.

    Args:
        force_refresh: Force re-download even if cached

    Returns:
        DataFrame with one row containing:
            - year: str
            - life_satisfaction: float
            - worthwhile: float
            - happiness: float
            - anxiety: float
            - lonely_some_of_time: float
            - self_efficacy_mean: float

    Example:
        >>> summary = get_wellbeing_summary()
        >>> 'life_satisfaction' in summary.columns
        True
    """
    # Only the first fetch honours force_refresh; the other two are called
    # with force_refresh=False, as they request the same workbook.
    personal = get_latest_personal_wellbeing(force_refresh=force_refresh)
    loneliness = get_latest_loneliness(force_refresh=False)
    efficacy = get_latest_self_efficacy(force_refresh=False)

    # The personal wellbeing series decides which year is "latest"
    latest_year = personal["year"].iloc[-1]
    personal_row = personal.loc[personal["year"] == latest_year].iloc[0]

    summary = {"year": latest_year}

    # Copy across whichever ONS4 measures were parsed
    ons4_measures = ("life_satisfaction", "worthwhile", "happiness", "anxiety")
    summary.update({col: personal_row[col] for col in ons4_measures if col in personal_row})

    # Loneliness first, then self-efficacy; each only if it covers latest_year
    supplementary = (
        (loneliness, "lonely_some_of_time"),
        (efficacy, "self_efficacy_mean"),
    )
    for frame, column in supplementary:
        same_year = frame.loc[frame["year"] == latest_year]
        if not same_year.empty:
            summary[column] = same_year.iloc[0][column]

    return pd.DataFrame([summary])




[docs]
def get_personal_wellbeing_by_year(df: pd.DataFrame, year: str) -> pd.DataFrame:
    """Select the rows of a personal wellbeing frame that belong to one year.

    Args:
        df: DataFrame from get_latest_personal_wellbeing()
        year: Financial year string (e.g., "2024/25")

    Returns:
        A copy of the rows whose "year" equals ``year`` (empty if none match);
        the original index labels are kept.

    Example:
        >>> df = get_latest_personal_wellbeing()
        >>> df_2024 = get_personal_wellbeing_by_year(df, "2024/25")
        >>> 'life_satisfaction' in df_2024.columns
        True
    """
    is_requested_year = df["year"] == year
    # Copy so callers can modify the result without touching the input frame
    return df.loc[is_requested_year].copy()




[docs]
def validate_personal_wellbeing(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Check a personal wellbeing frame for internal consistency.

    The checks run in this order and stop at the first failure:
    - the year column and all four ONS4 measure columns are present
    - every measure's minimum and maximum lie within 0-10
    - no year appears more than once

    Args:
        df: DataFrame from get_latest_personal_wellbeing()

    Returns:
        True if validation passes

    Raises:
        ValueError: If validation fails
    """
    ons4_measures = ("life_satisfaction", "worthwhile", "happiness", "anxiety")

    required_cols = {"year", *ons4_measures}
    absent = required_cols - set(df.columns)
    if absent:
        raise ValueError(f"Missing columns: {absent}")

    # Every measure column is known to exist from here on
    for col in ons4_measures:
        scores = df[col]
        if scores.min() < 0 or scores.max() > 10:
            raise ValueError(f"{col} scores outside valid range 0-10")

    repeated_years = df["year"].duplicated()
    if repeated_years.any():
        raise ValueError("Duplicate years found")

    return True