Source code for bolster.data_sources.nisra.work_quality

"""NISRA Work Quality in Northern Ireland Module.

This module provides access to Work Quality statistics for Northern Ireland,
covering seventeen indicators of job quality for employees aged 18 and over.

The publication draws on two main sources:
- Labour Force Survey (LFS): subjective and contractual indicators
- Annual Survey of Hours and Earnings (ASHE): earnings-based indicator

Work quality indicators covered:
    1.  Real Living Wage: proportion earning above the Real Living Wage
    2.  Zero-hours contracts: proportion not on zero-hours contracts
    3.  Secure employment: permanent or preferred temporary contracts
    4.  Job satisfaction: satisfied or very satisfied with job
    5.  Career progression: opportunities for career progression
    6.  Decision making: involvement in workplace decisions
    7.  Meaningful work: performing work perceived as meaningful
    8.  Flexible work: access to flexible working arrangements
    9.  Line manager support: positive line manager relationship
    10. Bullying & harassment: not bullied or harassed at work
    11. Skills match: has the skills required for current duties
    12. Workplace accidents: no accident reported at work
    13. Underemployment: employees who are underemployed
    14. Overemployment: employees who are overemployed
    15. Trade union membership: member of a trade union
    16. Training participation: participated in training (last 13 weeks)
    17. Overtime working: reported working overtime

Data Source: NISRA Labour Market and Social Welfare statistics.
    Index page: https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare/work-quality

Update Frequency: Annual (published in spring for the preceding calendar year).

Data Coverage:
    - Most indicators: 2020 to present (calendar year)
    - Line manager support, bullying & harassment, skills match: 2022 to present
      (reported as rolling 12-month periods, e.g., "July 2024 to June 2025")

Breakdowns available per indicator:
    - Overall (NI)
    - By sex (Male / Female)
    - By age group (18-39 / 40 and over)
    - By deprivation quintile (most deprived / least deprived)
    - By skill level (low skilled / high skilled)

Examples:
    >>> from bolster.data_sources.nisra import work_quality
    >>> df = work_quality.get_latest_work_quality()
    >>> 'indicator' in df.columns
    True
    >>> 'value' in df.columns
    True
    >>> url, year = work_quality.get_work_quality_publication_url()
    >>> url.startswith('https://')
    True
    >>> year >= 2025
    True

Publication Details:
    - Frequency: Annual
    - Published by: NISRA
    - Coverage: Employees aged 18 and over in Northern Ireland
    - Source surveys: Labour Force Survey (LFS) and ASHE
"""

import logging
import re
from pathlib import Path

import openpyxl
import pandas as pd

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file


[docs]
logger = logging.getLogger(__name__)


# Stable index page listing all work quality publications

[docs]
WORK_QUALITY_INDEX_URL = "https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare/work-quality"


[docs]
NISRA_BASE_URL = "https://www.nisra.gov.uk"


# All 17 indicator sheet names and their canonical indicator names
_INDICATOR_SHEETS: dict[str, str] = {
    "Table_1.1": "real_living_wage",
    "Table_1.2": "no_zero_hours_contract",
    "Table_1.3": "secure_employment",
    "Table_1.4": "job_satisfaction",
    "Table_1.5": "career_progression",
    "Table_1.6": "decision_making",
    "Table_1.7": "meaningful_work",
    "Table_1.8": "flexible_work",
    "Table_1.9": "line_manager_support",
    "Table_1.10": "no_bullying_harassment",
    "Table_1.11": "skills_match",
    "Table_1.12": "no_workplace_accident",
    "Table_1.13": "underemployed",
    "Table_1.14": "overemployed",
    "Table_1.15": "trade_union_member",
    "Table_1.16": "recent_training",
    "Table_1.17": "working_overtime",
}

# Row-type label for the overall NI figure (sub-table 'a')
_OVERALL_TABLE_SUFFIX = "a"



[docs]
def get_work_quality_publication_url() -> tuple[str, int]:
    """Get the URL of the latest Work Quality Excel file and publication year.

    Scrapes the NISRA work quality index page to discover the most recent
    publication link, then follows to the publication page to find the Excel file.

    Returns:
        Tuple of (excel_url, year) e.g.
        ("https://www.nisra.gov.uk/system/files/.../work-quality-2025.xlsx", 2025)

    Raises:
        NISRADataNotFoundError: If unable to find the publication or Excel file

    Example:
        >>> url, year = get_work_quality_publication_url()
        >>> url.startswith('https://')
        True
        >>> year >= 2025
        True
    """
    from bs4 import BeautifulSoup

    logger.info("Fetching Work Quality publication URL from index page...")

    try:
        response = session.get(WORK_QUALITY_INDEX_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch Work Quality index page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Collect all links whose text matches "Work Quality in Northern Ireland YYYY"
    publications: list[tuple[int, str]] = []
    for a_tag in soup.find_all("a", href=True):
        text = a_tag.get_text(strip=True)
        href = a_tag["href"]
        match = re.search(r"Work Quality in Northern Ireland\s+(\d{4})", text, re.IGNORECASE)
        if match:
            year = int(match.group(1))
            pub_url = href if href.startswith("http") else f"{NISRA_BASE_URL}{href}"
            publications.append((year, pub_url))
            logger.debug(f"Found publication: {year} → {pub_url}")

    if not publications:
        raise NISRADataNotFoundError(
            f"Could not find any Work Quality publication links on the index page. Check: {WORK_QUALITY_INDEX_URL}"
        )

    # Use the most recent publication
    publications.sort(key=lambda x: x[0], reverse=True)
    latest_year, pub_url = publications[0]
    logger.info(f"Latest Work Quality publication: {latest_year} at {pub_url}")

    # Follow to the publication page and find the Excel file
    try:
        pub_response = session.get(pub_url, timeout=30)
        pub_response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch Work Quality publication page {pub_url}: {e}") from e

    pub_soup = BeautifulSoup(pub_response.content, "html.parser")

    excel_url: str | None = None
    for a_tag in pub_soup.find_all("a", href=True):
        href = a_tag["href"]
        if ".xlsx" in href.lower() and "work-quality" in href.lower():
            excel_url = href if href.startswith("http") else f"{NISRA_BASE_URL}{href}"
            break

    if excel_url is None:
        # Fallback: any .xlsx link on the publication page
        for a_tag in pub_soup.find_all("a", href=True):
            href = a_tag["href"]
            if ".xlsx" in href.lower():
                excel_url = href if href.startswith("http") else f"{NISRA_BASE_URL}{href}"
                logger.warning(f"Using fallback Excel link (no 'work-quality' in path): {excel_url}")
                break

    if excel_url is None:
        raise NISRADataNotFoundError(f"Could not find Excel file on Work Quality publication page: {pub_url}")

    logger.info(f"Found Work Quality Excel URL: {excel_url}")
    return excel_url, latest_year



def _normalise_year_label(raw: str | int | float) -> str | None:
    """Normalise a raw year cell value to a compact string label.

    Accepts integer years (2020), float years (2025.0), and period strings
    ("July 2024 to June 2025").  Returns None for blank / unsupported values.

    Args:
        raw: The raw cell value from the Excel sheet.

    Returns:
        Normalised label (e.g. "2025" or "July 2024 to June 2025") or None.

    Example:
        >>> _normalise_year_label(2025)
        '2025'
        >>> _normalise_year_label(2025.0)
        '2025'
        >>> _normalise_year_label('July 2024 to June 2025')
        'July 2024 to June 2025'
        >>> _normalise_year_label(None) is None
        True
    """
    if raw is None:
        return None
    if isinstance(raw, int | float):
        try:
            return str(int(raw))
        except (ValueError, TypeError):
            return None
    raw_str = str(raw).strip()
    if not raw_str or raw_str.lower() in ("none", "nan", "year"):
        return None
    return raw_str


def _extract_year_int(label: str) -> int:
    """Extract the latest calendar year from a year label.

    For plain year strings ("2025") returns 2025.
    For period strings ("July 2024 to June 2025") returns 2025 (end year).

    Args:
        label: Normalised year label as returned by _normalise_year_label.

    Returns:
        Integer calendar year.

    Example:
        >>> _extract_year_int('2025')
        2025
        >>> _extract_year_int('July 2024 to June 2025')
        2025
    """
    # Find all 4-digit years
    years_found = [int(m) for m in re.findall(r"\b(\d{4})\b", label)]
    if not years_found:
        raise ValueError(f"No year found in label: {label!r}")
    return max(years_found)


def _parse_indicator_sheet(
    wb: openpyxl.Workbook,
    sheet_name: str,
    indicator: str,
) -> list[dict]:
    """Parse the overall NI row (sub-table 'a') from one indicator sheet.

    Each sheet contains multiple sub-tables stacked vertically.  Sub-table 'a'
    always holds the overall NI proportions; we only extract that for the
    canonical long-format output.

    Args:
        wb: Open openpyxl workbook (read_only=True, data_only=True).
        sheet_name: Name of the worksheet (e.g. "Table_1.1").
        indicator: Canonical snake_case indicator name.

    Returns:
        List of dicts with keys: indicator, year_label, year, value, geography.
    """
    ws = wb[sheet_name]
    rows: list[dict] = []
    in_overall_table = False

    for row_vals in ws.iter_rows(values_only=True):
        if not any(v is not None for v in row_vals):
            continue

        cell0 = str(row_vals[0]).strip() if row_vals[0] is not None else ""

        # Detect the start of sub-table 'a' (overall NI)
        # Pattern: "Table X.Xa: ..."  (letter 'a' suffix)
        if re.match(r"Table\s+\d+\.\d+a\s*:", cell0, re.IGNORECASE):
            in_overall_table = True
            continue

        # Detect start of any OTHER sub-table (b, c, d, e) — stop reading
        if re.match(r"Table\s+\d+\.\d+[b-zA-Z]\s*:", cell0, re.IGNORECASE):
            if in_overall_table:
                break  # done with sub-table 'a'
            continue

        if not in_overall_table:
            continue

        # Skip the header row ("Year", "Proportion …")
        if cell0.lower() == "year":
            continue

        # Try to parse a data row: col0 = year, col1 = value
        year_label = _normalise_year_label(row_vals[0])
        if year_label is None:
            continue

        raw_val = row_vals[1] if len(row_vals) > 1 else None
        # Skip suppressed values marked [x]
        if isinstance(raw_val, str) and "[x]" in raw_val:
            logger.debug(f"{indicator} {year_label}: suppressed value [x]")
            continue
        if raw_val is None:
            continue

        try:
            value = float(raw_val)
        except (ValueError, TypeError):
            logger.debug(f"{indicator} {year_label}: could not convert {raw_val!r} to float")
            continue

        try:
            year_int = _extract_year_int(year_label)
        except ValueError:
            logger.debug(f"{indicator}: could not extract year from {year_label!r}")
            continue

        rows.append(
            {
                "indicator": indicator,
                "year_label": year_label,
                "year": year_int,
                "value": value,
                "geography": "Northern Ireland",
            }
        )

    return rows



[docs]
def parse_work_quality_file(file_path: str | Path, year: int) -> pd.DataFrame:
    """Parse a Work Quality Excel file into a tidy long-format DataFrame.

    Extracts the overall NI proportion for each of the 17 work quality
    indicators.  One row per (indicator, year_label) combination.

    Args:
        file_path: Path to the Work Quality Excel (.xlsx) file.
        year: Publication year (e.g. 2025 for the 2025 publication).

    Returns:
        DataFrame with columns:
            - indicator: str  — snake_case indicator name (e.g. "job_satisfaction")
            - year_label: str — year as printed in the source (e.g. "2025" or
              "July 2024 to June 2025" for rolling-period indicators)
            - year: int       — latest calendar year in the period
            - value: float    — proportion (percentage, 0-100)
            - geography: str  — always "Northern Ireland" for this dataset

    Raises:
        NISRADataNotFoundError: If the file cannot be opened or no data is found.

    Example:
        >>> url, yr = get_work_quality_publication_url()
        >>> path = download_file(url, cache_ttl_hours=90 * 24)
        >>> df = parse_work_quality_file(path, yr)
        >>> sorted(df.columns.tolist())
        ['geography', 'indicator', 'value', 'year', 'year_label']
        >>> len(df) > 0
        True
    """
    logger.info(f"Parsing Work Quality file: {file_path} (publication year {year})")
    file_path = Path(file_path)

    try:
        wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
    except Exception as e:
        raise NISRADataNotFoundError(f"Cannot open Work Quality file {file_path}: {e}") from e

    all_rows: list[dict] = []
    for sheet_name, indicator in _INDICATOR_SHEETS.items():
        if sheet_name not in wb.sheetnames:
            logger.warning(f"Sheet {sheet_name!r} not found in workbook — skipping {indicator}")
            continue
        rows = _parse_indicator_sheet(wb, sheet_name, indicator)
        if not rows:
            logger.warning(f"No data rows extracted from {sheet_name} ({indicator})")
        all_rows.extend(rows)

    wb.close()

    if not all_rows:
        raise NISRADataNotFoundError(f"No data extracted from Work Quality file: {file_path}")

    df = pd.DataFrame(all_rows)
    df = df.sort_values(["indicator", "year", "year_label"]).reset_index(drop=True)
    logger.info(
        f"Parsed {len(df)} rows across {df['indicator'].nunique()} indicators "
        f"(years {df['year'].min()}–{df['year'].max()})"
    )
    return df




[docs]
def get_latest_work_quality(force_refresh: bool = False) -> pd.DataFrame:
    """Download and parse the latest Work Quality publication.

    Automatically discovers the current publication URL by scraping the NISRA
    index page, downloads the Excel file (caching for 90 days), and returns a
    tidy long-format DataFrame.

    Args:
        force_refresh: If True, bypass cache and re-download the file.

    Returns:
        DataFrame with columns:
            - indicator: str  (e.g. "job_satisfaction")
            - year_label: str (e.g. "2025" or "July 2024 to June 2025")
            - year: int
            - value: float    (percentage, 0-100)
            - geography: str  ("Northern Ireland")

    Raises:
        NISRADataNotFoundError: If the publication or file cannot be found.

    Example:
        >>> df = get_latest_work_quality()
        >>> 'indicator' in df.columns
        True
        >>> df['year'].max() >= 2025
        True
    """
    excel_url, year = get_work_quality_publication_url()
    file_path = download_file(excel_url, cache_ttl_hours=90 * 24, force_refresh=force_refresh)
    return parse_work_quality_file(file_path, year)




[docs]
def validate_work_quality_data(df: pd.DataFrame) -> bool:
    """Validate a Work Quality DataFrame for expected structure and values.

    Args:
        df: DataFrame returned by parse_work_quality_file or get_latest_work_quality.

    Returns:
        True if all checks pass.

    Raises:
        NISRAValidationError: If any validation check fails.

    Example:
        >>> df = get_latest_work_quality()
        >>> validate_work_quality_data(df)
        True
    """
    if df.empty:
        raise NISRAValidationError("DataFrame is empty")

    required_columns = {"indicator", "year_label", "year", "value", "geography"}
    missing = required_columns - set(df.columns)
    if missing:
        raise NISRAValidationError(f"Missing required columns: {missing}")

    if df["indicator"].nunique() < 10:
        raise NISRAValidationError(f"Expected at least 10 indicators, found {df['indicator'].nunique()}")

    if df["year"].max() < 2024:
        raise NISRAValidationError(f"No recent data found — latest year is {df['year'].max()}")

    out_of_range = df[(df["value"] < 0) | (df["value"] > 100)]
    if not out_of_range.empty:
        raise NISRAValidationError(f"Values outside valid percentage range [0, 100]:\n{out_of_range}")

    if df["geography"].nunique() != 1 or df["geography"].iloc[0] != "Northern Ireland":
        raise NISRAValidationError(f"Unexpected geography values: {df['geography'].unique().tolist()}")

    logger.info(
        f"Work Quality data validation passed: {len(df)} rows, "
        f"{df['indicator'].nunique()} indicators, years {df['year'].min()}–{df['year'].max()}"
    )
    return True