Source code for bolster.data_sources.nisra.quarterly_employment_survey

"""NISRA Quarterly Employment Survey (QES) for Northern Ireland.

Provides quarterly employee job estimates for Northern Ireland by broad industry
sector, both seasonally adjusted and unadjusted. The QES is used by the ONS as
the primary source for NI estimates within UK-wide quarterly workforce statistics.

Sectors covered:
- Manufacturing
- Construction
- Services
- Other Industries
- All Industries (total)

Data Source:
    **Statistics page**: https://www.nisra.gov.uk/statistics/work-pay-and-benefits/quarterly-employment-survey
    The module scrapes this page to find the latest quarterly supplementary tables,
    then downloads the Excel file.

Update Frequency: Quarterly (published ~2 weeks after reference quarter)
Geographic Coverage: Northern Ireland
Time Series: Q1 2005 to present (seasonally adjusted); Q1 2005 unadjusted

Example:
    >>> from bolster.data_sources.nisra import quarterly_employment_survey as qes
    >>> df = qes.get_latest_qes()
    >>> 'total_jobs' in df.columns
    True
    >>> growth = qes.get_qes_growth(df)
    >>> 'total_yoy' in growth.columns
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file


[docs]
logger = logging.getLogger(__name__)



[docs]
QES_STATS_URL = "https://www.nisra.gov.uk/statistics/work-pay-and-benefits/quarterly-employment-survey"


[docs]
QES_BASE_URL = "https://www.nisra.gov.uk"


# Quarter abbreviations used in QES file date labels
_QUARTER_MONTHS = {"Mar": (3, 1), "Jun": (6, 2), "Sep": (9, 3), "Dec": (12, 4)}



[docs]
def get_latest_qes_publication_url() -> str:
    """Scrape the QES statistics page to find the latest supplementary tables Excel.

    Returns:
        URL of the latest QES supplementary tables Excel file

    Raises:
        NISRADataNotFoundError: If publication cannot be found
    """
    try:
        response = session.get(QES_STATS_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch QES statistics page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find the supplementary tables publication link
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "supplementary-tables" not in href or "quarterly-employment-survey" not in href:
            continue

        pub_url = href if href.startswith("http") else f"{QES_BASE_URL}{href}"
        try:
            pub_resp = session.get(pub_url, timeout=30)
            pub_resp.raise_for_status()
        except Exception as e:
            logger.warning(f"Failed to fetch QES publication page {pub_url}: {e}")
            continue

        pub_soup = BeautifulSoup(pub_resp.content, "html.parser")
        for file_link in pub_soup.find_all("a", href=True):
            file_href = file_link["href"]
            if "supplementary_tables" in file_href and file_href.endswith(".xlsx"):
                excel_url = file_href if file_href.startswith("http") else f"{QES_BASE_URL}{file_href}"
                logger.info(f"Found QES supplementary tables: {excel_url}")
                return excel_url

    raise NISRADataNotFoundError("Could not find QES supplementary tables Excel file")



def _parse_qes_date(label: str) -> pd.Timestamp | None:
    """Parse QES quarter label to Timestamp.

    Handles both Excel-parsed datetime objects (coerced to str) and
    string labels like "Dec-25 (Provisional)" or "Mar-24 (Revised)".

    Args:
        label: Quarter label string

    Returns:
        Timestamp for start of quarter, or None if unparseable
    """
    label = str(label).strip()

    # Try to detect if label looks like an ISO date string from Excel parsing
    # e.g. "2005-03-01 00:00:00" or "2005-06-01"
    iso_match = re.match(r"^(\d{4})-(\d{2})-\d{2}", label)
    if iso_match:
        year = int(iso_match.group(1))
        month = int(iso_match.group(2))
        for _abbr, (m, _q) in _QUARTER_MONTHS.items():
            if m == month:
                return pd.Timestamp(year=year, month=month, day=1)
        return None

    # Label format: "Mar-25 (Provisional)" or "Dec-24 (Revised)"
    str_match = re.match(r"^([A-Za-z]{3})-(\d{2})", label)
    if str_match:
        month_abbr = str_match.group(1).capitalize()
        year_2d = int(str_match.group(2))
        year = 2000 + year_2d if year_2d < 50 else 1900 + year_2d
        if month_abbr in _QUARTER_MONTHS:
            month, _ = _QUARTER_MONTHS[month_abbr]
            return pd.Timestamp(year=year, month=month, day=1)

    return None


def _quarter_from_month(month: int) -> int:
    """Map month number to quarter (1-4)."""
    return (month - 1) // 3 + 1



[docs]
def parse_qes_file(file_path: str | Path, adjusted: bool = True) -> pd.DataFrame:
    """Parse QES supplementary tables Excel file into long-format DataFrame.

    Reads Table 5.2 (seasonally adjusted) or Table 5.3 (unadjusted), which
    contain quarterly employee job counts by broad industry sector.

    Args:
        file_path: Path to downloaded QES supplementary tables Excel file
        adjusted: If True, read Table 5.2 (seasonally adjusted).
                  If False, read Table 5.3 (unadjusted).

    Returns:
        DataFrame with columns:
            - date: Timestamp (first day of quarter)
            - year: int
            - quarter: int (1-4)
            - quarter_label: str (e.g. "Q1 2025")
            - manufacturing_jobs: int
            - construction_jobs: int
            - services_jobs: int
            - other_jobs: int
            - total_jobs: int
            - adjusted: bool (True if seasonally adjusted)

    Raises:
        NISRAValidationError: If file structure is unexpected
    """
    file_path = Path(file_path)
    sheet_name = "Table 5.2" if adjusted else "Table 5.3"

    try:
        raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    except Exception as e:
        raise NISRAValidationError(f"Failed to read QES file sheet {sheet_name}: {e}") from e

    # Row 2 (index 2) is the header row
    header_row = raw.iloc[2]

    # Map column positions for the "Employee Jobs" columns (not "Quarterly Change")
    col_map = {}
    for i, val in enumerate(header_row):
        val_str = str(val).replace("\n", " ").strip().upper()
        if "MANUFACTURING" in val_str and "EMPLOYEE JOBS" in val_str:
            col_map["manufacturing_jobs"] = i
        elif "CONSTRUCTION" in val_str and "EMPLOYEE JOBS" in val_str:
            col_map["construction_jobs"] = i
        elif "SERVICES" in val_str and "EMPLOYEE JOBS" in val_str:
            col_map["services_jobs"] = i
        elif "OTHER" in val_str and "EMPLOYEE JOBS" in val_str:
            col_map["other_jobs"] = i
        elif "ALL" in val_str and "EMPLOYEE JOBS" in val_str:
            col_map["total_jobs"] = i

    if len(col_map) < 5:
        raise NISRAValidationError(
            f"Could not locate all sector columns in {sheet_name}. Found: {list(col_map.keys())}"
        )

    # Data rows start at index 3
    records = []
    for i in range(3, len(raw)):
        row = raw.iloc[i]
        label = str(row.iloc[0]).strip()
        if not label or label == "nan":
            continue

        ts = _parse_qes_date(label)
        if ts is None:
            continue

        record = {
            "date": ts,
            "year": ts.year,
            "quarter": _quarter_from_month(ts.month),
            "quarter_label": f"Q{_quarter_from_month(ts.month)} {ts.year}",
            "adjusted": adjusted,
        }
        for col_name, col_idx in col_map.items():
            val = row.iloc[col_idx]
            record[col_name] = int(val) if not pd.isna(val) and val != "-" else None

        records.append(record)

    if not records:
        raise NISRAValidationError(f"No valid data rows found in QES {sheet_name}")

    df = pd.DataFrame(records)
    col_order = [
        "date",
        "year",
        "quarter",
        "quarter_label",
        "manufacturing_jobs",
        "construction_jobs",
        "services_jobs",
        "other_jobs",
        "total_jobs",
        "adjusted",
    ]
    df = df[col_order]
    df = df.sort_values("date").reset_index(drop=True)

    label = "adjusted" if adjusted else "unadjusted"
    logger.info(
        f"Parsed QES ({label}): {len(df)} quarters, "
        f"{df['year'].min()} Q{df['quarter'].iloc[0]}-{df['year'].max()} Q{df['quarter'].iloc[-1]}"
    )
    return df




[docs]
def get_latest_qes(force_refresh: bool = False, adjusted: bool = True) -> pd.DataFrame:
    """Get the latest NI Quarterly Employment Survey data.

    Args:
        force_refresh: If True, bypass cache and download fresh data
        adjusted: If True (default), return seasonally adjusted series (Table 5.2).
                  If False, return unadjusted series (Table 5.3).

    Returns:
        DataFrame with columns: date, year, quarter, quarter_label,
        manufacturing_jobs, construction_jobs, services_jobs, other_jobs,
        total_jobs, adjusted

    Raises:
        NISRADataNotFoundError: If latest publication cannot be found
        NISRAValidationError: If file structure is unexpected

    Example:
        >>> df = get_latest_qes()
        >>> 'total_jobs' in df.columns
        True
    """
    excel_url = get_latest_qes_publication_url()
    logger.info(f"Downloading QES supplementary tables from: {excel_url}")
    file_path = download_file(excel_url, cache_ttl_hours=24 * 90, force_refresh=force_refresh)
    return parse_qes_file(file_path, adjusted=adjusted)




[docs]
def validate_qes_data(df: pd.DataFrame) -> bool:
    """Validate QES DataFrame for basic integrity.

    Args:
        df: DataFrame from get_latest_qes()

    Returns:
        True if validation passes

    Raises:
        NISRAValidationError: If validation fails
    """
    required = {
        "date",
        "year",
        "quarter",
        "total_jobs",
        "manufacturing_jobs",
        "construction_jobs",
        "services_jobs",
        "other_jobs",
    }
    missing = required - set(df.columns)
    if missing:
        raise NISRAValidationError(f"Missing required columns: {missing}")

    if df.empty:
        raise NISRAValidationError("DataFrame is empty")

    if len(df) < 40:
        raise NISRAValidationError(f"Too few rows ({len(df)}); expected 40+ quarters from 2005")

    if (df["total_jobs"] < 0).any():
        raise NISRAValidationError("Negative total_jobs values found")

    # Total jobs in NI should be roughly 600k-900k across the series
    if (df["total_jobs"] > 1_000_000).any():
        raise NISRAValidationError("total_jobs implausibly high (>1,000,000)")

    if (df["total_jobs"] < 500_000).any():
        raise NISRAValidationError("total_jobs implausibly low (<500,000)")

    return True




[docs]
def get_qes_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Filter QES data to a specific year.

    Args:
        df: DataFrame from get_latest_qes()
        year: Year to filter

    Returns:
        Filtered DataFrame (up to 4 quarters)

    Example:
        >>> df = get_latest_qes()
        >>> df_2024 = get_qes_by_year(df, 2024)
        >>> len(df_2024) <= 4
        True
    """
    return df[df["year"] == year].reset_index(drop=True)




[docs]
def get_qes_growth(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate quarter-on-quarter and year-on-year growth for total employee jobs.

    Args:
        df: DataFrame from get_latest_qes()

    Returns:
        DataFrame with additional columns:
            - total_qoq: Total jobs quarter-on-quarter change (absolute)
            - total_yoy: Total jobs year-on-year % change
            - services_yoy: Services jobs year-on-year % change
            - manufacturing_yoy: Manufacturing jobs year-on-year % change

    Example:
        >>> df = get_latest_qes()
        >>> growth = get_qes_growth(df)
        >>> 'total_yoy' in growth.columns
        True
    """
    result = df.copy()
    result["total_qoq"] = result["total_jobs"].diff(1)
    result["total_yoy"] = result["total_jobs"].pct_change(4).mul(100).round(2)
    result["services_yoy"] = result["services_jobs"].pct_change(4).mul(100).round(2)
    result["manufacturing_yoy"] = result["manufacturing_jobs"].pct_change(4).mul(100).round(2)
    return result




[docs]
def get_sector_shares(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate each sector's share of total employee jobs.

    Args:
        df: DataFrame from get_latest_qes()

    Returns:
        DataFrame with additional columns:
            - manufacturing_share: Manufacturing % of total
            - construction_share: Construction % of total
            - services_share: Services % of total
            - other_share: Other industries % of total

    Example:
        >>> df = get_latest_qes()
        >>> shares = get_sector_shares(df)
        >>> 'services_share' in shares.columns
        True
    """
    result = df.copy()
    for sector in ("manufacturing", "construction", "services", "other"):
        result[f"{sector}_share"] = (result[f"{sector}_jobs"] / result["total_jobs"] * 100).round(1)
    return result