Source code for bolster.data_sources.nisra.index_of_services

"""NISRA Index of Services (IOS) for Northern Ireland.

Provides quarterly index of services sector output, comparing Northern Ireland
against the UK average. Base year 2020=100.

Services sectors covered:
- Wholesale and retail trade; repair of motor vehicles
- Accommodation and food service activities
- Information and communication
- Financial and insurance activities
- Professional, scientific and technical activities
- Public administration and defence
- Education
- Human health and social work activities
- Other services

Data Source:
    **Statistics page**: https://www.nisra.gov.uk/statistics/economic-output/index-services
    The module scrapes this page to find the latest quarterly publication,
    then downloads the Excel tables file.

Update Frequency: Quarterly (published ~3 months after reference quarter)
Geographic Coverage: Northern Ireland and UK comparison
Base Year: 2020=100

Example:
    >>> from bolster.data_sources.nisra import index_of_services as ios
    >>> df = ios.get_latest_ios()
    >>> 'ni_index' in df.columns
    True
    >>> growth = ios.get_ios_growth(df)
    >>> 'ni_yoy' in growth.columns
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file

[docs] logger = logging.getLogger(__name__)
[docs] IOS_STATS_URL = "https://www.nisra.gov.uk/statistics/economic-output/index-services"
[docs] IOS_BASE_URL = "https://www.nisra.gov.uk"
[docs] def get_latest_ios_publication_url() -> tuple[str, int, int]: """Scrape the IOS statistics page to find the latest quarterly Excel file. Returns: Tuple of (excel_url, year, quarter) Raises: NISRADataNotFoundError: If publication cannot be found """ try: response = session.get(IOS_STATS_URL, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch IOS statistics page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Pub list uses slugs like: /publications/index-services-ios-...-quarter-4-2025... quarter_slug_pat = re.compile(r"quarter[- _](\d+)[- _](\d{4})", re.IGNORECASE) for link in soup.find_all("a", href=True): href = link["href"] if "index-services-ios" not in href or "publications" not in href: continue match = quarter_slug_pat.search(href) if not match: continue quarter = int(match.group(1)) year = int(match.group(2)) pub_url = href if href.startswith("http") else f"{IOS_BASE_URL}{href}" try: pub_resp = session.get(pub_url, timeout=30) pub_resp.raise_for_status() except Exception as e: logger.warning(f"Failed to fetch publication page {pub_url}: {e}") continue pub_soup = BeautifulSoup(pub_resp.content, "html.parser") for file_link in pub_soup.find_all("a", href=True): file_href = file_link["href"] file_text = file_link.get_text(strip=True).lower() if "ios" in file_text and "tables" in file_text and file_href.endswith(".xlsx"): excel_url = file_href if file_href.startswith("http") else f"{IOS_BASE_URL}{file_href}" logger.info(f"Found IOS Q{quarter} {year}: {excel_url}") return excel_url, year, quarter raise NISRADataNotFoundError("Could not find latest IOS publication")
[docs] def parse_ios_file(file_path: str | Path) -> pd.DataFrame: """Parse IOS Excel tables file into long-format DataFrame. Reads Table_1_1 which contains the headline NI and UK services index series. Args: file_path: Path to downloaded IOS tables Excel file Returns: DataFrame with columns: - date: Timestamp (first day of quarter) - year: int - quarter: int (1-4) - quarter_label: str (e.g. "Q1 2025") - ni_index: float (NI Index of Services, 2020=100) - uk_index: float (UK Index of Services, 2020=100) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: df = pd.read_excel(file_path, sheet_name="Table_1_1", skiprows=2, header=0) except Exception as e: raise NISRAValidationError(f"Failed to read IOS file: {e}") from e col0 = df.columns[0] df = df.rename(columns={col0: "quarter_label"}) # Strip whitespace from all column names df.columns = [str(c).strip() for c in df.columns] df = df.rename(columns={"NI Services": "ni_index", "UK Services": "uk_index"}) df = df[df["quarter_label"].notna()].copy() df["quarter_label"] = df["quarter_label"].astype(str).str.strip() quarter_pattern = re.compile(r"^Q([1-4])\s+(\d{4})$") mask = df["quarter_label"].str.match(quarter_pattern) df = df[mask].copy() if df.empty: raise NISRAValidationError("No valid quarter rows found in IOS Table_1_1") parsed = df["quarter_label"].str.extract(quarter_pattern) df["quarter"] = parsed[0].astype(int) df["year"] = parsed[1].astype(int) quarter_month = {1: 1, 2: 4, 3: 7, 4: 10} df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["quarter"].map(quarter_month).astype(str) + "-01") df["ni_index"] = pd.to_numeric(df["ni_index"], errors="coerce") df["uk_index"] = pd.to_numeric(df["uk_index"], errors="coerce") df = df[["date", "year", "quarter", "quarter_label", "ni_index", "uk_index"]] df = df.sort_values("date").reset_index(drop=True) logger.info( f"Parsed IOS: {len(df)} quarters, {df['year'].min()} Q{df['quarter'].iloc[0]}-{df['year'].max()} Q{df['quarter'].iloc[-1]}" ) return df
[docs] def get_latest_ios(force_refresh: bool = False) -> pd.DataFrame: """Get the latest NI Index of Services quarterly series. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: date, year, quarter, quarter_label, ni_index, uk_index (base year 2020=100) Raises: NISRADataNotFoundError: If latest publication cannot be found NISRAValidationError: If file structure is unexpected Example: >>> df = get_latest_ios() >>> 'ni_index' in df.columns True """ excel_url, year, quarter = get_latest_ios_publication_url() logger.info(f"Downloading IOS Q{quarter} {year} from: {excel_url}") file_path = download_file(excel_url, cache_ttl_hours=24 * 90, force_refresh=force_refresh) return parse_ios_file(file_path)
[docs] def validate_ios_data(df: pd.DataFrame) -> bool: """Validate IOS DataFrame for basic integrity. Args: df: DataFrame from get_latest_ios() Returns: True if validation passes Raises: NISRAValidationError: If validation fails """ required = {"date", "year", "quarter", "ni_index", "uk_index"} missing = required - set(df.columns) if missing: raise NISRAValidationError(f"Missing required columns: {missing}") if df.empty: raise NISRAValidationError("DataFrame is empty") if df["year"].min() > 2010: raise NISRAValidationError(f"Expected data from before 2010, got {df['year'].min()}") if (df["ni_index"] <= 0).any(): raise NISRAValidationError("Non-positive NI index values found") return True
[docs] def get_ios_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame: """Filter IOS data to a specific year. Args: df: DataFrame from get_latest_ios() year: Year to filter Returns: Filtered DataFrame (up to 4 quarters) Example: >>> df = get_latest_ios() >>> df_2024 = get_ios_by_year(df, 2024) >>> len(df_2024) <= 4 True """ return df[df["year"] == year].reset_index(drop=True)
[docs] def get_ios_by_quarter(df: pd.DataFrame, quarter: int, year: int) -> pd.DataFrame: """Filter IOS data to a specific quarter. Args: df: DataFrame from get_latest_ios() quarter: Quarter number (1-4) year: Year Returns: Filtered DataFrame (single row) Example: >>> df = get_latest_ios() >>> q1_2024 = get_ios_by_quarter(df, 1, 2024) >>> len(q1_2024) == 1 True """ return df[(df["quarter"] == quarter) & (df["year"] == year)].reset_index(drop=True)
[docs] def get_ios_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict: """Calculate summary statistics for a period of IOS data. Args: df: DataFrame from get_latest_ios() start_year: Optional start year filter (inclusive) end_year: Optional end year filter (inclusive) Returns: Dictionary with keys: period, ni_mean, ni_min, ni_max, uk_mean, uk_min, uk_max, quarters_count Example: >>> df = get_latest_ios() >>> stats = get_ios_summary_statistics(df, start_year=2020) >>> 'ni_mean' in stats True """ filtered = df.copy() if start_year is not None: filtered = filtered[filtered["year"] >= start_year] if end_year is not None: filtered = filtered[filtered["year"] <= end_year] return { "period": f"{filtered['year'].min()}-{filtered['year'].max()}", "ni_mean": float(filtered["ni_index"].mean()), "ni_min": float(filtered["ni_index"].min()), "ni_max": float(filtered["ni_index"].max()), "uk_mean": float(filtered["uk_index"].mean()), "uk_min": float(filtered["uk_index"].min()), "uk_max": float(filtered["uk_index"].max()), "quarters_count": len(filtered), }
[docs] def get_ios_growth(df: pd.DataFrame) -> pd.DataFrame: """Calculate quarter-on-quarter and year-on-year growth rates. Args: df: DataFrame from get_latest_ios() Returns: DataFrame with additional columns: - ni_qoq: NI quarter-on-quarter % change - ni_yoy: NI year-on-year % change - uk_qoq: UK quarter-on-quarter % change - uk_yoy: UK year-on-year % change Example: >>> df = get_latest_ios() >>> growth = get_ios_growth(df) >>> 'ni_yoy' in growth.columns True """ result = df.copy() result["ni_qoq"] = result["ni_index"].pct_change(1).mul(100).round(2) result["ni_yoy"] = result["ni_index"].pct_change(4).mul(100).round(2) result["uk_qoq"] = result["uk_index"].pct_change(1).mul(100).round(2) result["uk_yoy"] = result["uk_index"].pct_change(4).mul(100).round(2) return result