Source code for bolster.data_sources.nisra.index_of_production

"""NISRA Index of Production (IOP) for Northern Ireland.

Provides quarterly index of manufacturing and production output, comparing
Northern Ireland against the UK average. Base year 2020=100.

Sectors covered (with approximate weights):
- Manufacturing (78.8%): food, drink, tobacco, textiles, chemicals, machinery, etc.
- Electricity, gas, steam (11.0%)
- Water supply and waste management (8.4%)
- Mining and quarrying (1.8%)

Data Source:
    **Statistics page**: https://www.nisra.gov.uk/statistics/economic-output/index-production
    The module scrapes this page to find the latest quarterly publication,
    then downloads the Excel tables file.

Update Frequency: Quarterly (published ~3 months after reference quarter)
Geographic Coverage: Northern Ireland and UK comparison
Base Year: 2020=100

Example:
    >>> from bolster.data_sources.nisra import index_of_production as iop
    >>> df = iop.get_latest_iop()
    >>> 'ni_index' in df.columns
    True
"""

import logging
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, NISRAValidationError, download_file

[docs] logger = logging.getLogger(__name__)
[docs] IOP_STATS_URL = "https://www.nisra.gov.uk/statistics/economic-output/index-production"
[docs] IOP_BASE_URL = "https://www.nisra.gov.uk"
[docs] def get_latest_iop_publication_url() -> tuple[str, int, int]: """Scrape the IOP statistics page to find the latest quarterly Excel file. Returns: Tuple of (excel_url, year, quarter) Raises: NISRADataNotFoundError: If publication cannot be found """ try: response = session.get(IOP_STATS_URL, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch IOP statistics page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find latest publication link — pattern: "Index of Production ... Q# YYYY" # Pub list uses slugs like: /publications/index-production-iop-...-quarter-4-2025 quarter_word = {"1": 1, "2": 2, "3": 3, "4": 4, "one": 1, "two": 2, "three": 3, "four": 4} quarter_slug_pat = re.compile(r"quarter[- _](\d+)[- _](\d{4})", re.IGNORECASE) for link in soup.find_all("a", href=True): href = link["href"] if "index-production-iop" not in href or "publications" not in href: continue match = quarter_slug_pat.search(href) if not match: continue q_str = match.group(1) quarter = quarter_word.get(q_str.lower(), int(q_str)) year = int(match.group(2)) pub_url = href if href.startswith("http") else f"{IOP_BASE_URL}{href}" try: pub_resp = session.get(pub_url, timeout=30) pub_resp.raise_for_status() except Exception as e: logger.warning(f"Failed to fetch publication page {pub_url}: {e}") continue pub_soup = BeautifulSoup(pub_resp.content, "html.parser") for file_link in pub_soup.find_all("a", href=True): file_href = file_link["href"] file_text = file_link.get_text(strip=True).lower() if "iop" in file_text and "tables" in file_text and file_href.endswith(".xlsx"): excel_url = file_href if file_href.startswith("http") else f"{IOP_BASE_URL}{file_href}" logger.info(f"Found IOP Q{quarter} {year}: {excel_url}") return excel_url, year, quarter raise NISRADataNotFoundError("Could not find latest IOP publication")
[docs] def parse_iop_file(file_path: str | Path) -> pd.DataFrame: """Parse IOP Excel tables file into long-format DataFrame. Reads Table_1 which contains the headline NI and UK index series. Args: file_path: Path to downloaded IOP tables Excel file Returns: DataFrame with columns: - date: Timestamp (first day of quarter) - year: int - quarter: int (1-4) - quarter_label: str (e.g. "Q1 2025") - ni_index: float (NI Index of Production, 2020=100) - uk_index: float (UK Index of Production, 2020=100) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: df = pd.read_excel(file_path, sheet_name="Table_1", skiprows=2, header=0) except Exception as e: raise NISRAValidationError(f"Failed to read IOP file: {e}") from e # First column is quarter label, remaining are NI and UK col0 = df.columns[0] df = df.rename(columns={col0: "quarter_label"}) # Strip trailing whitespace from column names df.columns = [str(c).strip() for c in df.columns] df = df.rename(columns={"NI": "ni_index", "UK": "uk_index"}) # Drop rows without a valid quarter label df = df[df["quarter_label"].notna()].copy() df["quarter_label"] = df["quarter_label"].astype(str).str.strip() # Keep only rows matching "Q# YYYY" pattern quarter_pattern = re.compile(r"^Q([1-4])\s+(\d{4})$") mask = df["quarter_label"].str.match(quarter_pattern) df = df[mask].copy() if df.empty: raise NISRAValidationError("No valid quarter rows found in IOP Table_1") # Parse year and quarter parsed = df["quarter_label"].str.extract(quarter_pattern) df["quarter"] = parsed[0].astype(int) df["year"] = parsed[1].astype(int) # Quarter start dates: Q1=Jan, Q2=Apr, Q3=Jul, Q4=Oct quarter_month = {1: 1, 2: 4, 3: 7, 4: 10} df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["quarter"].map(quarter_month).astype(str) + "-01") df["ni_index"] = pd.to_numeric(df["ni_index"], errors="coerce") df["uk_index"] = pd.to_numeric(df["uk_index"], errors="coerce") df = df[["date", "year", "quarter", "quarter_label", "ni_index", "uk_index"]] df = df.sort_values("date").reset_index(drop=True) logger.info( f"Parsed IOP: {len(df)} quarters, {df['year'].min()} Q{df['quarter'].iloc[0]}-{df['year'].max()} Q{df['quarter'].iloc[-1]}" ) return df
[docs] def get_latest_iop(force_refresh: bool = False) -> pd.DataFrame: """Get the latest NI Index of Production quarterly series. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: date, year, quarter, quarter_label, ni_index, uk_index (base year 2020=100) Raises: NISRADataNotFoundError: If latest publication cannot be found NISRAValidationError: If file structure is unexpected Example: >>> df = get_latest_iop() >>> 'ni_index' in df.columns True """ excel_url, year, quarter = get_latest_iop_publication_url() logger.info(f"Downloading IOP Q{quarter} {year} from: {excel_url}") file_path = download_file(excel_url, cache_ttl_hours=24 * 90, force_refresh=force_refresh) return parse_iop_file(file_path)
[docs] def validate_iop_data(df: pd.DataFrame) -> bool: """Validate IOP DataFrame for basic integrity. Args: df: DataFrame from get_latest_iop() Returns: True if validation passes Raises: NISRAValidationError: If validation fails """ required = {"date", "year", "quarter", "ni_index", "uk_index"} missing = required - set(df.columns) if missing: raise NISRAValidationError(f"Missing required columns: {missing}") if df.empty: raise NISRAValidationError("DataFrame is empty") if df["year"].min() > 2010: raise NISRAValidationError(f"Expected data from before 2010, got {df['year'].min()}") if (df["ni_index"] <= 0).any(): raise NISRAValidationError("Non-positive NI index values found") return True
[docs] def get_iop_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame: """Filter IOP data to a specific year. Args: df: DataFrame from get_latest_iop() year: Year to filter Returns: Filtered DataFrame (up to 4 quarters) Example: >>> df = get_latest_iop() >>> df_2024 = get_iop_by_year(df, 2024) >>> len(df_2024) <= 4 True """ return df[df["year"] == year].reset_index(drop=True)
[docs] def get_iop_by_quarter(df: pd.DataFrame, quarter: int, year: int) -> pd.DataFrame: """Filter IOP data to a specific quarter. Args: df: DataFrame from get_latest_iop() quarter: Quarter number (1-4) year: Year Returns: Filtered DataFrame (single row) Example: >>> df = get_latest_iop() >>> q1_2024 = get_iop_by_quarter(df, 1, 2024) >>> len(q1_2024) == 1 True """ return df[(df["quarter"] == quarter) & (df["year"] == year)].reset_index(drop=True)
[docs] def get_iop_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict: """Calculate summary statistics for a period of IOP data. Args: df: DataFrame from get_latest_iop() start_year: Optional start year filter (inclusive) end_year: Optional end year filter (inclusive) Returns: Dictionary with keys: period, ni_mean, ni_min, ni_max, uk_mean, uk_min, uk_max, quarters_count Example: >>> df = get_latest_iop() >>> stats = get_iop_summary_statistics(df, start_year=2020) >>> 'ni_mean' in stats True """ filtered = df.copy() if start_year is not None: filtered = filtered[filtered["year"] >= start_year] if end_year is not None: filtered = filtered[filtered["year"] <= end_year] return { "period": f"{filtered['year'].min()}-{filtered['year'].max()}", "ni_mean": float(filtered["ni_index"].mean()), "ni_min": float(filtered["ni_index"].min()), "ni_max": float(filtered["ni_index"].max()), "uk_mean": float(filtered["uk_index"].mean()), "uk_min": float(filtered["uk_index"].min()), "uk_max": float(filtered["uk_index"].max()), "quarters_count": len(filtered), }
[docs] def get_iop_growth(df: pd.DataFrame) -> pd.DataFrame: """Calculate quarter-on-quarter and year-on-year growth rates. Args: df: DataFrame from get_latest_iop() Returns: DataFrame with additional columns: - ni_qoq: NI quarter-on-quarter % change - ni_yoy: NI year-on-year % change - uk_qoq: UK quarter-on-quarter % change - uk_yoy: UK year-on-year % change Example: >>> df = get_latest_iop() >>> growth = get_iop_growth(df) >>> 'ni_yoy' in growth.columns True """ result = df.copy() result["ni_qoq"] = result["ni_index"].pct_change(1).mul(100).round(2) result["ni_yoy"] = result["ni_index"].pct_change(4).mul(100).round(2) result["uk_qoq"] = result["uk_index"].pct_change(1).mul(100).round(2) result["uk_yoy"] = result["uk_index"].pct_change(4).mul(100).round(2) return result