"""NISRA Index of Production (IOP) for Northern Ireland.
Provides quarterly index of manufacturing and production output, comparing
Northern Ireland against the UK average. Base year 2020=100.
Sectors covered (with approximate weights):
- Manufacturing (78.8%): food, drink, tobacco, textiles, chemicals, machinery, etc.
- Electricity, gas, steam (11.0%)
- Water supply and waste management (8.4%)
- Mining and quarrying (1.8%)
Data Source:
**Statistics page**: https://www.nisra.gov.uk/statistics/economic-output/index-production
The module scrapes this page to find the latest quarterly publication,
then downloads the Excel tables file.
Update Frequency: Quarterly (published ~3 months after reference quarter)
Geographic Coverage: Northern Ireland and UK comparison
Base Year: 2020=100
Example:
>>> from bolster.data_sources.nisra import index_of_production as iop
>>> df = iop.get_latest_iop()
>>> 'ni_index' in df.columns
True
"""
import logging
import re
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from bolster.utils.web import session
from ._base import NISRADataNotFoundError, NISRAValidationError, download_file
[docs]
logger = logging.getLogger(__name__)
[docs]
IOP_STATS_URL = "https://www.nisra.gov.uk/statistics/economic-output/index-production"
[docs]
IOP_BASE_URL = "https://www.nisra.gov.uk"
[docs]
def get_latest_iop_publication_url() -> tuple[str, int, int]:
"""Scrape the IOP statistics page to find the latest quarterly Excel file.
Returns:
Tuple of (excel_url, year, quarter)
Raises:
NISRADataNotFoundError: If publication cannot be found
"""
try:
response = session.get(IOP_STATS_URL, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch IOP statistics page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
# Find latest publication link — pattern: "Index of Production ... Q# YYYY"
# Pub list uses slugs like: /publications/index-production-iop-...-quarter-4-2025
quarter_word = {"1": 1, "2": 2, "3": 3, "4": 4, "one": 1, "two": 2, "three": 3, "four": 4}
quarter_slug_pat = re.compile(r"quarter[- _](\d+)[- _](\d{4})", re.IGNORECASE)
for link in soup.find_all("a", href=True):
href = link["href"]
if "index-production-iop" not in href or "publications" not in href:
continue
match = quarter_slug_pat.search(href)
if not match:
continue
q_str = match.group(1)
quarter = quarter_word.get(q_str.lower(), int(q_str))
year = int(match.group(2))
pub_url = href if href.startswith("http") else f"{IOP_BASE_URL}{href}"
try:
pub_resp = session.get(pub_url, timeout=30)
pub_resp.raise_for_status()
except Exception as e:
logger.warning(f"Failed to fetch publication page {pub_url}: {e}")
continue
pub_soup = BeautifulSoup(pub_resp.content, "html.parser")
for file_link in pub_soup.find_all("a", href=True):
file_href = file_link["href"]
file_text = file_link.get_text(strip=True).lower()
if "iop" in file_text and "tables" in file_text and file_href.endswith(".xlsx"):
excel_url = file_href if file_href.startswith("http") else f"{IOP_BASE_URL}{file_href}"
logger.info(f"Found IOP Q{quarter} {year}: {excel_url}")
return excel_url, year, quarter
raise NISRADataNotFoundError("Could not find latest IOP publication")
[docs]
def parse_iop_file(file_path: str | Path) -> pd.DataFrame:
"""Parse IOP Excel tables file into long-format DataFrame.
Reads Table_1 which contains the headline NI and UK index series.
Args:
file_path: Path to downloaded IOP tables Excel file
Returns:
DataFrame with columns:
- date: Timestamp (first day of quarter)
- year: int
- quarter: int (1-4)
- quarter_label: str (e.g. "Q1 2025")
- ni_index: float (NI Index of Production, 2020=100)
- uk_index: float (UK Index of Production, 2020=100)
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
df = pd.read_excel(file_path, sheet_name="Table_1", skiprows=2, header=0)
except Exception as e:
raise NISRAValidationError(f"Failed to read IOP file: {e}") from e
# First column is quarter label, remaining are NI and UK
col0 = df.columns[0]
df = df.rename(columns={col0: "quarter_label"})
# Strip trailing whitespace from column names
df.columns = [str(c).strip() for c in df.columns]
df = df.rename(columns={"NI": "ni_index", "UK": "uk_index"})
# Drop rows without a valid quarter label
df = df[df["quarter_label"].notna()].copy()
df["quarter_label"] = df["quarter_label"].astype(str).str.strip()
# Keep only rows matching "Q# YYYY" pattern
quarter_pattern = re.compile(r"^Q([1-4])\s+(\d{4})$")
mask = df["quarter_label"].str.match(quarter_pattern)
df = df[mask].copy()
if df.empty:
raise NISRAValidationError("No valid quarter rows found in IOP Table_1")
# Parse year and quarter
parsed = df["quarter_label"].str.extract(quarter_pattern)
df["quarter"] = parsed[0].astype(int)
df["year"] = parsed[1].astype(int)
# Quarter start dates: Q1=Jan, Q2=Apr, Q3=Jul, Q4=Oct
quarter_month = {1: 1, 2: 4, 3: 7, 4: 10}
df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["quarter"].map(quarter_month).astype(str) + "-01")
df["ni_index"] = pd.to_numeric(df["ni_index"], errors="coerce")
df["uk_index"] = pd.to_numeric(df["uk_index"], errors="coerce")
df = df[["date", "year", "quarter", "quarter_label", "ni_index", "uk_index"]]
df = df.sort_values("date").reset_index(drop=True)
logger.info(
f"Parsed IOP: {len(df)} quarters, {df['year'].min()} Q{df['quarter'].iloc[0]}-{df['year'].max()} Q{df['quarter'].iloc[-1]}"
)
return df
[docs]
def get_latest_iop(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest NI Index of Production quarterly series.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns: date, year, quarter, quarter_label,
ni_index, uk_index (base year 2020=100)
Raises:
NISRADataNotFoundError: If latest publication cannot be found
NISRAValidationError: If file structure is unexpected
Example:
>>> df = get_latest_iop()
>>> 'ni_index' in df.columns
True
"""
excel_url, year, quarter = get_latest_iop_publication_url()
logger.info(f"Downloading IOP Q{quarter} {year} from: {excel_url}")
file_path = download_file(excel_url, cache_ttl_hours=24 * 90, force_refresh=force_refresh)
return parse_iop_file(file_path)
[docs]
def validate_iop_data(df: pd.DataFrame) -> bool:
"""Validate IOP DataFrame for basic integrity.
Args:
df: DataFrame from get_latest_iop()
Returns:
True if validation passes
Raises:
NISRAValidationError: If validation fails
"""
required = {"date", "year", "quarter", "ni_index", "uk_index"}
missing = required - set(df.columns)
if missing:
raise NISRAValidationError(f"Missing required columns: {missing}")
if df.empty:
raise NISRAValidationError("DataFrame is empty")
if df["year"].min() > 2010:
raise NISRAValidationError(f"Expected data from before 2010, got {df['year'].min()}")
if (df["ni_index"] <= 0).any():
raise NISRAValidationError("Non-positive NI index values found")
return True
[docs]
def get_iop_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Filter IOP data to a specific year.
Args:
df: DataFrame from get_latest_iop()
year: Year to filter
Returns:
Filtered DataFrame (up to 4 quarters)
Example:
>>> df = get_latest_iop()
>>> df_2024 = get_iop_by_year(df, 2024)
>>> len(df_2024) <= 4
True
"""
return df[df["year"] == year].reset_index(drop=True)
[docs]
def get_iop_by_quarter(df: pd.DataFrame, quarter: int, year: int) -> pd.DataFrame:
"""Filter IOP data to a specific quarter.
Args:
df: DataFrame from get_latest_iop()
quarter: Quarter number (1-4)
year: Year
Returns:
Filtered DataFrame (single row)
Example:
>>> df = get_latest_iop()
>>> q1_2024 = get_iop_by_quarter(df, 1, 2024)
>>> len(q1_2024) == 1
True
"""
return df[(df["quarter"] == quarter) & (df["year"] == year)].reset_index(drop=True)
[docs]
def get_iop_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict:
"""Calculate summary statistics for a period of IOP data.
Args:
df: DataFrame from get_latest_iop()
start_year: Optional start year filter (inclusive)
end_year: Optional end year filter (inclusive)
Returns:
Dictionary with keys: period, ni_mean, ni_min, ni_max,
uk_mean, uk_min, uk_max, quarters_count
Example:
>>> df = get_latest_iop()
>>> stats = get_iop_summary_statistics(df, start_year=2020)
>>> 'ni_mean' in stats
True
"""
filtered = df.copy()
if start_year is not None:
filtered = filtered[filtered["year"] >= start_year]
if end_year is not None:
filtered = filtered[filtered["year"] <= end_year]
return {
"period": f"{filtered['year'].min()}-{filtered['year'].max()}",
"ni_mean": float(filtered["ni_index"].mean()),
"ni_min": float(filtered["ni_index"].min()),
"ni_max": float(filtered["ni_index"].max()),
"uk_mean": float(filtered["uk_index"].mean()),
"uk_min": float(filtered["uk_index"].min()),
"uk_max": float(filtered["uk_index"].max()),
"quarters_count": len(filtered),
}
[docs]
def get_iop_growth(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate quarter-on-quarter and year-on-year growth rates.
Args:
df: DataFrame from get_latest_iop()
Returns:
DataFrame with additional columns:
- ni_qoq: NI quarter-on-quarter % change
- ni_yoy: NI year-on-year % change
- uk_qoq: UK quarter-on-quarter % change
- uk_yoy: UK year-on-year % change
Example:
>>> df = get_latest_iop()
>>> growth = get_iop_growth(df)
>>> 'ni_yoy' in growth.columns
True
"""
result = df.copy()
result["ni_qoq"] = result["ni_index"].pct_change(1).mul(100).round(2)
result["ni_yoy"] = result["ni_index"].pct_change(4).mul(100).round(2)
result["uk_qoq"] = result["uk_index"].pct_change(1).mul(100).round(2)
result["uk_yoy"] = result["uk_index"].pct_change(4).mul(100).round(2)
return result