Source code for bolster.data_sources.nisra.construction_output

"""NISRA Construction Output Statistics Module.

This module provides access to Northern Ireland's quarterly construction output statistics:
- All Work: Total construction output index
- New Work: New construction projects
- Repair and Maintenance: Repair and maintenance work

Data is published quarterly by NISRA's Economic & Labour Market Statistics Branch.

Data Source: Northern Ireland Statistics and Research Agency provides quarterly construction
output statistics through their Economic Output section at https://www.nisra.gov.uk/statistics/economic-output/construction-output-statistics.
The data tracks construction activity across all sectors using a chained volume measure
approach to provide comparable time series data for construction output analysis.

Update Frequency: Quarterly publications are released approximately 3 months after the end
of each quarter. Construction output statistics are published as part of NISRA's Economic
Output series, providing the official measure of construction sector performance in Northern
Ireland with data updated four times per year.

Data Coverage:
    - All Work: Q1 2000 - Present (quarterly, non-seasonally adjusted)
    - New Work: Q1 2000 - Present (quarterly, non-seasonally adjusted)
    - Repair and Maintenance: Q1 2000 - Present (quarterly, seasonally adjusted)
    - Base year: 2022 = 100 (chained volume measure)

Examples:
    >>> from bolster.data_sources.nisra import construction_output
    >>> df = construction_output.get_latest_construction_output()
    >>> 'all_work_index' in df.columns
    True
    >>> df_2024 = construction_output.get_construction_by_year(df, 2024)
    >>> len(df_2024) <= 4
    True
    >>> df_growth = construction_output.calculate_growth_rates(df)
    >>> 'all_work_yoy_growth' in df_growth.columns
    True

Publication Details:
    - Frequency: Quarterly
    - Published by: NISRA Economic & Labour Market Statistics Branch
    - Contact: economicstats@nisra.gov.uk
    - Next release: Approximately 3 months after quarter end
    - Base year: 2022 (index = 100)
"""

import logging
import re
from datetime import datetime
from pathlib import Path

import pandas as pd

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, download_file

[docs] logger = logging.getLogger(__name__)
# Base URL for NISRA construction output statistics
[docs] CONSTRUCTION_BASE_URL = "https://www.nisra.gov.uk/statistics/economic-output/construction-output-statistics"
[docs] def get_latest_construction_publication_url() -> tuple[str, datetime]: """Get the URL of the latest Construction Output publication. Scrapes the NISRA Construction Output page to find the most recent publication. Returns: Tuple of (excel_url, publication_date) Raises: NISRADataNotFoundError: If unable to find the latest publication Example: >>> url, pub_date = get_latest_construction_publication_url() >>> url.startswith('https://') True """ from bs4 import BeautifulSoup logger.info("Fetching latest Construction Output publication URL...") try: response = session.get(CONSTRUCTION_BASE_URL, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch Construction Output page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find the link to the latest publication # Pattern: "Construction Output Statistics - Q2 - 2025" or similar publication_links = soup.find_all("a", href=True) for link in publication_links: link_text = link.get_text(strip=True) if "Construction Output Statistics" in link_text and "Q" in link_text: pub_url = link["href"] if not pub_url.startswith("http"): pub_url = f"https://www.nisra.gov.uk{pub_url}" # Get the Excel file URL from the publication page try: pub_response = session.get(pub_url, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Find Excel file link (construction tables) for file_link in pub_soup.find_all("a", href=True): href = file_link["href"] if ".xlsx" in href.lower() and "construction" in href.lower() and "table" in href.lower(): excel_url = href if not excel_url.startswith("http"): excel_url = f"https://www.nisra.gov.uk{excel_url}" # Extract publication date pub_date = datetime.now() date_meta = pub_soup.find("meta", property="article:published_time") if date_meta and date_meta.get("content"): pub_date = datetime.fromisoformat(date_meta["content"].split("T")[0]) logger.info( f"Found latest Construction Output publication: {excel_url} (published {pub_date.date()})" ) return excel_url, pub_date raise NISRADataNotFoundError("Could not find latest Construction Output publication")
[docs] def parse_construction_file(file_path: str | Path) -> pd.DataFrame: """Parse NISRA Construction Output Excel file. Extracts the main construction output time series (Table 1.1) from the Excel file. Args: file_path: Path to the Construction Output Excel file Returns: DataFrame with columns: - date: datetime (first day of quarter) - quarter: str (e.g., 'Q1', 'Q2', 'Q3', 'Q4') - year: int - all_work_index: float (total construction output, NSA) - new_work_index: float (new construction work, NSA) - repair_maintenance_index: float (repair and maintenance, SA) Example: >>> url, _ = get_latest_construction_publication_url() >>> path = download_file(url, cache_ttl_hours=168) >>> df = parse_construction_file(path) >>> 'all_work_index' in df.columns True """ logger.info(f"Parsing Construction Output file: {file_path}") # Read Table 1.1: Main construction output indices # Skip first 4 rows to get to the header df = pd.read_excel(file_path, sheet_name="Table_1.1", skiprows=4) # Drop empty columns df = df.dropna(axis=1, how="all") # Rename columns for clarity df.columns = [ "time_period", "quarter_num", "new_work_index", "new_work_qoq_change", "repair_maintenance_index", "repair_maintenance_qoq_change", "all_work_index", "all_work_qoq_change", "extra", # Sometimes there's an extra column ] # Drop the extra column if it exists if "extra" in df.columns: df = df.drop(columns=["extra"]) # Remove first row with "Not applicable" values df = df[df["new_work_qoq_change"] != "Not applicable"].reset_index(drop=True) # Extract year and quarter from time period def parse_time_period(time_str): # Remove [R] and [P] markers time_str = re.sub(r"\s*\[[RP]\]", "", str(time_str)) # Extract year from "Jan to Mar 2000" format match = re.search(r"(\d{4})", time_str) if match: return int(match.group(1)) return None df["year"] = df["time_period"].apply(parse_time_period) # Map quarter number to quarter code quarter_map = {1: "Q1", 2: "Q2", 3: "Q3", 4: "Q4"} df["quarter"] = df["quarter_num"].map(quarter_map) # Create date column (first day of quarter) quarter_to_month = {"Q1": 1, "Q2": 4, "Q3": 7, "Q4": 10} df["month"] = df["quarter"].map(quarter_to_month) df["date"] = pd.to_datetime({"year": df["year"], "month": df["month"], "day": 1}) # Select and order columns result = df[ [ "date", "quarter", "year", "all_work_index", "new_work_index", "repair_maintenance_index", ] ].copy() # Remove any rows with missing data result = result.dropna().reset_index(drop=True) # Convert indices to float result["all_work_index"] = result["all_work_index"].astype(float) result["new_work_index"] = result["new_work_index"].astype(float) result["repair_maintenance_index"] = result["repair_maintenance_index"].astype(float) logger.info( f"Parsed {len(result)} quarters of Construction Output data ({result['year'].min()}-{result['year'].max()})" ) return result
[docs] def get_latest_construction_output(force_refresh: bool = False) -> pd.DataFrame: """Get the latest Construction Output data. Downloads and parses the most recent NISRA Construction Output publication. Results are cached for 7 days unless force_refresh=True. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with quarterly Construction Output data Example: >>> df = get_latest_construction_output() >>> 'all_work_index' in df.columns True """ excel_url, pub_date = get_latest_construction_publication_url() # Cache for 7 days (168 hours) file_path = download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh) return parse_construction_file(file_path)
# ============================================================================ # Helper Functions for Analysis # ============================================================================
[docs] def get_construction_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame: """Filter Construction Output data for a specific year. Args: df: Construction Output DataFrame year: Year to filter for Returns: DataFrame with only the specified year's data Example: >>> df = get_latest_construction_output() >>> df_2024 = get_construction_by_year(df, 2024) >>> len(df_2024) <= 4 True """ return df[df["year"] == year].reset_index(drop=True)
[docs] def get_construction_by_quarter(df: pd.DataFrame, quarter: str, year: int) -> pd.DataFrame: """Get Construction Output data for a specific quarter. Args: df: Construction Output DataFrame quarter: Quarter code (e.g., 'Q1', 'Q2', 'Q3', 'Q4') year: Year Returns: DataFrame with single row for the specified quarter Example: >>> df = get_latest_construction_output() >>> q2_2025 = get_construction_by_quarter(df, 'Q2', 2025) >>> len(q2_2025) <= 1 True """ return df[(df["quarter"] == quarter) & (df["year"] == year)].reset_index(drop=True)
[docs] def calculate_growth_rates(df: pd.DataFrame, periods: int = 4) -> pd.DataFrame: """Calculate year-on-year growth rates for Construction Output indices. Args: df: Construction Output DataFrame periods: Number of quarters for comparison (default: 4 for YoY) Returns: DataFrame with additional columns: - all_work_yoy_growth: All Work percentage change vs same quarter previous year - new_work_yoy_growth: New Work percentage change vs same quarter previous year - repair_maintenance_yoy_growth: R&M percentage change vs same quarter previous year Example: >>> df = get_latest_construction_output() >>> df_growth = calculate_growth_rates(df) >>> 'all_work_yoy_growth' in df_growth.columns True """ result = df.copy() # Calculate year-on-year growth rates result["all_work_yoy_growth"] = result["all_work_index"].pct_change(periods=periods) * 100 result["new_work_yoy_growth"] = result["new_work_index"].pct_change(periods=periods) * 100 result["repair_maintenance_yoy_growth"] = result["repair_maintenance_index"].pct_change(periods=periods) * 100 return result
[docs] def get_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict: """Calculate summary statistics for Construction Output. Args: df: Construction Output DataFrame start_year: Optional start year for summary end_year: Optional end year for summary Returns: Dictionary with summary statistics: - period: Time period covered - all_work_mean: Mean All Work index value - all_work_min: Minimum All Work index value - all_work_max: Maximum All Work index value - new_work_mean: Mean New Work index value - repair_maintenance_mean: Mean Repair & Maintenance index value - quarters_count: Number of quarters included Example: >>> df = get_latest_construction_output() >>> stats = get_summary_statistics(df, start_year=2020) >>> 'all_work_mean' in stats True """ filtered = df.copy() if start_year: filtered = filtered[filtered["year"] >= start_year] if end_year: filtered = filtered[filtered["year"] <= end_year] return { "period": f"{filtered['year'].min()}-{filtered['year'].max()}", "all_work_mean": float(filtered["all_work_index"].mean()), "all_work_min": float(filtered["all_work_index"].min()), "all_work_max": float(filtered["all_work_index"].max()), "new_work_mean": float(filtered["new_work_index"].mean()), "new_work_min": float(filtered["new_work_index"].min()), "new_work_max": float(filtered["new_work_index"].max()), "repair_maintenance_mean": float(filtered["repair_maintenance_index"].mean()), "repair_maintenance_min": float(filtered["repair_maintenance_index"].min()), "repair_maintenance_max": float(filtered["repair_maintenance_index"].max()), "quarters_count": len(filtered), }
[docs] def validate_construction_data(df: pd.DataFrame) -> bool: # pragma: no cover """Validate construction output data integrity. Args: df: DataFrame from construction output functions Returns: True if validation passes, False otherwise """ if df.empty: logger.warning("Construction data is empty") return False # Check for time series structure time_cols = ["quarter", "year", "date", "period"] has_time_data = any(col in df.columns for col in time_cols) if not has_time_data: logger.warning("No time series columns found in construction data") return False return True