Source code for bolster.data_sources.nisra.construction_output

"""NISRA Construction Output Statistics Module.

This module provides access to Northern Ireland's quarterly construction output statistics:
- All Work: Total construction output index
- New Work: New construction projects
- Repair and Maintenance: Repair and maintenance work

Data is published quarterly by NISRA's Economic & Labour Market Statistics Branch.

Data Source: Northern Ireland Statistics and Research Agency provides quarterly construction
output statistics through their Economic Output section at https://www.nisra.gov.uk/statistics/economic-output/construction-output-statistics.
The data tracks construction activity across all sectors using a chained volume measure
approach to provide comparable time series data for construction output analysis.

Update Frequency: Quarterly publications are released approximately 3 months after the end
of each quarter. Construction output statistics are published as part of NISRA's Economic
Output series, providing the official measure of construction sector performance in Northern
Ireland with data updated four times per year.

Data Coverage:
    - All Work: Q1 2000 - Present (quarterly, non-seasonally adjusted)
    - New Work: Q1 2000 - Present (quarterly, non-seasonally adjusted)
    - Repair and Maintenance: Q1 2000 - Present (quarterly, seasonally adjusted)
    - Base year: 2022 = 100 (chained volume measure)

Examples:
    >>> from bolster.data_sources.nisra import construction_output
    >>> df = construction_output.get_latest_construction_output()
    >>> 'all_work_index' in df.columns
    True
    >>> df_2024 = construction_output.get_construction_by_year(df, 2024)
    >>> len(df_2024) <= 4
    True
    >>> df_growth = construction_output.calculate_growth_rates(df)
    >>> 'all_work_yoy_growth' in df_growth.columns
    True

Publication Details:
    - Frequency: Quarterly
    - Published by: NISRA Economic & Labour Market Statistics Branch
    - Contact: economicstats@nisra.gov.uk
    - Next release: Approximately 3 months after quarter end
    - Base year: 2022 (index = 100)
"""

import logging
import re
from datetime import datetime
from pathlib import Path

import pandas as pd

from bolster.utils.web import session

from ._base import NISRADataNotFoundError, download_file


[docs]
logger = logging.getLogger(__name__)


# Base URL for NISRA construction output statistics

[docs]
CONSTRUCTION_BASE_URL = "https://www.nisra.gov.uk/statistics/economic-output/construction-output-statistics"




[docs]
def get_latest_construction_publication_url() -> tuple[str, datetime]:
    """Get the URL of the latest Construction Output publication.

    Scrapes the NISRA Construction Output page to find the most recent publication.

    Returns:
        Tuple of (excel_url, publication_date)

    Raises:
        NISRADataNotFoundError: If unable to find the latest publication

    Example:
        >>> url, pub_date = get_latest_construction_publication_url()
        >>> url.startswith('https://')
        True
    """
    from bs4 import BeautifulSoup

    logger.info("Fetching latest Construction Output publication URL...")

    try:
        response = session.get(CONSTRUCTION_BASE_URL, timeout=30)
        response.raise_for_status()
    except Exception as e:
        raise NISRADataNotFoundError(f"Failed to fetch Construction Output page: {e}") from e

    soup = BeautifulSoup(response.content, "html.parser")

    # Find the link to the latest publication
    # Pattern: "Construction Output Statistics - Q2 - 2025" or similar
    publication_links = soup.find_all("a", href=True)

    for link in publication_links:
        link_text = link.get_text(strip=True)
        if "Construction Output Statistics" in link_text and "Q" in link_text:
            pub_url = link["href"]
            if not pub_url.startswith("http"):
                pub_url = f"https://www.nisra.gov.uk{pub_url}"

            # Get the Excel file URL from the publication page
            try:
                pub_response = session.get(pub_url, timeout=30)
                pub_response.raise_for_status()
            except Exception as e:
                raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e

            pub_soup = BeautifulSoup(pub_response.content, "html.parser")

            # Find Excel file link (construction tables)
            for file_link in pub_soup.find_all("a", href=True):
                href = file_link["href"]
                if ".xlsx" in href.lower() and "construction" in href.lower() and "table" in href.lower():
                    excel_url = href
                    if not excel_url.startswith("http"):
                        excel_url = f"https://www.nisra.gov.uk{excel_url}"

                    # Extract publication date
                    pub_date = datetime.now()
                    date_meta = pub_soup.find("meta", property="article:published_time")
                    if date_meta and date_meta.get("content"):
                        pub_date = datetime.fromisoformat(date_meta["content"].split("T")[0])

                    logger.info(
                        f"Found latest Construction Output publication: {excel_url} (published {pub_date.date()})"
                    )
                    return excel_url, pub_date

    raise NISRADataNotFoundError("Could not find latest Construction Output publication")




[docs]
def parse_construction_file(file_path: str | Path) -> pd.DataFrame:
    """Parse NISRA Construction Output Excel file.

    Extracts the main construction output time series (Table 1.1) from the Excel file.

    Args:
        file_path: Path to the Construction Output Excel file

    Returns:
        DataFrame with columns:
            - date: datetime (first day of quarter)
            - quarter: str (e.g., 'Q1', 'Q2', 'Q3', 'Q4')
            - year: int
            - all_work_index: float (total construction output, NSA)
            - new_work_index: float (new construction work, NSA)
            - repair_maintenance_index: float (repair and maintenance, SA)

    Example:
        >>> url, _ = get_latest_construction_publication_url()
        >>> path = download_file(url, cache_ttl_hours=168)
        >>> df = parse_construction_file(path)
        >>> 'all_work_index' in df.columns
        True
    """
    logger.info(f"Parsing Construction Output file: {file_path}")

    # Read Table 1.1: Main construction output indices
    # Skip first 4 rows to get to the header
    df = pd.read_excel(file_path, sheet_name="Table_1.1", skiprows=4)

    # Drop empty columns
    df = df.dropna(axis=1, how="all")

    # Rename columns for clarity
    df.columns = [
        "time_period",
        "quarter_num",
        "new_work_index",
        "new_work_qoq_change",
        "repair_maintenance_index",
        "repair_maintenance_qoq_change",
        "all_work_index",
        "all_work_qoq_change",
        "extra",  # Sometimes there's an extra column
    ]

    # Drop the extra column if it exists
    if "extra" in df.columns:
        df = df.drop(columns=["extra"])

    # Remove first row with "Not applicable" values
    df = df[df["new_work_qoq_change"] != "Not applicable"].reset_index(drop=True)

    # Extract year and quarter from time period
    def parse_time_period(time_str):
        # Remove [R] and [P] markers
        time_str = re.sub(r"\s*\[[RP]\]", "", str(time_str))

        # Extract year from "Jan to Mar 2000" format
        match = re.search(r"(\d{4})", time_str)
        if match:
            return int(match.group(1))
        return None

    df["year"] = df["time_period"].apply(parse_time_period)

    # Map quarter number to quarter code
    quarter_map = {1: "Q1", 2: "Q2", 3: "Q3", 4: "Q4"}
    df["quarter"] = df["quarter_num"].map(quarter_map)

    # Create date column (first day of quarter)
    quarter_to_month = {"Q1": 1, "Q2": 4, "Q3": 7, "Q4": 10}
    df["month"] = df["quarter"].map(quarter_to_month)
    df["date"] = pd.to_datetime({"year": df["year"], "month": df["month"], "day": 1})

    # Select and order columns
    result = df[
        [
            "date",
            "quarter",
            "year",
            "all_work_index",
            "new_work_index",
            "repair_maintenance_index",
        ]
    ].copy()

    # Remove any rows with missing data
    result = result.dropna().reset_index(drop=True)

    # Convert indices to float
    result["all_work_index"] = result["all_work_index"].astype(float)
    result["new_work_index"] = result["new_work_index"].astype(float)
    result["repair_maintenance_index"] = result["repair_maintenance_index"].astype(float)

    logger.info(
        f"Parsed {len(result)} quarters of Construction Output data ({result['year'].min()}-{result['year'].max()})"
    )

    return result




[docs]
def get_latest_construction_output(force_refresh: bool = False) -> pd.DataFrame:
    """Get the latest Construction Output data.

    Downloads and parses the most recent NISRA Construction Output publication.
    Results are cached for 7 days unless force_refresh=True.

    Args:
        force_refresh: If True, bypass cache and download fresh data

    Returns:
        DataFrame with quarterly Construction Output data

    Example:
        >>> df = get_latest_construction_output()
        >>> 'all_work_index' in df.columns
        True
    """
    excel_url, pub_date = get_latest_construction_publication_url()

    # Cache for 7 days (168 hours)
    file_path = download_file(excel_url, cache_ttl_hours=168, force_refresh=force_refresh)

    return parse_construction_file(file_path)



# ============================================================================
# Helper Functions for Analysis
# ============================================================================



[docs]
def get_construction_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Filter Construction Output data for a specific year.

    Args:
        df: Construction Output DataFrame
        year: Year to filter for

    Returns:
        DataFrame with only the specified year's data

    Example:
        >>> df = get_latest_construction_output()
        >>> df_2024 = get_construction_by_year(df, 2024)
        >>> len(df_2024) <= 4
        True
    """
    return df[df["year"] == year].reset_index(drop=True)




[docs]
def get_construction_by_quarter(df: pd.DataFrame, quarter: str, year: int) -> pd.DataFrame:
    """Get Construction Output data for a specific quarter.

    Args:
        df: Construction Output DataFrame
        quarter: Quarter code (e.g., 'Q1', 'Q2', 'Q3', 'Q4')
        year: Year

    Returns:
        DataFrame with single row for the specified quarter

    Example:
        >>> df = get_latest_construction_output()
        >>> q2_2025 = get_construction_by_quarter(df, 'Q2', 2025)
        >>> len(q2_2025) <= 1
        True
    """
    return df[(df["quarter"] == quarter) & (df["year"] == year)].reset_index(drop=True)




[docs]
def calculate_growth_rates(df: pd.DataFrame, periods: int = 4) -> pd.DataFrame:
    """Calculate year-on-year growth rates for Construction Output indices.

    Args:
        df: Construction Output DataFrame
        periods: Number of quarters for comparison (default: 4 for YoY)

    Returns:
        DataFrame with additional columns:
            - all_work_yoy_growth: All Work percentage change vs same quarter previous year
            - new_work_yoy_growth: New Work percentage change vs same quarter previous year
            - repair_maintenance_yoy_growth: R&M percentage change vs same quarter previous year

    Example:
        >>> df = get_latest_construction_output()
        >>> df_growth = calculate_growth_rates(df)
        >>> 'all_work_yoy_growth' in df_growth.columns
        True
    """
    result = df.copy()

    # Calculate year-on-year growth rates
    result["all_work_yoy_growth"] = result["all_work_index"].pct_change(periods=periods) * 100
    result["new_work_yoy_growth"] = result["new_work_index"].pct_change(periods=periods) * 100
    result["repair_maintenance_yoy_growth"] = result["repair_maintenance_index"].pct_change(periods=periods) * 100

    return result




[docs]
def get_summary_statistics(df: pd.DataFrame, start_year: int | None = None, end_year: int | None = None) -> dict:
    """Calculate summary statistics for Construction Output.

    Args:
        df: Construction Output DataFrame
        start_year: Optional start year for summary
        end_year: Optional end year for summary

    Returns:
        Dictionary with summary statistics:
            - period: Time period covered
            - all_work_mean: Mean All Work index value
            - all_work_min: Minimum All Work index value
            - all_work_max: Maximum All Work index value
            - new_work_mean: Mean New Work index value
            - repair_maintenance_mean: Mean Repair & Maintenance index value
            - quarters_count: Number of quarters included

    Example:
        >>> df = get_latest_construction_output()
        >>> stats = get_summary_statistics(df, start_year=2020)
        >>> 'all_work_mean' in stats
        True
    """
    filtered = df.copy()

    if start_year:
        filtered = filtered[filtered["year"] >= start_year]
    if end_year:
        filtered = filtered[filtered["year"] <= end_year]

    return {
        "period": f"{filtered['year'].min()}-{filtered['year'].max()}",
        "all_work_mean": float(filtered["all_work_index"].mean()),
        "all_work_min": float(filtered["all_work_index"].min()),
        "all_work_max": float(filtered["all_work_index"].max()),
        "new_work_mean": float(filtered["new_work_index"].mean()),
        "new_work_min": float(filtered["new_work_index"].min()),
        "new_work_max": float(filtered["new_work_index"].max()),
        "repair_maintenance_mean": float(filtered["repair_maintenance_index"].mean()),
        "repair_maintenance_min": float(filtered["repair_maintenance_index"].min()),
        "repair_maintenance_max": float(filtered["repair_maintenance_index"].max()),
        "quarters_count": len(filtered),
    }




[docs]
def validate_construction_data(df: pd.DataFrame) -> bool:  # pragma: no cover
    """Validate construction output data integrity.

    Args:
        df: DataFrame from construction output functions

    Returns:
        True if validation passes, False otherwise
    """
    if df.empty:
        logger.warning("Construction data is empty")
        return False

    # Check for time series structure
    time_cols = ["quarter", "year", "date", "period"]
    has_time_data = any(col in df.columns for col in time_cols)
    if not has_time_data:
        logger.warning("No time series columns found in construction data")
        return False

    return True