Source code for bolster.data_sources.nisra.tourism.visitor_statistics

"""NISRA Quarterly Tourism Visitor Statistics Data Source.

Provides access to quarterly visitor statistics for Northern Ireland covering
overnight trips, nights spent, and expenditure across geographic markets.

Data includes:
- Overnight trips by visitor origin (GB, Europe, N.America, ROI, NI residents)
- Nights spent (bednights) by market
- Visitor expenditure by market (£ millions)
- Reason for visit breakdowns (holiday, business, visiting friends/relatives)
- Historical trends from 2015 onwards

Data is compiled from multiple sources:
- Northern Ireland Passenger Survey (NIPS)
- Central Statistics Office Inbound Tourism data
- Continuous Household Survey
- Household Travel Survey

Data Source:
    **Mother Page**: https://www.nisra.gov.uk/publications/quarterly-tourism-statistics-publications

    This page lists quarterly tourism statistics publications. The module
    automatically scrapes to find the latest quarterly report Excel file.

Update Frequency: Quarterly (published ~6 weeks after quarter end)
Geographic Coverage: Northern Ireland (by visitor origin market)
Reference Period: Rolling 12-month and year-to-date

Example:
    >>> from bolster.data_sources.nisra.tourism import visitor_statistics
    >>> df = visitor_statistics.get_latest_visitor_statistics()
    >>> 'market' in df.columns
    True
    >>> 'expenditure' in df.columns
    True
"""

import logging
import re

import pandas as pd

from bolster.data_sources.nisra._base import NISRADataNotFoundError, NISRAValidationError, download_file
from bolster.utils.web import session

[docs] logger = logging.getLogger(__name__)
# Base URL for quarterly tourism statistics
[docs] TOURISM_PUBLICATIONS_URL = "https://www.nisra.gov.uk/publications/quarterly-tourism-statistics-publications"
# Market names as they appear in the data
[docs] MARKET_NAMES = [ "Great Britain", "Other Europe", "North America", "Other Overseas", "Republic of Ireland", "NI Residents", "Total", ]
[docs] def get_latest_visitor_statistics_publication_url() -> tuple[str, str]: """Scrape NISRA tourism publications page to find the latest quarterly file. The publications page directly lists Excel files for each quarter in the format: "NI Tourism Q3 2025" linking to .xlsx files. Returns: Tuple of (excel_file_url, publication_period) e.g. ("https://...", "Q3 2025") Raises: NISRADataNotFoundError: If publication or file not found """ from bs4 import BeautifulSoup try: # Use shared session with retry logic for resilient requests response = session.get(TOURISM_PUBLICATIONS_URL, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch tourism publications page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find latest quarterly tourism Excel file # Pattern: "NI Tourism Q3 2025" with href ending in .xlsx excel_files = [] for link in soup.find_all("a", href=True): link_text = link.get_text(strip=True) href = link["href"] # Match Excel files with tourism in text if "tourism" in link_text.lower() and (href.endswith(".xls") or href.endswith(".xlsx")): if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" # Extract quarter/year from link text # Pattern: "NI Tourism Q3 2025" or "NI Tourism 2025 Quarter 3" q_match = re.search(r"Q(\d)\s+(\d{4})", link_text, re.IGNORECASE) if q_match: quarter = int(q_match.group(1)) year = int(q_match.group(2)) pub_period = f"Q{quarter} {year}" else: q_match = re.search(r"(\d{4})\s+Quarter\s+(\d)", link_text, re.IGNORECASE) if q_match: year = int(q_match.group(1)) quarter = int(q_match.group(2)) pub_period = f"Q{quarter} {year}" else: continue # Skip if we can't extract period excel_files.append((href, year, quarter, pub_period)) logger.info(f"Found tourism file: {link_text} -> {pub_period}") if not excel_files: raise NISRADataNotFoundError("Could not find quarterly tourism Excel files on publications page") # Sort by year and quarter descending, take the latest excel_files.sort(key=lambda x: (x[1], x[2]), reverse=True) latest_url, _, _, latest_period = excel_files[0] logger.info(f"Selected latest: {latest_period} from {latest_url}") return latest_url, latest_period
def _parse_visitor_statistics_file( file_path: str, ) -> pd.DataFrame: """Parse visitor statistics Excel file (Table 10 - comprehensive market data). Table 10 contains trips, nights, and expenditure by market for rolling 12-month periods. Structure: - Rows 0-14: Metadata and notes - Row 15: Headers with column descriptions - Rows 16-22: Data by market (GB, Other Europe, North America, etc.) The latest year's data is in columns 2, 4, 6 (trips, nights, spend). Previous year's data is in columns 1, 3, 5. Args: file_path: Path to downloaded Excel file Returns: DataFrame with columns: market, trips, nights, expenditure, period, year, quarter Raises: NISRAValidationError: If parsing fails or data is invalid """ try: # Read entire sheet to find header row df_raw = pd.read_excel( file_path, sheet_name="Table 10", header=None, ) except Exception as e: raise NISRAValidationError(f"Failed to read Table 10 from Excel file: {e}") from e # Find header row (contains "Variable" or "Overnight Trips") header_row_idx = None for idx in range(len(df_raw)): row_text = " ".join(str(v) for v in df_raw.iloc[idx].values if pd.notna(v)) if "Variable" in row_text or "Overnight Trips" in row_text: header_row_idx = idx break if header_row_idx is None: raise NISRAValidationError("Could not identify header row in Table 10") # Parse the header to understand column structure # Typical format: col 1/2 = trips (prev/curr), col 3/4 = nights, col 5/6 = spend # We want the latest year's data (higher column indices) header_row = df_raw.iloc[header_row_idx] headers = [str(v).strip() if pd.notna(v) else "" for v in header_row.values] # Find column indices for latest year data by finding highest year in headers trips_col = None nights_col = None spend_col = None year = None # First pass: find all years mentioned in headers all_years = set() for h in headers: # Match both 4-digit (2025) and 2-digit (25) years for match in re.findall(r"\b(20\d{2})\b", h): all_years.add(int(match)) for match in re.findall(r"\b(\d{2})\b", h): if 20 <= int(match) <= 30: # Reasonable range for 2-digit years all_years.add(2000 + int(match)) latest_year = max(all_years) if all_years else None year = latest_year # Find columns for latest year (match either full year or 2-digit suffix) latest_year_patterns = [] if latest_year: latest_year_patterns = [str(latest_year), str(latest_year)[-2:]] for i, h in enumerate(headers): h_lower = h.lower() is_latest = any(p in h for p in latest_year_patterns) if "overnight trips" in h_lower and is_latest: trips_col = i elif "overnights" in h_lower and is_latest: nights_col = i elif "spend" in h_lower and is_latest: spend_col = i # Fallback: use fixed columns if header parsing fails # Column 2 = trips latest, 4 = nights latest, 6 = spend latest if trips_col is None: trips_col = 2 if nights_col is None: nights_col = 4 if spend_col is None: spend_col = 6 if len(headers) > 6 else 5 # Market name mappings (handling variations in source data) market_mappings = { "great britain": "Great Britain", "gb": "Great Britain", "other europe": "Other Europe", "north america": "North America", "other overseas": "Other Overseas", "roi": "Republic of Ireland", "republic of ireland": "Republic of Ireland", "ni": "NI Residents", "ni residents": "NI Residents", "total": "Total", } records = [] # Parse data rows (after header) for idx in range(header_row_idx + 1, len(df_raw)): row = df_raw.iloc[idx] first_cell = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else "" # Skip empty rows if not first_cell: continue # Match market name market_key = first_cell.lower().strip() market = market_mappings.get(market_key) if market is None: # Try partial matching for key, value in market_mappings.items(): if key in market_key: market = value break if market is None: continue # Extract values from identified columns try: trips = float(row.iloc[trips_col]) if pd.notna(row.iloc[trips_col]) else 0 nights = float(row.iloc[nights_col]) if pd.notna(row.iloc[nights_col]) else 0 spend = float(row.iloc[spend_col]) if pd.notna(row.iloc[spend_col]) else 0 except (ValueError, TypeError, IndexError): continue records.append( { "market": market, "trips": trips * 1000, # Convert from thousands "nights": nights * 1000, # Convert from thousands "expenditure": spend, # Already in £ millions "period": "12-month rolling", "year": year, "quarter": None, } ) if not records: raise NISRAValidationError("No visitor statistics data found in Table 10") return pd.DataFrame(records)
[docs] def get_latest_visitor_statistics( force_refresh: bool = False, ) -> pd.DataFrame: """Get the latest quarterly visitor statistics by market. Retrieves comprehensive visitor statistics including trips, nights spent, and expenditure broken down by visitor origin market. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - market: Visitor origin (Great Britain, Other Europe, etc.) - trips: Number of overnight trips - nights: Number of nights spent - expenditure: Visitor spending (£ millions) - period: Measurement period (12-month rolling, year-to-date) - year: Reference year - quarter: Reference quarter Raises: NISRADataNotFoundError: If publication not found NISRAValidationError: If data parsing fails Example: >>> df = get_latest_visitor_statistics() >>> 'market' in df.columns True """ excel_url, pub_period = get_latest_visitor_statistics_publication_url() logger.info(f"Downloading visitor statistics for {pub_period}: {excel_url}") cache_ttl = 0 if force_refresh else 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl) df = _parse_visitor_statistics_file(file_path) # Validate data if not validate_visitor_statistics(df): raise NISRAValidationError("Visitor statistics validation failed") return df
[docs] def validate_visitor_statistics(df: pd.DataFrame) -> bool: """Validate visitor statistics data integrity. Args: df: DataFrame to validate Returns: True if valid, False otherwise """ if df.empty: logger.warning("Visitor statistics DataFrame is empty") return False required_cols = {"market", "trips", "nights", "expenditure"} if not required_cols.issubset(df.columns): logger.warning(f"Missing required columns: {required_cols - set(df.columns)}") return False # Check for reasonable value ranges if (df["trips"] < 0).any(): logger.warning("Negative trip values found") return False if (df["expenditure"] < 0).any(): logger.warning("Negative expenditure values found") return False # Check we have multiple markets if df["market"].nunique() < 3: logger.warning(f"Too few markets: {df['market'].nunique()}") return False return True
[docs] def get_visitor_statistics_by_market(df: pd.DataFrame, market: str) -> pd.Series | None: """Get visitor statistics for a specific market. Args: df: Visitor statistics DataFrame market: Market name (e.g., "Great Britain", "Republic of Ireland") Returns: Series with statistics for the market, or None if not found Example: >>> df = get_latest_visitor_statistics() >>> gb = get_visitor_statistics_by_market(df, "Great Britain") >>> gb is not None True """ matches = df[df["market"].str.lower() == market.lower()] if matches.empty: return None return matches.iloc[0]
[docs] def get_total_visitor_statistics(df: pd.DataFrame) -> pd.Series | None: """Get total visitor statistics across all markets. Args: df: Visitor statistics DataFrame Returns: Series with total statistics, or None if not found """ return get_visitor_statistics_by_market(df, "Total")
[docs] def get_domestic_vs_external(df: pd.DataFrame) -> pd.DataFrame: """Compare domestic (NI residents) vs external visitor statistics. Args: df: Visitor statistics DataFrame Returns: DataFrame with domestic and external totals and percentages Example: >>> df = get_latest_visitor_statistics() >>> comparison = get_domestic_vs_external(df) >>> 'category' in comparison.columns True """ domestic = df[df["market"] == "NI Residents"] external = df[~df["market"].isin(["NI Residents", "Total"])] if domestic.empty: return pd.DataFrame() domestic_stats = domestic.iloc[0] external_total = external.agg({"trips": "sum", "nights": "sum", "expenditure": "sum"}) total_trips = domestic_stats["trips"] + external_total["trips"] total_expenditure = domestic_stats["expenditure"] + external_total["expenditure"] return pd.DataFrame( [ { "category": "Domestic (NI)", "trips": domestic_stats["trips"], "nights": domestic_stats["nights"], "expenditure": domestic_stats["expenditure"], "trips_pct": domestic_stats["trips"] / total_trips * 100 if total_trips > 0 else 0, "expenditure_pct": domestic_stats["expenditure"] / total_expenditure * 100 if total_expenditure > 0 else 0, }, { "category": "External", "trips": external_total["trips"], "nights": external_total["nights"], "expenditure": external_total["expenditure"], "trips_pct": external_total["trips"] / total_trips * 100 if total_trips > 0 else 0, "expenditure_pct": external_total["expenditure"] / total_expenditure * 100 if total_expenditure > 0 else 0, }, ] )
[docs] def get_expenditure_per_trip(df: pd.DataFrame) -> pd.DataFrame: """Calculate average expenditure per trip by market. Args: df: Visitor statistics DataFrame Returns: DataFrame with market and expenditure_per_trip columns Example: >>> df = get_latest_visitor_statistics() >>> spend = get_expenditure_per_trip(df) >>> 'expenditure_per_trip' in spend.columns True """ result = df[df["market"] != "Total"].copy() # Expenditure is in millions, trips are individual result["expenditure_per_trip"] = (result["expenditure"] * 1_000_000 / result["trips"]).round(2) return result[["market", "trips", "expenditure", "expenditure_per_trip"]]
[docs] def get_nights_per_trip(df: pd.DataFrame) -> pd.DataFrame: """Calculate average nights per trip by market. Args: df: Visitor statistics DataFrame Returns: DataFrame with market and nights_per_trip columns Example: >>> df = get_latest_visitor_statistics() >>> duration = get_nights_per_trip(df) >>> 'nights_per_trip' in duration.columns True """ result = df[df["market"] != "Total"].copy() result["nights_per_trip"] = (result["nights"] / result["trips"]).round(2) return result[["market", "trips", "nights", "nights_per_trip"]]
[docs] def get_market_summary(df: pd.DataFrame) -> pd.DataFrame: """Get summary of all markets with derived metrics. Args: df: Visitor statistics DataFrame Returns: DataFrame with market summary including percentages and per-trip metrics Example: >>> df = get_latest_visitor_statistics() >>> summary = get_market_summary(df) >>> 'market' in summary.columns True """ total = get_total_visitor_statistics(df) if total is None: return df result = df[df["market"] != "Total"].copy() # Add percentage columns result["trips_pct"] = (result["trips"] / total["trips"] * 100).round(1) result["nights_pct"] = (result["nights"] / total["nights"] * 100).round(1) result["expenditure_pct"] = (result["expenditure"] / total["expenditure"] * 100).round(1) # Add per-trip metrics result["nights_per_trip"] = (result["nights"] / result["trips"]).round(2) result["expenditure_per_trip"] = (result["expenditure"] * 1_000_000 / result["trips"]).round(2) return result