"""NISRA Quarterly Tourism Visitor Statistics Data Source.
Provides access to quarterly visitor statistics for Northern Ireland covering
overnight trips, nights spent, and expenditure across geographic markets.
Data includes:
- Overnight trips by visitor origin (GB, Europe, N.America, ROI, NI residents)
- Nights spent (bednights) by market
- Visitor expenditure by market (£ millions)
- Reason for visit breakdowns (holiday, business, visiting friends/relatives)
- Historical trends from 2015 onwards
Data is compiled from multiple sources:
- Northern Ireland Passenger Survey (NIPS)
- Central Statistics Office Inbound Tourism data
- Continuous Household Survey
- Household Travel Survey
Data Source:
**Mother Page**: https://www.nisra.gov.uk/publications/quarterly-tourism-statistics-publications
This page lists quarterly tourism statistics publications. The module
automatically scrapes to find the latest quarterly report Excel file.
Update Frequency: Quarterly (published ~6 weeks after quarter end)
Geographic Coverage: Northern Ireland (by visitor origin market)
Reference Period: Rolling 12-month and year-to-date
Example:
>>> from bolster.data_sources.nisra.tourism import visitor_statistics
>>> df = visitor_statistics.get_latest_visitor_statistics()
>>> 'market' in df.columns
True
>>> 'expenditure' in df.columns
True
"""
import logging
import re
import pandas as pd
from bolster.data_sources.nisra._base import NISRADataNotFoundError, NISRAValidationError, download_file
from bolster.utils.web import session
[docs]
logger = logging.getLogger(__name__)
# Base URL for quarterly tourism statistics
[docs]
TOURISM_PUBLICATIONS_URL = "https://www.nisra.gov.uk/publications/quarterly-tourism-statistics-publications"
# Market names as they appear in the data
[docs]
MARKET_NAMES = [
"Great Britain",
"Other Europe",
"North America",
"Other Overseas",
"Republic of Ireland",
"NI Residents",
"Total",
]
[docs]
def get_latest_visitor_statistics_publication_url() -> tuple[str, str]:
"""Scrape NISRA tourism publications page to find the latest quarterly file.
The publications page directly lists Excel files for each quarter in the format:
"NI Tourism Q3 2025" linking to .xlsx files.
Returns:
Tuple of (excel_file_url, publication_period) e.g. ("https://...", "Q3 2025")
Raises:
NISRADataNotFoundError: If publication or file not found
"""
from bs4 import BeautifulSoup
try:
# Use shared session with retry logic for resilient requests
response = session.get(TOURISM_PUBLICATIONS_URL, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch tourism publications page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
# Find latest quarterly tourism Excel file
# Pattern: "NI Tourism Q3 2025" with href ending in .xlsx
excel_files = []
for link in soup.find_all("a", href=True):
link_text = link.get_text(strip=True)
href = link["href"]
# Match Excel files with tourism in text
if "tourism" in link_text.lower() and (href.endswith(".xls") or href.endswith(".xlsx")):
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
# Extract quarter/year from link text
# Pattern: "NI Tourism Q3 2025" or "NI Tourism 2025 Quarter 3"
q_match = re.search(r"Q(\d)\s+(\d{4})", link_text, re.IGNORECASE)
if q_match:
quarter = int(q_match.group(1))
year = int(q_match.group(2))
pub_period = f"Q{quarter} {year}"
else:
q_match = re.search(r"(\d{4})\s+Quarter\s+(\d)", link_text, re.IGNORECASE)
if q_match:
year = int(q_match.group(1))
quarter = int(q_match.group(2))
pub_period = f"Q{quarter} {year}"
else:
continue # Skip if we can't extract period
excel_files.append((href, year, quarter, pub_period))
logger.info(f"Found tourism file: {link_text} -> {pub_period}")
if not excel_files:
raise NISRADataNotFoundError("Could not find quarterly tourism Excel files on publications page")
# Sort by year and quarter descending, take the latest
excel_files.sort(key=lambda x: (x[1], x[2]), reverse=True)
latest_url, _, _, latest_period = excel_files[0]
logger.info(f"Selected latest: {latest_period} from {latest_url}")
return latest_url, latest_period
def _parse_visitor_statistics_file(
file_path: str,
) -> pd.DataFrame:
"""Parse visitor statistics Excel file (Table 10 - comprehensive market data).
Table 10 contains trips, nights, and expenditure by market for rolling 12-month
periods. Structure:
- Rows 0-14: Metadata and notes
- Row 15: Headers with column descriptions
- Rows 16-22: Data by market (GB, Other Europe, North America, etc.)
The latest year's data is in columns 2, 4, 6 (trips, nights, spend).
Previous year's data is in columns 1, 3, 5.
Args:
file_path: Path to downloaded Excel file
Returns:
DataFrame with columns: market, trips, nights, expenditure, period, year, quarter
Raises:
NISRAValidationError: If parsing fails or data is invalid
"""
try:
# Read entire sheet to find header row
df_raw = pd.read_excel(
file_path,
sheet_name="Table 10",
header=None,
)
except Exception as e:
raise NISRAValidationError(f"Failed to read Table 10 from Excel file: {e}") from e
# Find header row (contains "Variable" or "Overnight Trips")
header_row_idx = None
for idx in range(len(df_raw)):
row_text = " ".join(str(v) for v in df_raw.iloc[idx].values if pd.notna(v))
if "Variable" in row_text or "Overnight Trips" in row_text:
header_row_idx = idx
break
if header_row_idx is None:
raise NISRAValidationError("Could not identify header row in Table 10")
# Parse the header to understand column structure
# Typical format: col 1/2 = trips (prev/curr), col 3/4 = nights, col 5/6 = spend
# We want the latest year's data (higher column indices)
header_row = df_raw.iloc[header_row_idx]
headers = [str(v).strip() if pd.notna(v) else "" for v in header_row.values]
# Find column indices for latest year data by finding highest year in headers
trips_col = None
nights_col = None
spend_col = None
year = None
# First pass: find all years mentioned in headers
all_years = set()
for h in headers:
# Match both 4-digit (2025) and 2-digit (25) years
for match in re.findall(r"\b(20\d{2})\b", h):
all_years.add(int(match))
for match in re.findall(r"\b(\d{2})\b", h):
if 20 <= int(match) <= 30: # Reasonable range for 2-digit years
all_years.add(2000 + int(match))
latest_year = max(all_years) if all_years else None
year = latest_year
# Find columns for latest year (match either full year or 2-digit suffix)
latest_year_patterns = []
if latest_year:
latest_year_patterns = [str(latest_year), str(latest_year)[-2:]]
for i, h in enumerate(headers):
h_lower = h.lower()
is_latest = any(p in h for p in latest_year_patterns)
if "overnight trips" in h_lower and is_latest:
trips_col = i
elif "overnights" in h_lower and is_latest:
nights_col = i
elif "spend" in h_lower and is_latest:
spend_col = i
# Fallback: use fixed columns if header parsing fails
# Column 2 = trips latest, 4 = nights latest, 6 = spend latest
if trips_col is None:
trips_col = 2
if nights_col is None:
nights_col = 4
if spend_col is None:
spend_col = 6 if len(headers) > 6 else 5
# Market name mappings (handling variations in source data)
market_mappings = {
"great britain": "Great Britain",
"gb": "Great Britain",
"other europe": "Other Europe",
"north america": "North America",
"other overseas": "Other Overseas",
"roi": "Republic of Ireland",
"republic of ireland": "Republic of Ireland",
"ni": "NI Residents",
"ni residents": "NI Residents",
"total": "Total",
}
records = []
# Parse data rows (after header)
for idx in range(header_row_idx + 1, len(df_raw)):
row = df_raw.iloc[idx]
first_cell = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
# Skip empty rows
if not first_cell:
continue
# Match market name
market_key = first_cell.lower().strip()
market = market_mappings.get(market_key)
if market is None:
# Try partial matching
for key, value in market_mappings.items():
if key in market_key:
market = value
break
if market is None:
continue
# Extract values from identified columns
try:
trips = float(row.iloc[trips_col]) if pd.notna(row.iloc[trips_col]) else 0
nights = float(row.iloc[nights_col]) if pd.notna(row.iloc[nights_col]) else 0
spend = float(row.iloc[spend_col]) if pd.notna(row.iloc[spend_col]) else 0
except (ValueError, TypeError, IndexError):
continue
records.append(
{
"market": market,
"trips": trips * 1000, # Convert from thousands
"nights": nights * 1000, # Convert from thousands
"expenditure": spend, # Already in £ millions
"period": "12-month rolling",
"year": year,
"quarter": None,
}
)
if not records:
raise NISRAValidationError("No visitor statistics data found in Table 10")
return pd.DataFrame(records)
[docs]
def get_latest_visitor_statistics(
force_refresh: bool = False,
) -> pd.DataFrame:
"""Get the latest quarterly visitor statistics by market.
Retrieves comprehensive visitor statistics including trips, nights spent,
and expenditure broken down by visitor origin market.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- market: Visitor origin (Great Britain, Other Europe, etc.)
- trips: Number of overnight trips
- nights: Number of nights spent
- expenditure: Visitor spending (£ millions)
- period: Measurement period (12-month rolling, year-to-date)
- year: Reference year
- quarter: Reference quarter
Raises:
NISRADataNotFoundError: If publication not found
NISRAValidationError: If data parsing fails
Example:
>>> df = get_latest_visitor_statistics()
>>> 'market' in df.columns
True
"""
excel_url, pub_period = get_latest_visitor_statistics_publication_url()
logger.info(f"Downloading visitor statistics for {pub_period}: {excel_url}")
cache_ttl = 0 if force_refresh else 24
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl)
df = _parse_visitor_statistics_file(file_path)
# Validate data
if not validate_visitor_statistics(df):
raise NISRAValidationError("Visitor statistics validation failed")
return df
[docs]
def validate_visitor_statistics(df: pd.DataFrame) -> bool:
"""Validate visitor statistics data integrity.
Args:
df: DataFrame to validate
Returns:
True if valid, False otherwise
"""
if df.empty:
logger.warning("Visitor statistics DataFrame is empty")
return False
required_cols = {"market", "trips", "nights", "expenditure"}
if not required_cols.issubset(df.columns):
logger.warning(f"Missing required columns: {required_cols - set(df.columns)}")
return False
# Check for reasonable value ranges
if (df["trips"] < 0).any():
logger.warning("Negative trip values found")
return False
if (df["expenditure"] < 0).any():
logger.warning("Negative expenditure values found")
return False
# Check we have multiple markets
if df["market"].nunique() < 3:
logger.warning(f"Too few markets: {df['market'].nunique()}")
return False
return True
[docs]
def get_visitor_statistics_by_market(df: pd.DataFrame, market: str) -> pd.Series | None:
"""Get visitor statistics for a specific market.
Args:
df: Visitor statistics DataFrame
market: Market name (e.g., "Great Britain", "Republic of Ireland")
Returns:
Series with statistics for the market, or None if not found
Example:
>>> df = get_latest_visitor_statistics()
>>> gb = get_visitor_statistics_by_market(df, "Great Britain")
>>> gb is not None
True
"""
matches = df[df["market"].str.lower() == market.lower()]
if matches.empty:
return None
return matches.iloc[0]
[docs]
def get_total_visitor_statistics(df: pd.DataFrame) -> pd.Series | None:
"""Get total visitor statistics across all markets.
Args:
df: Visitor statistics DataFrame
Returns:
Series with total statistics, or None if not found
"""
return get_visitor_statistics_by_market(df, "Total")
[docs]
def get_domestic_vs_external(df: pd.DataFrame) -> pd.DataFrame:
"""Compare domestic (NI residents) vs external visitor statistics.
Args:
df: Visitor statistics DataFrame
Returns:
DataFrame with domestic and external totals and percentages
Example:
>>> df = get_latest_visitor_statistics()
>>> comparison = get_domestic_vs_external(df)
>>> 'category' in comparison.columns
True
"""
domestic = df[df["market"] == "NI Residents"]
external = df[~df["market"].isin(["NI Residents", "Total"])]
if domestic.empty:
return pd.DataFrame()
domestic_stats = domestic.iloc[0]
external_total = external.agg({"trips": "sum", "nights": "sum", "expenditure": "sum"})
total_trips = domestic_stats["trips"] + external_total["trips"]
total_expenditure = domestic_stats["expenditure"] + external_total["expenditure"]
return pd.DataFrame(
[
{
"category": "Domestic (NI)",
"trips": domestic_stats["trips"],
"nights": domestic_stats["nights"],
"expenditure": domestic_stats["expenditure"],
"trips_pct": domestic_stats["trips"] / total_trips * 100 if total_trips > 0 else 0,
"expenditure_pct": domestic_stats["expenditure"] / total_expenditure * 100
if total_expenditure > 0
else 0,
},
{
"category": "External",
"trips": external_total["trips"],
"nights": external_total["nights"],
"expenditure": external_total["expenditure"],
"trips_pct": external_total["trips"] / total_trips * 100 if total_trips > 0 else 0,
"expenditure_pct": external_total["expenditure"] / total_expenditure * 100
if total_expenditure > 0
else 0,
},
]
)
[docs]
def get_expenditure_per_trip(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate average expenditure per trip by market.
Args:
df: Visitor statistics DataFrame
Returns:
DataFrame with market and expenditure_per_trip columns
Example:
>>> df = get_latest_visitor_statistics()
>>> spend = get_expenditure_per_trip(df)
>>> 'expenditure_per_trip' in spend.columns
True
"""
result = df[df["market"] != "Total"].copy()
# Expenditure is in millions, trips are individual
result["expenditure_per_trip"] = (result["expenditure"] * 1_000_000 / result["trips"]).round(2)
return result[["market", "trips", "expenditure", "expenditure_per_trip"]]
[docs]
def get_nights_per_trip(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate average nights per trip by market.
Args:
df: Visitor statistics DataFrame
Returns:
DataFrame with market and nights_per_trip columns
Example:
>>> df = get_latest_visitor_statistics()
>>> duration = get_nights_per_trip(df)
>>> 'nights_per_trip' in duration.columns
True
"""
result = df[df["market"] != "Total"].copy()
result["nights_per_trip"] = (result["nights"] / result["trips"]).round(2)
return result[["market", "trips", "nights", "nights_per_trip"]]
[docs]
def get_market_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Get summary of all markets with derived metrics.
Args:
df: Visitor statistics DataFrame
Returns:
DataFrame with market summary including percentages and per-trip metrics
Example:
>>> df = get_latest_visitor_statistics()
>>> summary = get_market_summary(df)
>>> 'market' in summary.columns
True
"""
total = get_total_visitor_statistics(df)
if total is None:
return df
result = df[df["market"] != "Total"].copy()
# Add percentage columns
result["trips_pct"] = (result["trips"] / total["trips"] * 100).round(1)
result["nights_pct"] = (result["nights"] / total["nights"] * 100).round(1)
result["expenditure_pct"] = (result["expenditure"] / total["expenditure"] * 100).round(1)
# Add per-trip metrics
result["nights_per_trip"] = (result["nights"] / result["trips"]).round(2)
result["expenditure_per_trip"] = (result["expenditure"] * 1_000_000 / result["trips"]).round(2)
return result