"""NISRA Monthly Occupancy Statistics Data Source.
Provides access to monthly hotel and accommodation occupancy data for Northern Ireland.
Data includes:
- Hotel room and bed occupancy rates from 2011 to present
- Rooms and beds sold monthly (hotels)
- Small Service Accommodation (SSA) occupancy from 2013 to present (B&Bs, guest houses, and similar establishments)
- Rooms and beds sold monthly (SSA)
The survey provides indicative monthly occupancy rates that are revised and
finalised in the Annual Publication.
Data Source:
**Mother Page**: https://www.nisra.gov.uk/statistics/tourism/occupancy-surveys
This page lists all occupancy survey publications. The module automatically
scrapes this page to find the latest "Hotel Occupancy" or "Small Service
Accommodation" publication, then downloads the Excel file.
Update Frequency: Monthly (published around the 15th of the following month)
Geographic Coverage: Northern Ireland
Reference Date: Month of survey
Example:
>>> from bolster.data_sources.nisra.tourism import occupancy
>>> df = occupancy.get_latest_hotel_occupancy()
>>> 'room_occupancy' in df.columns
True
>>> df_sold = occupancy.get_latest_rooms_beds_sold()
>>> 'rooms_sold' in df_sold.columns
True
>>> df_combined = occupancy.get_combined_occupancy()
>>> 'accommodation_type' in df_combined.columns
True
"""
import logging
import re
from pathlib import Path
from typing import Literal
import pandas as pd
from bolster.data_sources.nisra._base import NISRADataNotFoundError, NISRAValidationError, download_file
from bolster.utils.web import session
[docs]
logger = logging.getLogger(__name__)
# Base URL for occupancy statistics
[docs]
OCCUPANCY_BASE_URL = "https://www.nisra.gov.uk/statistics/tourism/occupancy-surveys"
[docs]
def get_latest_hotel_occupancy_publication_url() -> tuple[str, str]:
"""Scrape NISRA occupancy surveys page to find the latest hotel occupancy file.
Navigates the publication structure:
1. Scrapes mother page for latest hotel occupancy publication
2. Follows link to publication detail page
3. Finds hotel occupancy Excel file
Returns:
Tuple of (excel_file_url, publication_date)
Raises:
NISRADataNotFoundError: If publication or file not found
"""
from bs4 import BeautifulSoup
mother_page = OCCUPANCY_BASE_URL
try:
# Use shared session with retry logic for resilient requests
response = session.get(mother_page, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch occupancy surveys page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
# Find latest hotel occupancy publication
# Pattern: "December 2024 hotel occupancy survey publications" or similar
pub_link = None
pub_date = None
for link in soup.find_all("a", href=True):
link_text = link.get_text(strip=True)
# Match hotel occupancy publications
if "hotel occupancy" in link_text.lower() and "publications" in link["href"]:
href = link["href"]
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
# Extract month/year from link text if available
# Pattern: "December 2024 hotel occupancy..."
date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text)
if date_match:
pub_date = f"{date_match.group(1)} {date_match.group(2)}"
# Take first match (should be newest due to reverse chronological order)
pub_link = href
logger.info(f"Found hotel occupancy publication: {link_text}")
break
if not pub_link:
raise NISRADataNotFoundError("Could not find hotel occupancy publication on mother page")
# Scrape the publication page for Excel file
try:
pub_response = session.get(pub_link, timeout=30)
pub_response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e
pub_soup = BeautifulSoup(pub_response.content, "html.parser")
# Find hotel occupancy Excel file
# Pattern: "2024-Hotel-December-excel-(online).xls" or similar
excel_url = None
for link in pub_soup.find_all("a", href=True):
href = link["href"]
# Match Excel files (.xls or .xlsx)
if "Hotel" in href and (href.endswith(".xls") or href.endswith(".xlsx")):
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
excel_url = href
logger.info(f"Found hotel occupancy Excel file: {href}")
break
if not excel_url:
raise NISRADataNotFoundError("Could not find hotel occupancy Excel file on publication page")
return excel_url, pub_date or "Unknown"
[docs]
def get_latest_ssa_occupancy_publication_url() -> tuple[str, str]:
"""Scrape NISRA occupancy surveys page to find the latest SSA file.
SSA = Small Service Accommodation (B&Bs, guest houses, etc.)
Navigates the publication structure:
1. Scrapes mother page for latest SSA occupancy publication
2. Follows link to publication detail page
3. Finds SSA occupancy Excel file
Returns:
Tuple of (excel_file_url, publication_date)
Raises:
NISRADataNotFoundError: If publication or file not found
"""
from bs4 import BeautifulSoup
mother_page = OCCUPANCY_BASE_URL
try:
# Use shared session with retry logic for resilient requests
response = session.get(mother_page, timeout=30)
response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch occupancy surveys page: {e}") from e
soup = BeautifulSoup(response.content, "html.parser")
# Find latest SSA occupancy publication
# Pattern: "November 2025 Small Service Accommodation occupancy" or similar
pub_link = None
pub_date = None
for link in soup.find_all("a", href=True):
link_text = link.get_text(strip=True)
# Match SSA publications
if "small service accommodation" in link_text.lower() and "publications" in link["href"]:
href = link["href"]
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
# Extract month/year from link text if available
date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text)
if date_match:
pub_date = f"{date_match.group(1)} {date_match.group(2)}"
# Take first match (should be newest due to reverse chronological order)
pub_link = href
logger.info(f"Found SSA occupancy publication: {link_text}")
break
if not pub_link:
raise NISRADataNotFoundError("Could not find SSA occupancy publication on mother page")
# Scrape the publication page for Excel file
try:
pub_response = session.get(pub_link, timeout=30)
pub_response.raise_for_status()
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch SSA publication page: {e}") from e
pub_soup = BeautifulSoup(pub_response.content, "html.parser")
# Find SSA occupancy Excel file
# Pattern: "2025-Small Service-November-excel-(online).xls" or similar
excel_url = None
for link in pub_soup.find_all("a", href=True):
href = link["href"]
# Match Excel files (.xls or .xlsx) - SSA files have "Small" or "Service" in name
if ("Small" in href or "Service" in href) and (href.endswith(".xls") or href.endswith(".xlsx")):
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
excel_url = href
logger.info(f"Found SSA occupancy Excel file: {href}")
break
if not excel_url:
raise NISRADataNotFoundError("Could not find SSA occupancy Excel file on publication page")
return excel_url, pub_date or "Unknown"
def _find_table_by_title(file_path: str | Path, title_contains: str) -> str:
"""Find the sheet name that contains the specified title.
Args:
file_path: Path to the Excel file
title_contains: String to search for in the first row of each table
Returns:
Sheet name matching the criteria
Raises:
NISRAValidationError: If no matching table is found
"""
import openpyxl
file_path = Path(file_path)
wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
for sheet_name in wb.sheetnames:
if sheet_name.strip().startswith("Table"):
ws = wb[sheet_name]
# Check first cell for title
first_cell = ws.cell(1, 1).value
if first_cell and title_contains.lower() in str(first_cell).lower():
wb.close()
return sheet_name
wb.close()
raise NISRAValidationError(f"Could not find table with title containing '{title_contains}'")
[docs]
def parse_hotel_occupancy_rates(file_path: str | Path) -> pd.DataFrame:
"""Parse NISRA hotel occupancy rates from Excel file (Table 1).
Args:
file_path: Path to the hotel occupancy Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- room_occupancy: float (room occupancy rate, 0-1)
- bed_occupancy: float (bed occupancy rate, 0-1)
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
# Read Table 1 - Monthly occupancy rates
# Row 0-7: Notes and metadata
# Row 8: "<< link back to contents >>"
# Row 9: Headers (Month, YYYY Room occupancy, YYYY Bed occupancy, ...)
# Row 10+: Data (January, February, ...)
df_raw = pd.read_excel(
file_path,
sheet_name="Table 1",
header=None,
skiprows=9, # Skip to header row
nrows=13, # Read header + 12 months
)
except Exception as e:
raise NISRAValidationError(f"Failed to read hotel occupancy file: {e}") from e
# First row contains headers
headers = df_raw.iloc[0].tolist()
df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row
df_raw.columns = headers
# First column should be "Month"
month_col = headers[0]
# Identify year columns - they have pattern "YYYY Room occupancy" or "YYYY Bed occupancy"
year_room_cols = {}
year_bed_cols = {}
for col in headers[1:]:
col_str = str(col)
room_match = re.match(r"(\d{4})\s*Room\s*occupancy", col_str, re.IGNORECASE)
bed_match = re.match(r"(\d{4})\s*Bed\s*occupancy", col_str, re.IGNORECASE)
if room_match:
year_room_cols[int(room_match.group(1))] = col
elif bed_match:
year_bed_cols[int(bed_match.group(1))] = col
# Build long-format data
records = []
month_map = {
"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12,
}
for _, row in df_raw.iterrows():
month_name = str(row[month_col]).strip()
if month_name not in month_map:
continue
month_num = month_map[month_name]
# Get data for each year
for year in sorted(set(year_room_cols.keys()) | set(year_bed_cols.keys())):
room_occ = None
bed_occ = None
if year in year_room_cols:
val = row[year_room_cols[year]]
if pd.notna(val) and val != 0 and val != "c":
room_occ = float(val)
if year in year_bed_cols:
val = row[year_bed_cols[year]]
if pd.notna(val) and val != 0 and val != "c":
bed_occ = float(val)
# Only add if we have at least one value
if room_occ is not None or bed_occ is not None:
records.append(
{
"year": year,
"month": month_name,
"month_num": month_num,
"room_occupancy": room_occ,
"bed_occupancy": bed_occ,
}
)
df = pd.DataFrame(records)
# Create datetime column
df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})
# Select and reorder columns
result = df[["date", "year", "month", "room_occupancy", "bed_occupancy"]].copy()
# Sort by date
result = result.sort_values("date").reset_index(drop=True)
# Log summary
logger.info(
f"Parsed {len(result)} monthly hotel occupancy records "
f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
)
return result
[docs]
def parse_rooms_beds_sold(file_path: str | Path) -> pd.DataFrame:
"""Parse NISRA hotel rooms and beds sold from Excel file (Table 3).
Args:
file_path: Path to the hotel occupancy Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- rooms_sold: float (number of rooms sold)
- beds_sold: float (number of beds sold)
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
# Read Table 3 - Rooms and beds sold
# Row 0-7: Notes and metadata
# Row 8: "<< link to methodology >>"
# Row 9: "<< link back to contents >>"
# Row 10: Headers (Month, YYYY Rooms sold, YYYY Beds sold, ...)
# Row 11+: Data (January, February, ...)
df_raw = pd.read_excel(
file_path,
sheet_name="Table 3",
header=None,
skiprows=10, # Skip to header row
nrows=13, # Read header + 12 months
)
except Exception as e:
raise NISRAValidationError(f"Failed to read hotel occupancy file: {e}") from e
# First row contains headers
headers = df_raw.iloc[0].tolist()
df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row
df_raw.columns = headers
# First column should be "Month"
month_col = headers[0]
# Identify year columns
year_rooms_cols = {}
year_beds_cols = {}
for col in headers[1:]:
col_str = str(col)
rooms_match = re.match(r"(\d{4})\s*Rooms\s*sold", col_str, re.IGNORECASE)
beds_match = re.match(r"(\d{4})\s*Beds\s*sold", col_str, re.IGNORECASE)
if rooms_match:
year_rooms_cols[int(rooms_match.group(1))] = col
elif beds_match:
year_beds_cols[int(beds_match.group(1))] = col
# Build long-format data
records = []
month_map = {
"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12,
}
for _, row in df_raw.iterrows():
month_name = str(row[month_col]).strip()
if month_name not in month_map:
continue
month_num = month_map[month_name]
# Get data for each year
for year in sorted(set(year_rooms_cols.keys()) | set(year_beds_cols.keys())):
rooms_sold = None
beds_sold = None
if year in year_rooms_cols:
val = row[year_rooms_cols[year]]
if pd.notna(val) and val != 0 and val != "*" and val != "c":
rooms_sold = float(val)
if year in year_beds_cols:
val = row[year_beds_cols[year]]
if pd.notna(val) and val != 0 and val != "*" and val != "c":
beds_sold = float(val)
# Only add if we have at least one value
if rooms_sold is not None or beds_sold is not None:
records.append(
{
"year": year,
"month": month_name,
"month_num": month_num,
"rooms_sold": rooms_sold,
"beds_sold": beds_sold,
}
)
df = pd.DataFrame(records)
# Create datetime column
df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})
# Select and reorder columns
result = df[["date", "year", "month", "rooms_sold", "beds_sold"]].copy()
# Sort by date
result = result.sort_values("date").reset_index(drop=True)
logger.info(
f"Parsed {len(result)} monthly rooms/beds sold records "
f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
)
return result
[docs]
def get_latest_hotel_occupancy(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest monthly hotel occupancy rates data.
Automatically discovers and downloads the most recent hotel occupancy
data from the NISRA website.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- room_occupancy: float (room occupancy rate, 0-1)
- bed_occupancy: float (bed occupancy rate, 0-1)
Raises:
NISRADataNotFoundError: If latest publication cannot be found
NISRAValidationError: If file structure is unexpected
Example:
>>> df = get_latest_hotel_occupancy()
>>> 'room_occupancy' in df.columns
True
"""
excel_url, pub_date = get_latest_hotel_occupancy_publication_url()
logger.info(f"Downloading hotel occupancy data ({pub_date}) from: {excel_url}")
# Cache for 30 days (monthly data)
cache_ttl_hours = 30 * 24
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_hotel_occupancy_rates(file_path)
[docs]
def get_latest_rooms_beds_sold(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest monthly rooms and beds sold data.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- rooms_sold: float (number of rooms sold)
- beds_sold: float (number of beds sold)
Example:
>>> df = get_latest_rooms_beds_sold()
>>> 'rooms_sold' in df.columns
True
"""
excel_url, pub_date = get_latest_hotel_occupancy_publication_url()
logger.info(f"Downloading rooms/beds sold data ({pub_date}) from: {excel_url}")
cache_ttl_hours = 30 * 24
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_rooms_beds_sold(file_path)
# ============================================================================
# Small Service Accommodation (SSA) Functions
# ============================================================================
[docs]
def parse_ssa_occupancy_rates(file_path: str | Path) -> pd.DataFrame:
"""Parse NISRA SSA occupancy rates from Excel file (Table 1).
SSA = Small Service Accommodation (B&Bs, guest houses, etc.)
Args:
file_path: Path to the SSA occupancy Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- room_occupancy: float (room occupancy rate, 0-1)
- bed_occupancy: float (bed occupancy rate, 0-1)
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
# SSA files have trailing space on some sheet names
# Try both "Table 1" and "Table 1 "
sheet_name = None
with pd.ExcelFile(file_path) as xl:
for name in xl.sheet_names:
if name.strip() == "Table 1":
sheet_name = name
break
if not sheet_name:
raise NISRAValidationError("Could not find Table 1 in SSA file")
# Read Table 1 - Monthly occupancy rates
# Structure is same as hotel: rows 0-9 metadata, row 10 headers, rows 11+ data
df_raw = pd.read_excel(
file_path,
sheet_name=sheet_name,
header=None,
skiprows=10, # Skip to header row
nrows=13, # Read header + 12 months
)
except NISRAValidationError:
raise
except Exception as e:
raise NISRAValidationError(f"Failed to read SSA occupancy file: {e}") from e
# First row contains headers
headers = df_raw.iloc[0].tolist()
df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row
df_raw.columns = headers
# First column should be "Month"
month_col = headers[0]
# Identify year columns - same pattern as hotel
year_room_cols = {}
year_bed_cols = {}
for col in headers[1:]:
col_str = str(col)
room_match = re.match(r"(\d{4})\s*Room\s*occupancy", col_str, re.IGNORECASE)
bed_match = re.match(r"(\d{4})\s*Bed\s*occupancy", col_str, re.IGNORECASE)
if room_match:
year_room_cols[int(room_match.group(1))] = col
elif bed_match:
year_bed_cols[int(bed_match.group(1))] = col
# Build long-format data
records = []
month_map = {
"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12,
}
for _, row in df_raw.iterrows():
month_name = str(row[month_col]).strip()
if month_name not in month_map:
continue
month_num = month_map[month_name]
# Get data for each year
for year in sorted(set(year_room_cols.keys()) | set(year_bed_cols.keys())):
room_occ = None
bed_occ = None
if year in year_room_cols:
val = row[year_room_cols[year]]
if pd.notna(val) and val != 0 and val != "c":
room_occ = float(val)
if year in year_bed_cols:
val = row[year_bed_cols[year]]
if pd.notna(val) and val != 0 and val != "c":
bed_occ = float(val)
# Only add if we have at least one value
if room_occ is not None or bed_occ is not None:
records.append(
{
"year": year,
"month": month_name,
"month_num": month_num,
"room_occupancy": room_occ,
"bed_occupancy": bed_occ,
}
)
df = pd.DataFrame(records)
# Create datetime column
df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})
# Select and reorder columns
result = df[["date", "year", "month", "room_occupancy", "bed_occupancy"]].copy()
# Sort by date
result = result.sort_values("date").reset_index(drop=True)
logger.info(
f"Parsed {len(result)} monthly SSA occupancy records "
f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
)
return result
[docs]
def parse_ssa_rooms_beds_sold(file_path: str | Path) -> pd.DataFrame:
"""Parse NISRA SSA rooms and beds sold from Excel file (Table 2).
Note: SSA uses Table 2 for rooms/beds sold, while Hotel uses Table 3.
Args:
file_path: Path to the SSA occupancy Excel file
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- rooms_sold: float (number of rooms sold)
- beds_sold: float (number of beds sold)
Raises:
NISRAValidationError: If file structure is unexpected
"""
file_path = Path(file_path)
try:
# Read Table 2 - Rooms and beds sold
df_raw = pd.read_excel(
file_path,
sheet_name="Table 2",
header=None,
skiprows=10, # Skip to header row
nrows=13, # Read header + 12 months
)
except Exception as e:
raise NISRAValidationError(f"Failed to read SSA occupancy file: {e}") from e
# First row contains headers
headers = df_raw.iloc[0].tolist()
df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row
df_raw.columns = headers
# First column should be "Month"
month_col = headers[0]
# Identify year columns
year_rooms_cols = {}
year_beds_cols = {}
for col in headers[1:]:
col_str = str(col)
rooms_match = re.match(r"(\d{4})\s*Rooms\s*sold", col_str, re.IGNORECASE)
beds_match = re.match(r"(\d{4})\s*Beds\s*sold", col_str, re.IGNORECASE)
if rooms_match:
year_rooms_cols[int(rooms_match.group(1))] = col
elif beds_match:
year_beds_cols[int(beds_match.group(1))] = col
# Build long-format data
records = []
month_map = {
"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12,
}
for _, row in df_raw.iterrows():
month_name = str(row[month_col]).strip()
if month_name not in month_map:
continue
month_num = month_map[month_name]
# Get data for each year
for year in sorted(set(year_rooms_cols.keys()) | set(year_beds_cols.keys())):
rooms_sold = None
beds_sold = None
if year in year_rooms_cols:
val = row[year_rooms_cols[year]]
if pd.notna(val) and val != 0 and val != "*" and val != "c":
rooms_sold = float(val)
if year in year_beds_cols:
val = row[year_beds_cols[year]]
if pd.notna(val) and val != 0 and val != "*" and val != "c":
beds_sold = float(val)
# Only add if we have at least one value
if rooms_sold is not None or beds_sold is not None:
records.append(
{
"year": year,
"month": month_name,
"month_num": month_num,
"rooms_sold": rooms_sold,
"beds_sold": beds_sold,
}
)
df = pd.DataFrame(records)
# Create datetime column
df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1})
# Select and reorder columns
result = df[["date", "year", "month", "rooms_sold", "beds_sold"]].copy()
# Sort by date
result = result.sort_values("date").reset_index(drop=True)
logger.info(
f"Parsed {len(result)} monthly SSA rooms/beds sold records "
f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})"
)
return result
[docs]
def get_latest_ssa_occupancy(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest monthly SSA occupancy rates data.
SSA = Small Service Accommodation (B&Bs, guest houses, etc.)
Automatically discovers and downloads the most recent SSA occupancy
data from the NISRA website.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- room_occupancy: float (room occupancy rate, 0-1)
- bed_occupancy: float (bed occupancy rate, 0-1)
Raises:
NISRADataNotFoundError: If latest publication cannot be found
NISRAValidationError: If file structure is unexpected
Example:
>>> df = get_latest_ssa_occupancy()
>>> 'room_occupancy' in df.columns
True
"""
excel_url, pub_date = get_latest_ssa_occupancy_publication_url()
logger.info(f"Downloading SSA occupancy data ({pub_date}) from: {excel_url}")
# Cache for 30 days (monthly data)
cache_ttl_hours = 30 * 24
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_ssa_occupancy_rates(file_path)
[docs]
def get_latest_ssa_rooms_beds_sold(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest monthly SSA rooms and beds sold data.
SSA = Small Service Accommodation (B&Bs, guest houses, etc.)
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- rooms_sold: float (number of rooms sold)
- beds_sold: float (number of beds sold)
Example:
>>> df = get_latest_ssa_rooms_beds_sold()
>>> 'rooms_sold' in df.columns
True
"""
excel_url, pub_date = get_latest_ssa_occupancy_publication_url()
logger.info(f"Downloading SSA rooms/beds sold data ({pub_date}) from: {excel_url}")
cache_ttl_hours = 30 * 24
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_ssa_rooms_beds_sold(file_path)
# ============================================================================
# Combined Accommodation Functions
# ============================================================================
[docs]
def get_combined_occupancy(force_refresh: bool = False) -> pd.DataFrame:
"""Get combined hotel and SSA occupancy data with accommodation type column.
This function fetches both hotel and SSA occupancy data and combines them
into a single DataFrame with an 'accommodation_type' column to distinguish
between the two accommodation types.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with columns:
- date: datetime (first day of month)
- year: int
- month: str (month name)
- room_occupancy: float (room occupancy rate, 0-1)
- bed_occupancy: float (bed occupancy rate, 0-1)
- accommodation_type: str ('hotel' or 'ssa')
Example:
>>> df = get_combined_occupancy()
>>> 'accommodation_type' in df.columns
True
"""
hotel_df = get_latest_hotel_occupancy(force_refresh=force_refresh)
hotel_df["accommodation_type"] = "hotel"
ssa_df = get_latest_ssa_occupancy(force_refresh=force_refresh)
ssa_df["accommodation_type"] = "ssa"
combined = pd.concat([hotel_df, ssa_df], ignore_index=True)
combined = combined.sort_values(["date", "accommodation_type"]).reset_index(drop=True)
logger.info(f"Combined {len(hotel_df)} hotel + {len(ssa_df)} SSA = {len(combined)} total occupancy records")
return combined
[docs]
def compare_accommodation_types(
df: pd.DataFrame, metric: Literal["room_occupancy", "bed_occupancy"] = "room_occupancy"
) -> pd.DataFrame:
"""Compare occupancy between hotel and SSA by year.
Args:
df: DataFrame from get_combined_occupancy()
metric: Which occupancy metric to compare
Returns:
DataFrame with columns:
- year: int
- hotel_{metric}: float
- ssa_{metric}: float
- difference: float (hotel - ssa)
- ratio: float (hotel / ssa)
Example:
>>> df = get_combined_occupancy()
>>> comparison = compare_accommodation_types(df)
>>> 'difference' in comparison.columns
True
"""
pivot = df.pivot_table(
index="year",
columns="accommodation_type",
values=metric,
aggfunc="mean",
).reset_index()
pivot.columns.name = None
pivot = pivot.rename(columns={"hotel": f"hotel_{metric}", "ssa": f"ssa_{metric}"})
# Calculate comparison metrics
pivot["difference"] = pivot[f"hotel_{metric}"] - pivot[f"ssa_{metric}"]
pivot["ratio"] = pivot[f"hotel_{metric}"] / pivot[f"ssa_{metric}"]
return pivot
[docs]
def get_occupancy_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
"""Filter occupancy data for a specific year.
Args:
df: DataFrame from get_latest_hotel_occupancy()
year: Year to filter
Returns:
Filtered DataFrame
"""
return df[df["year"] == year].reset_index(drop=True)
[docs]
def get_occupancy_summary_by_year(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate annual occupancy averages and statistics.
Args:
df: DataFrame from get_latest_hotel_occupancy()
Returns:
DataFrame with columns:
- year: int
- avg_room_occupancy: float
- avg_bed_occupancy: float
- months_reported: int
"""
return (
df.groupby("year")
.agg(
avg_room_occupancy=("room_occupancy", "mean"),
avg_bed_occupancy=("bed_occupancy", "mean"),
months_reported=("room_occupancy", lambda x: x.notna().sum()),
)
.reset_index()
)
[docs]
def get_seasonal_patterns(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate average occupancy by month across all years.
Args:
df: DataFrame from get_latest_hotel_occupancy()
Returns:
DataFrame with columns:
- month: str (month name)
- avg_room_occupancy: float
- avg_bed_occupancy: float
Example:
>>> df = get_latest_hotel_occupancy()
>>> seasonal = get_seasonal_patterns(df)
>>> 'avg_room_occupancy' in seasonal.columns
True
"""
month_order = [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December",
]
summary = (
df.groupby("month")
.agg(
avg_room_occupancy=("room_occupancy", "mean"),
avg_bed_occupancy=("bed_occupancy", "mean"),
)
.reset_index()
)
# Sort by month order
summary["month"] = pd.Categorical(summary["month"], categories=month_order, ordered=True)
return summary.sort_values("month").reset_index(drop=True)
[docs]
def validate_occupancy_data(df: pd.DataFrame) -> bool: # pragma: no cover
"""Validate tourism occupancy data integrity.
Args:
df: DataFrame from get_latest_occupancy_data
Returns:
True if validation passes, False otherwise
"""
if df.empty:
logger.warning("Occupancy data is empty")
return False
required_cols = {"month", "accommodation_type"}
if not required_cols.issubset(df.columns):
missing = required_cols - set(df.columns)
logger.warning(f"Missing required occupancy columns: {missing}")
return False
# Check for reasonable occupancy percentages
percentage_cols = [col for col in df.columns if "occupancy" in col.lower() and col != "accommodation_type"]
for col in percentage_cols:
if col in df.columns and ((df[col] < 0).any() or (df[col] > 100).any()):
logger.warning(f"Occupancy percentages out of range in column {col}")
return False
return True