Source code for bolster.data_sources.nisra.tourism.occupancy

"""NISRA Monthly Occupancy Statistics Data Source.

Provides access to monthly hotel and accommodation occupancy data for Northern Ireland.

Data includes:

- Hotel room and bed occupancy rates from 2011 to present
- Rooms and beds sold monthly (hotels)
- Small Service Accommodation (SSA) occupancy from 2013 to present (B&Bs, guest houses, and similar establishments)
- Rooms and beds sold monthly (SSA)

The survey provides indicative monthly occupancy rates that are revised and
finalised in the Annual Publication.

Data Source:
    **Mother Page**: https://www.nisra.gov.uk/statistics/tourism/occupancy-surveys

    This page lists all occupancy survey publications. The module automatically
    scrapes this page to find the latest "Hotel Occupancy" or "Small Service
    Accommodation" publication, then downloads the Excel file.

Update Frequency: Monthly (published around the 15th of the following month)
Geographic Coverage: Northern Ireland
Reference Date: Month of survey

Example:
    >>> from bolster.data_sources.nisra.tourism import occupancy
    >>> df = occupancy.get_latest_hotel_occupancy()
    >>> 'room_occupancy' in df.columns
    True
    >>> df_sold = occupancy.get_latest_rooms_beds_sold()
    >>> 'rooms_sold' in df_sold.columns
    True
    >>> df_combined = occupancy.get_combined_occupancy()
    >>> 'accommodation_type' in df_combined.columns
    True
"""

import logging
import re
from pathlib import Path
from typing import Literal

import pandas as pd

from bolster.data_sources.nisra._base import NISRADataNotFoundError, NISRAValidationError, download_file
from bolster.utils.web import session

[docs] logger = logging.getLogger(__name__)
# Base URL for occupancy statistics
[docs] OCCUPANCY_BASE_URL = "https://www.nisra.gov.uk/statistics/tourism/occupancy-surveys"
[docs] def get_latest_hotel_occupancy_publication_url() -> tuple[str, str]: """Scrape NISRA occupancy surveys page to find the latest hotel occupancy file. Navigates the publication structure: 1. Scrapes mother page for latest hotel occupancy publication 2. Follows link to publication detail page 3. Finds hotel occupancy Excel file Returns: Tuple of (excel_file_url, publication_date) Raises: NISRADataNotFoundError: If publication or file not found """ from bs4 import BeautifulSoup mother_page = OCCUPANCY_BASE_URL try: # Use shared session with retry logic for resilient requests response = session.get(mother_page, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch occupancy surveys page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find latest hotel occupancy publication # Pattern: "December 2024 hotel occupancy survey publications" or similar pub_link = None pub_date = None for link in soup.find_all("a", href=True): link_text = link.get_text(strip=True) # Match hotel occupancy publications if "hotel occupancy" in link_text.lower() and "publications" in link["href"]: href = link["href"] if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" # Extract month/year from link text if available # Pattern: "December 2024 hotel occupancy..." date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text) if date_match: pub_date = f"{date_match.group(1)} {date_match.group(2)}" # Take first match (should be newest due to reverse chronological order) pub_link = href logger.info(f"Found hotel occupancy publication: {link_text}") break if not pub_link: raise NISRADataNotFoundError("Could not find hotel occupancy publication on mother page") # Scrape the publication page for Excel file try: pub_response = session.get(pub_link, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch publication page: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Find hotel occupancy Excel file # Pattern: "2024-Hotel-December-excel-(online).xls" or similar excel_url = None for link in pub_soup.find_all("a", href=True): href = link["href"] # Match Excel files (.xls or .xlsx) if "Hotel" in href and (href.endswith(".xls") or href.endswith(".xlsx")): if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" excel_url = href logger.info(f"Found hotel occupancy Excel file: {href}") break if not excel_url: raise NISRADataNotFoundError("Could not find hotel occupancy Excel file on publication page") return excel_url, pub_date or "Unknown"
[docs] def get_latest_ssa_occupancy_publication_url() -> tuple[str, str]: """Scrape NISRA occupancy surveys page to find the latest SSA file. SSA = Small Service Accommodation (B&Bs, guest houses, etc.) Navigates the publication structure: 1. Scrapes mother page for latest SSA occupancy publication 2. Follows link to publication detail page 3. Finds SSA occupancy Excel file Returns: Tuple of (excel_file_url, publication_date) Raises: NISRADataNotFoundError: If publication or file not found """ from bs4 import BeautifulSoup mother_page = OCCUPANCY_BASE_URL try: # Use shared session with retry logic for resilient requests response = session.get(mother_page, timeout=30) response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch occupancy surveys page: {e}") from e soup = BeautifulSoup(response.content, "html.parser") # Find latest SSA occupancy publication # Pattern: "November 2025 Small Service Accommodation occupancy" or similar pub_link = None pub_date = None for link in soup.find_all("a", href=True): link_text = link.get_text(strip=True) # Match SSA publications if "small service accommodation" in link_text.lower() and "publications" in link["href"]: href = link["href"] if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" # Extract month/year from link text if available date_match = re.search(r"([A-Z][a-z]+)\s+(\d{4})", link_text) if date_match: pub_date = f"{date_match.group(1)} {date_match.group(2)}" # Take first match (should be newest due to reverse chronological order) pub_link = href logger.info(f"Found SSA occupancy publication: {link_text}") break if not pub_link: raise NISRADataNotFoundError("Could not find SSA occupancy publication on mother page") # Scrape the publication page for Excel file try: pub_response = session.get(pub_link, timeout=30) pub_response.raise_for_status() except Exception as e: raise NISRADataNotFoundError(f"Failed to fetch SSA publication page: {e}") from e pub_soup = BeautifulSoup(pub_response.content, "html.parser") # Find SSA occupancy Excel file # Pattern: "2025-Small Service-November-excel-(online).xls" or similar excel_url = None for link in pub_soup.find_all("a", href=True): href = link["href"] # Match Excel files (.xls or .xlsx) - SSA files have "Small" or "Service" in name if ("Small" in href or "Service" in href) and (href.endswith(".xls") or href.endswith(".xlsx")): if href.startswith("/"): href = f"https://www.nisra.gov.uk{href}" excel_url = href logger.info(f"Found SSA occupancy Excel file: {href}") break if not excel_url: raise NISRADataNotFoundError("Could not find SSA occupancy Excel file on publication page") return excel_url, pub_date or "Unknown"
def _find_table_by_title(file_path: str | Path, title_contains: str) -> str: """Find the sheet name that contains the specified title. Args: file_path: Path to the Excel file title_contains: String to search for in the first row of each table Returns: Sheet name matching the criteria Raises: NISRAValidationError: If no matching table is found """ import openpyxl file_path = Path(file_path) wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True) for sheet_name in wb.sheetnames: if sheet_name.strip().startswith("Table"): ws = wb[sheet_name] # Check first cell for title first_cell = ws.cell(1, 1).value if first_cell and title_contains.lower() in str(first_cell).lower(): wb.close() return sheet_name wb.close() raise NISRAValidationError(f"Could not find table with title containing '{title_contains}'")
[docs] def parse_hotel_occupancy_rates(file_path: str | Path) -> pd.DataFrame: """Parse NISRA hotel occupancy rates from Excel file (Table 1). Args: file_path: Path to the hotel occupancy Excel file Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - room_occupancy: float (room occupancy rate, 0-1) - bed_occupancy: float (bed occupancy rate, 0-1) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: # Read Table 1 - Monthly occupancy rates # Row 0-7: Notes and metadata # Row 8: "<< link back to contents >>" # Row 9: Headers (Month, YYYY Room occupancy, YYYY Bed occupancy, ...) # Row 10+: Data (January, February, ...) df_raw = pd.read_excel( file_path, sheet_name="Table 1", header=None, skiprows=9, # Skip to header row nrows=13, # Read header + 12 months ) except Exception as e: raise NISRAValidationError(f"Failed to read hotel occupancy file: {e}") from e # First row contains headers headers = df_raw.iloc[0].tolist() df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row df_raw.columns = headers # First column should be "Month" month_col = headers[0] # Identify year columns - they have pattern "YYYY Room occupancy" or "YYYY Bed occupancy" year_room_cols = {} year_bed_cols = {} for col in headers[1:]: col_str = str(col) room_match = re.match(r"(\d{4})\s*Room\s*occupancy", col_str, re.IGNORECASE) bed_match = re.match(r"(\d{4})\s*Bed\s*occupancy", col_str, re.IGNORECASE) if room_match: year_room_cols[int(room_match.group(1))] = col elif bed_match: year_bed_cols[int(bed_match.group(1))] = col # Build long-format data records = [] month_map = { "January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12, } for _, row in df_raw.iterrows(): month_name = str(row[month_col]).strip() if month_name not in month_map: continue month_num = month_map[month_name] # Get data for each year for year in sorted(set(year_room_cols.keys()) | set(year_bed_cols.keys())): room_occ = None bed_occ = None if year in year_room_cols: val = row[year_room_cols[year]] if pd.notna(val) and val != 0 and val != "c": room_occ = float(val) if year in year_bed_cols: val = row[year_bed_cols[year]] if pd.notna(val) and val != 0 and val != "c": bed_occ = float(val) # Only add if we have at least one value if room_occ is not None or bed_occ is not None: records.append( { "year": year, "month": month_name, "month_num": month_num, "room_occupancy": room_occ, "bed_occupancy": bed_occ, } ) df = pd.DataFrame(records) # Create datetime column df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1}) # Select and reorder columns result = df[["date", "year", "month", "room_occupancy", "bed_occupancy"]].copy() # Sort by date result = result.sort_values("date").reset_index(drop=True) # Log summary logger.info( f"Parsed {len(result)} monthly hotel occupancy records " f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})" ) return result
[docs] def parse_rooms_beds_sold(file_path: str | Path) -> pd.DataFrame: """Parse NISRA hotel rooms and beds sold from Excel file (Table 3). Args: file_path: Path to the hotel occupancy Excel file Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - rooms_sold: float (number of rooms sold) - beds_sold: float (number of beds sold) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: # Read Table 3 - Rooms and beds sold # Row 0-7: Notes and metadata # Row 8: "<< link to methodology >>" # Row 9: "<< link back to contents >>" # Row 10: Headers (Month, YYYY Rooms sold, YYYY Beds sold, ...) # Row 11+: Data (January, February, ...) df_raw = pd.read_excel( file_path, sheet_name="Table 3", header=None, skiprows=10, # Skip to header row nrows=13, # Read header + 12 months ) except Exception as e: raise NISRAValidationError(f"Failed to read hotel occupancy file: {e}") from e # First row contains headers headers = df_raw.iloc[0].tolist() df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row df_raw.columns = headers # First column should be "Month" month_col = headers[0] # Identify year columns year_rooms_cols = {} year_beds_cols = {} for col in headers[1:]: col_str = str(col) rooms_match = re.match(r"(\d{4})\s*Rooms\s*sold", col_str, re.IGNORECASE) beds_match = re.match(r"(\d{4})\s*Beds\s*sold", col_str, re.IGNORECASE) if rooms_match: year_rooms_cols[int(rooms_match.group(1))] = col elif beds_match: year_beds_cols[int(beds_match.group(1))] = col # Build long-format data records = [] month_map = { "January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12, } for _, row in df_raw.iterrows(): month_name = str(row[month_col]).strip() if month_name not in month_map: continue month_num = month_map[month_name] # Get data for each year for year in sorted(set(year_rooms_cols.keys()) | set(year_beds_cols.keys())): rooms_sold = None beds_sold = None if year in year_rooms_cols: val = row[year_rooms_cols[year]] if pd.notna(val) and val != 0 and val != "*" and val != "c": rooms_sold = float(val) if year in year_beds_cols: val = row[year_beds_cols[year]] if pd.notna(val) and val != 0 and val != "*" and val != "c": beds_sold = float(val) # Only add if we have at least one value if rooms_sold is not None or beds_sold is not None: records.append( { "year": year, "month": month_name, "month_num": month_num, "rooms_sold": rooms_sold, "beds_sold": beds_sold, } ) df = pd.DataFrame(records) # Create datetime column df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1}) # Select and reorder columns result = df[["date", "year", "month", "rooms_sold", "beds_sold"]].copy() # Sort by date result = result.sort_values("date").reset_index(drop=True) logger.info( f"Parsed {len(result)} monthly rooms/beds sold records " f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})" ) return result
[docs] def get_latest_hotel_occupancy(force_refresh: bool = False) -> pd.DataFrame: """Get the latest monthly hotel occupancy rates data. Automatically discovers and downloads the most recent hotel occupancy data from the NISRA website. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - room_occupancy: float (room occupancy rate, 0-1) - bed_occupancy: float (bed occupancy rate, 0-1) Raises: NISRADataNotFoundError: If latest publication cannot be found NISRAValidationError: If file structure is unexpected Example: >>> df = get_latest_hotel_occupancy() >>> 'room_occupancy' in df.columns True """ excel_url, pub_date = get_latest_hotel_occupancy_publication_url() logger.info(f"Downloading hotel occupancy data ({pub_date}) from: {excel_url}") # Cache for 30 days (monthly data) cache_ttl_hours = 30 * 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh) return parse_hotel_occupancy_rates(file_path)
[docs] def get_latest_rooms_beds_sold(force_refresh: bool = False) -> pd.DataFrame: """Get the latest monthly rooms and beds sold data. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - rooms_sold: float (number of rooms sold) - beds_sold: float (number of beds sold) Example: >>> df = get_latest_rooms_beds_sold() >>> 'rooms_sold' in df.columns True """ excel_url, pub_date = get_latest_hotel_occupancy_publication_url() logger.info(f"Downloading rooms/beds sold data ({pub_date}) from: {excel_url}") cache_ttl_hours = 30 * 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh) return parse_rooms_beds_sold(file_path)
# ============================================================================ # Small Service Accommodation (SSA) Functions # ============================================================================
[docs] def parse_ssa_occupancy_rates(file_path: str | Path) -> pd.DataFrame: """Parse NISRA SSA occupancy rates from Excel file (Table 1). SSA = Small Service Accommodation (B&Bs, guest houses, etc.) Args: file_path: Path to the SSA occupancy Excel file Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - room_occupancy: float (room occupancy rate, 0-1) - bed_occupancy: float (bed occupancy rate, 0-1) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: # SSA files have trailing space on some sheet names # Try both "Table 1" and "Table 1 " sheet_name = None with pd.ExcelFile(file_path) as xl: for name in xl.sheet_names: if name.strip() == "Table 1": sheet_name = name break if not sheet_name: raise NISRAValidationError("Could not find Table 1 in SSA file") # Read Table 1 - Monthly occupancy rates # Structure is same as hotel: rows 0-9 metadata, row 10 headers, rows 11+ data df_raw = pd.read_excel( file_path, sheet_name=sheet_name, header=None, skiprows=10, # Skip to header row nrows=13, # Read header + 12 months ) except NISRAValidationError: raise except Exception as e: raise NISRAValidationError(f"Failed to read SSA occupancy file: {e}") from e # First row contains headers headers = df_raw.iloc[0].tolist() df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row df_raw.columns = headers # First column should be "Month" month_col = headers[0] # Identify year columns - same pattern as hotel year_room_cols = {} year_bed_cols = {} for col in headers[1:]: col_str = str(col) room_match = re.match(r"(\d{4})\s*Room\s*occupancy", col_str, re.IGNORECASE) bed_match = re.match(r"(\d{4})\s*Bed\s*occupancy", col_str, re.IGNORECASE) if room_match: year_room_cols[int(room_match.group(1))] = col elif bed_match: year_bed_cols[int(bed_match.group(1))] = col # Build long-format data records = [] month_map = { "January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12, } for _, row in df_raw.iterrows(): month_name = str(row[month_col]).strip() if month_name not in month_map: continue month_num = month_map[month_name] # Get data for each year for year in sorted(set(year_room_cols.keys()) | set(year_bed_cols.keys())): room_occ = None bed_occ = None if year in year_room_cols: val = row[year_room_cols[year]] if pd.notna(val) and val != 0 and val != "c": room_occ = float(val) if year in year_bed_cols: val = row[year_bed_cols[year]] if pd.notna(val) and val != 0 and val != "c": bed_occ = float(val) # Only add if we have at least one value if room_occ is not None or bed_occ is not None: records.append( { "year": year, "month": month_name, "month_num": month_num, "room_occupancy": room_occ, "bed_occupancy": bed_occ, } ) df = pd.DataFrame(records) # Create datetime column df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1}) # Select and reorder columns result = df[["date", "year", "month", "room_occupancy", "bed_occupancy"]].copy() # Sort by date result = result.sort_values("date").reset_index(drop=True) logger.info( f"Parsed {len(result)} monthly SSA occupancy records " f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})" ) return result
[docs] def parse_ssa_rooms_beds_sold(file_path: str | Path) -> pd.DataFrame: """Parse NISRA SSA rooms and beds sold from Excel file (Table 2). Note: SSA uses Table 2 for rooms/beds sold, while Hotel uses Table 3. Args: file_path: Path to the SSA occupancy Excel file Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - rooms_sold: float (number of rooms sold) - beds_sold: float (number of beds sold) Raises: NISRAValidationError: If file structure is unexpected """ file_path = Path(file_path) try: # Read Table 2 - Rooms and beds sold df_raw = pd.read_excel( file_path, sheet_name="Table 2", header=None, skiprows=10, # Skip to header row nrows=13, # Read header + 12 months ) except Exception as e: raise NISRAValidationError(f"Failed to read SSA occupancy file: {e}") from e # First row contains headers headers = df_raw.iloc[0].tolist() df_raw = df_raw.iloc[1:].reset_index(drop=True) # Drop header row df_raw.columns = headers # First column should be "Month" month_col = headers[0] # Identify year columns year_rooms_cols = {} year_beds_cols = {} for col in headers[1:]: col_str = str(col) rooms_match = re.match(r"(\d{4})\s*Rooms\s*sold", col_str, re.IGNORECASE) beds_match = re.match(r"(\d{4})\s*Beds\s*sold", col_str, re.IGNORECASE) if rooms_match: year_rooms_cols[int(rooms_match.group(1))] = col elif beds_match: year_beds_cols[int(beds_match.group(1))] = col # Build long-format data records = [] month_map = { "January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12, } for _, row in df_raw.iterrows(): month_name = str(row[month_col]).strip() if month_name not in month_map: continue month_num = month_map[month_name] # Get data for each year for year in sorted(set(year_rooms_cols.keys()) | set(year_beds_cols.keys())): rooms_sold = None beds_sold = None if year in year_rooms_cols: val = row[year_rooms_cols[year]] if pd.notna(val) and val != 0 and val != "*" and val != "c": rooms_sold = float(val) if year in year_beds_cols: val = row[year_beds_cols[year]] if pd.notna(val) and val != 0 and val != "*" and val != "c": beds_sold = float(val) # Only add if we have at least one value if rooms_sold is not None or beds_sold is not None: records.append( { "year": year, "month": month_name, "month_num": month_num, "rooms_sold": rooms_sold, "beds_sold": beds_sold, } ) df = pd.DataFrame(records) # Create datetime column df["date"] = pd.to_datetime({"year": df["year"], "month": df["month_num"], "day": 1}) # Select and reorder columns result = df[["date", "year", "month", "rooms_sold", "beds_sold"]].copy() # Sort by date result = result.sort_values("date").reset_index(drop=True) logger.info( f"Parsed {len(result)} monthly SSA rooms/beds sold records " f"({result['date'].min().strftime('%Y-%m')} to {result['date'].max().strftime('%Y-%m')})" ) return result
[docs] def get_latest_ssa_occupancy(force_refresh: bool = False) -> pd.DataFrame: """Get the latest monthly SSA occupancy rates data. SSA = Small Service Accommodation (B&Bs, guest houses, etc.) Automatically discovers and downloads the most recent SSA occupancy data from the NISRA website. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - room_occupancy: float (room occupancy rate, 0-1) - bed_occupancy: float (bed occupancy rate, 0-1) Raises: NISRADataNotFoundError: If latest publication cannot be found NISRAValidationError: If file structure is unexpected Example: >>> df = get_latest_ssa_occupancy() >>> 'room_occupancy' in df.columns True """ excel_url, pub_date = get_latest_ssa_occupancy_publication_url() logger.info(f"Downloading SSA occupancy data ({pub_date}) from: {excel_url}") # Cache for 30 days (monthly data) cache_ttl_hours = 30 * 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh) return parse_ssa_occupancy_rates(file_path)
[docs] def get_latest_ssa_rooms_beds_sold(force_refresh: bool = False) -> pd.DataFrame: """Get the latest monthly SSA rooms and beds sold data. SSA = Small Service Accommodation (B&Bs, guest houses, etc.) Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - rooms_sold: float (number of rooms sold) - beds_sold: float (number of beds sold) Example: >>> df = get_latest_ssa_rooms_beds_sold() >>> 'rooms_sold' in df.columns True """ excel_url, pub_date = get_latest_ssa_occupancy_publication_url() logger.info(f"Downloading SSA rooms/beds sold data ({pub_date}) from: {excel_url}") cache_ttl_hours = 30 * 24 file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh) return parse_ssa_rooms_beds_sold(file_path)
# ============================================================================ # Combined Accommodation Functions # ============================================================================
[docs] def get_combined_occupancy(force_refresh: bool = False) -> pd.DataFrame: """Get combined hotel and SSA occupancy data with accommodation type column. This function fetches both hotel and SSA occupancy data and combines them into a single DataFrame with an 'accommodation_type' column to distinguish between the two accommodation types. Args: force_refresh: If True, bypass cache and download fresh data Returns: DataFrame with columns: - date: datetime (first day of month) - year: int - month: str (month name) - room_occupancy: float (room occupancy rate, 0-1) - bed_occupancy: float (bed occupancy rate, 0-1) - accommodation_type: str ('hotel' or 'ssa') Example: >>> df = get_combined_occupancy() >>> 'accommodation_type' in df.columns True """ hotel_df = get_latest_hotel_occupancy(force_refresh=force_refresh) hotel_df["accommodation_type"] = "hotel" ssa_df = get_latest_ssa_occupancy(force_refresh=force_refresh) ssa_df["accommodation_type"] = "ssa" combined = pd.concat([hotel_df, ssa_df], ignore_index=True) combined = combined.sort_values(["date", "accommodation_type"]).reset_index(drop=True) logger.info(f"Combined {len(hotel_df)} hotel + {len(ssa_df)} SSA = {len(combined)} total occupancy records") return combined
[docs] def compare_accommodation_types( df: pd.DataFrame, metric: Literal["room_occupancy", "bed_occupancy"] = "room_occupancy" ) -> pd.DataFrame: """Compare occupancy between hotel and SSA by year. Args: df: DataFrame from get_combined_occupancy() metric: Which occupancy metric to compare Returns: DataFrame with columns: - year: int - hotel_{metric}: float - ssa_{metric}: float - difference: float (hotel - ssa) - ratio: float (hotel / ssa) Example: >>> df = get_combined_occupancy() >>> comparison = compare_accommodation_types(df) >>> 'difference' in comparison.columns True """ pivot = df.pivot_table( index="year", columns="accommodation_type", values=metric, aggfunc="mean", ).reset_index() pivot.columns.name = None pivot = pivot.rename(columns={"hotel": f"hotel_{metric}", "ssa": f"ssa_{metric}"}) # Calculate comparison metrics pivot["difference"] = pivot[f"hotel_{metric}"] - pivot[f"ssa_{metric}"] pivot["ratio"] = pivot[f"hotel_{metric}"] / pivot[f"ssa_{metric}"] return pivot
[docs] def get_occupancy_by_year(df: pd.DataFrame, year: int) -> pd.DataFrame: """Filter occupancy data for a specific year. Args: df: DataFrame from get_latest_hotel_occupancy() year: Year to filter Returns: Filtered DataFrame """ return df[df["year"] == year].reset_index(drop=True)
[docs] def get_occupancy_summary_by_year(df: pd.DataFrame) -> pd.DataFrame: """Calculate annual occupancy averages and statistics. Args: df: DataFrame from get_latest_hotel_occupancy() Returns: DataFrame with columns: - year: int - avg_room_occupancy: float - avg_bed_occupancy: float - months_reported: int """ return ( df.groupby("year") .agg( avg_room_occupancy=("room_occupancy", "mean"), avg_bed_occupancy=("bed_occupancy", "mean"), months_reported=("room_occupancy", lambda x: x.notna().sum()), ) .reset_index() )
[docs] def get_seasonal_patterns(df: pd.DataFrame) -> pd.DataFrame: """Calculate average occupancy by month across all years. Args: df: DataFrame from get_latest_hotel_occupancy() Returns: DataFrame with columns: - month: str (month name) - avg_room_occupancy: float - avg_bed_occupancy: float Example: >>> df = get_latest_hotel_occupancy() >>> seasonal = get_seasonal_patterns(df) >>> 'avg_room_occupancy' in seasonal.columns True """ month_order = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", ] summary = ( df.groupby("month") .agg( avg_room_occupancy=("room_occupancy", "mean"), avg_bed_occupancy=("bed_occupancy", "mean"), ) .reset_index() ) # Sort by month order summary["month"] = pd.Categorical(summary["month"], categories=month_order, ordered=True) return summary.sort_values("month").reset_index(drop=True)
[docs] def validate_occupancy_data(df: pd.DataFrame) -> bool: # pragma: no cover """Validate tourism occupancy data integrity. Args: df: DataFrame from get_latest_occupancy_data Returns: True if validation passes, False otherwise """ if df.empty: logger.warning("Occupancy data is empty") return False required_cols = {"month", "accommodation_type"} if not required_cols.issubset(df.columns): missing = required_cols - set(df.columns) logger.warning(f"Missing required occupancy columns: {missing}") return False # Check for reasonable occupancy percentages percentage_cols = [col for col in df.columns if "occupancy" in col.lower() and col != "accommodation_type"] for col in percentage_cols: if col in df.columns and ((df[col] < 0).any() or (df[col] > 100).any()): logger.warning(f"Occupancy percentages out of range in column {col}") return False return True