"""NISRA Labour Market Statistics Module.
This module provides access to Northern Ireland Statistics and Research Agency (NISRA)
Labour Force Survey (LFS) data.
The Labour Force Survey is a sample survey of households providing information on the
labour force using internationally agreed concepts and definitions. It provides estimates
of employment, unemployment, and economic inactivity.
Data is published quarterly covering 3-month rolling periods (e.g., "July to September 2025").
Some tables (particularly Local Government District breakdowns) are published annually only.
Data Source:
**Mother Page**: https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare
This page lists all Labour Market and Social Welfare publications in reverse chronological
order (newest first). The module automatically scrapes this page to find the latest
"Quarterly Labour Force Survey Tables" publication, then downloads the Excel file from
that publication's detail page.
This ensures the module always retrieves the most recent data without hardcoding dates
or quarters.
Update Frequency: Quarterly publications covering 3-month rolling periods are released
approximately 6-8 weeks after the reference period ends. Labour Force Survey data is
published four times per year with additional annual publications containing Local Government
District breakdowns. Data is updated as part of NISRA's regular labour market statistics
programme following ONS Labour Force Survey methodology.
Architecture:
- Automatically discovers latest quarterly LFS publication from NISRA mother page
- Downloads quarterly Labour Force Survey Excel files
- Parses multiple tables including:
* Employment by age band and sex (Table 2.15)
* Employment by industry sector (Table 2.17)
* Employment by occupation (Table 2.18)
* Economic inactivity (Table 2.21)
* Unemployment rates (Table 2.22)
- Separately handles annual Local Government District data (Table 1.16a)
published in dedicated LGD tables file (not in quarterly publications)
- Returns long-format DataFrames for flexibility
- Uses standardized Excel parsing utilities from _base.py
Usage:
>>> from bolster.data_sources.nisra import labour_market
>>> # Get latest quarterly employment data
>>> df = labour_market.get_latest_employment()
>>> # Get economic inactivity data
>>> df = labour_market.get_latest_economic_inactivity()
>>> # Get all tables for a specific quarter
>>> data = labour_market.get_quarterly_data(year=2025, quarter="Jul-Sep")
>>> # Get annual employment by Local Government District
>>> lgd_df = labour_market.get_latest_employment_by_lgd()
Example:
>>> from bolster.data_sources.nisra import labour_market
>>> emp_df = labour_market.get_latest_employment()
>>> 'age_group' in emp_df.columns
True
>>> lgd_df = labour_market.get_latest_employment_by_lgd()
>>> 'employment_rate' in lgd_df.columns
True
Author: Claude Code
Date: 2025-12-21
"""
import logging
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook
from bolster.data_sources.nisra._base import (
CACHE_DIR,
NISRADataNotFoundError,
download_file,
safe_float,
safe_int,
)
from bolster.utils.web import session
[docs]
logger = logging.getLogger(__name__)
# Cache configuration
[docs]
LFS_CACHE_DIR = CACHE_DIR / "labour_market"
LFS_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# NISRA LFS publication URL pattern
[docs]
LFS_BASE_URL = "https://www.nisra.gov.uk"
def _quarter_to_month_names(quarter_str: str) -> tuple[str, str, str]:
"""Convert quarter string to month names.
Args:
quarter_str: Quarter identifier like "Jul-Sep" or "Jan-Mar"
Returns:
Tuple of (first_month, second_month, third_month)
Examples:
>>> _quarter_to_month_names("Jul-Sep")
('July', 'August', 'September')
>>> _quarter_to_month_names("Jan-Mar")
('January', 'February', 'March')
"""
quarter_map = {
"Jan-Mar": ("January", "February", "March"),
"Apr-Jun": ("April", "May", "June"),
"Jul-Sep": ("July", "August", "September"),
"Oct-Dec": ("October", "November", "December"),
}
# Normalize input - remove spaces and hyphens for comparison
quarter_normalized = quarter_str.strip().replace(" ", "").replace("-", "").lower()
for key, months in quarter_map.items():
key_normalized = key.replace("-", "").lower()
if key_normalized == quarter_normalized:
return months
raise ValueError(f"Invalid quarter string: {quarter_str}. Expected format like 'Jul-Sep' or 'Jan-Mar'")
def _build_lfs_url(year: int, quarter: str) -> str:
"""Build the download URL for a quarterly LFS file.
Args:
year: Year (e.g., 2025)
quarter: Quarter string like "Jul-Sep" or "Q3"
Returns:
Full URL to the Excel file
Note:
The URL pattern is:
/system/files/statistics/YYYY-MM/lmr-labour-force-survey-quarterly-tables-MONTH-MONTH-YY.xlsx
Where YYYY-MM is the publication date (usually 2 months after quarter end)
and MONTH-MONTH-YY is the data period
"""
# Get month names for the quarter
first_month, _, third_month = _quarter_to_month_names(quarter)
# Determine publication month (typically 2 months after quarter end)
quarter_to_pub_month = {
"Jan-Mar": "05", # Published in May
"Apr-Jun": "08", # Published in August
"Jul-Sep": "11", # Published in November
"Oct-Dec": "02", # Published in February (next year)
}
# Normalize quarter string - remove spaces and hyphens
quarter_normalized = quarter.strip().replace(" ", "").replace("-", "").lower()
pub_month = None
for key, month in quarter_to_pub_month.items():
key_normalized = key.replace("-", "").lower()
if key_normalized == quarter_normalized:
pub_month = month
break
if pub_month is None:
raise ValueError(f"Could not determine publication month for quarter: {quarter}")
# For Oct-Dec quarter, publication is in February of next year
pub_year = year + 1 if pub_month == "02" else year
# Build filename: lmr-labour-force-survey-quarterly-tables-July-September-25.xlsx
year_short = str(year)[-2:] # Last 2 digits
filename = f"lmr-labour-force-survey-quarterly-tables-{first_month}-{third_month}-{year_short}.xlsx"
# Build full URL
return f"{LFS_BASE_URL}/system/files/statistics/{pub_year}-{pub_month}/{filename}"
[docs]
def download_quarterly_lfs(year: int, quarter: str, force_refresh: bool = False, cache_ttl_days: int = 90) -> Path:
"""Download Labour Force Survey quarterly tables Excel file.
Args:
year: Year (e.g., 2025)
quarter: Quarter string like "Jul-Sep", "Jan-Mar", etc.
force_refresh: If True, always download fresh data ignoring cache
cache_ttl_days: Cache time-to-live in days (default: 90 days)
Returns:
Path to the downloaded Excel file
Raises:
NISRADataNotFoundError: If the file cannot be downloaded
Example:
>>> file_path = download_quarterly_lfs(2025, "Jul-Sep")
>>> file_path.exists()
True
"""
# Build download URL
url = _build_lfs_url(year, quarter)
logger.info(f"Downloading LFS data from {url}")
# Convert days to hours for download_file function
cache_ttl_hours = cache_ttl_days * 24
try:
return download_file(url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
except Exception as e:
raise NISRADataNotFoundError(f"Failed to download LFS quarterly data for {year} {quarter}: {e}") from e
[docs]
def parse_employment_by_age_sex(file_path: str | Path) -> pd.DataFrame:
"""Parse Table 2.15: Employment by Age Band and Sex.
Extracts employment numbers and percentages broken down by age group and sex.
Args:
file_path: Path to the quarterly LFS Excel file
Returns:
DataFrame with columns:
- quarter_period: Quarter label (e.g., "July to September 2025")
- age_group: Age band (e.g., "16 to 19", "20 to 24", "25 to 29", etc.)
- sex: "Male", "Female", or "All Persons"
- percentage: Percentage of employment in this age group (for this sex)
- number: Absolute number employed (from Table 2.15b)
Example:
>>> url, year, quarter = get_latest_lfs_publication_url()
>>> path = download_file(url, cache_ttl_hours=90*24)
>>> df = parse_employment_by_age_sex(path)
>>> 'age_group' in df.columns
True
"""
wb = load_workbook(file_path, data_only=True)
if "2.15" not in wb.sheetnames:
raise NISRADataNotFoundError("Table 2.15 (Employment by Age and Sex) not found in file")
sheet = wb["2.15"]
# Extract quarter period from title (Row 1 or nearby)
quarter_period = None
for row in sheet.iter_rows(min_row=1, max_row=5, values_only=True):
if row[0] and "Percentage in Employment by Age Band" in str(row[0]):
# Extract date range from title
match = re.search(r"([A-Z][a-z]+ to [A-Z][a-z]+ \d{4})", str(row[0]))
if match:
quarter_period = match.group(1)
break
if not quarter_period:
logger.warning("Could not extract quarter period from Table 2.15 title")
quarter_period = "Unknown"
# Find header rows for Table 2.15a (percentages) and Table 2.15b (numbers)
# Table 2.15a starts around row 6
# Headers: "Age group", "Male\n(percentage)", "Female\n(percentage)", "All Persons\n(percentage)"
header_row_a = None
for row_idx, row in enumerate(sheet.iter_rows(min_row=5, max_row=10, values_only=True), start=5):
if row[0] and "Age group" in str(row[0]):
header_row_a = row_idx
break
if not header_row_a:
raise NISRADataNotFoundError("Could not find header row for Table 2.15a")
# Parse Table 2.15a (percentages)
percentage_records = []
for row in sheet.iter_rows(min_row=header_row_a + 1, max_row=header_row_a + 15, values_only=True):
age_group = row[0]
# Stop at empty rows or footer
if not age_group or age_group == "All Persons":
break
male_pct = safe_float(row[1])
female_pct = safe_float(row[2])
all_persons_pct = safe_float(row[3])
if male_pct is not None:
percentage_records.append(
{
"quarter_period": quarter_period,
"age_group": str(age_group).strip(),
"sex": "Male",
"percentage": male_pct,
}
)
if female_pct is not None:
percentage_records.append(
{
"quarter_period": quarter_period,
"age_group": str(age_group).strip(),
"sex": "Female",
"percentage": female_pct,
}
)
if all_persons_pct is not None:
percentage_records.append(
{
"quarter_period": quarter_period,
"age_group": str(age_group).strip(),
"sex": "All Persons",
"percentage": all_persons_pct,
}
)
df_pct = pd.DataFrame(percentage_records)
# Find Table 2.15b (numbers) - should be to the right, around column 5
# Headers: "Sex", "Number"
header_row_b = None
for row_idx, row in enumerate(sheet.iter_rows(min_row=5, max_row=10, values_only=True), start=5):
if len(row) > 5 and row[5] and "Sex" in str(row[5]):
header_row_b = row_idx
break
if not header_row_b:
logger.warning("Could not find Table 2.15b (total numbers by sex)")
# Return just percentages
return df_pct
# Parse Table 2.15b (total numbers) - only has Male, Female, All Persons totals
# Not broken down by age, so we'll add as a separate record with age_group = "All ages"
number_records = []
for row in sheet.iter_rows(min_row=header_row_b + 1, max_row=header_row_b + 5, values_only=True):
sex = row[5] if len(row) > 5 else None
number = safe_int(row[6]) if len(row) > 6 else None
if not sex or number is None:
continue
number_records.append(
{
"quarter_period": quarter_period,
"age_group": "All ages",
"sex": str(sex).strip(),
"number": number,
}
)
df_num = pd.DataFrame(number_records)
# Merge percentages with total numbers
# The total numbers apply to all age groups, so we need to calculate
# We can merge on sex only for the "All ages" totals
# Return both as separate info or merged smartly
# For now, let's add the total numbers as additional rows
return pd.concat([df_pct, df_num], ignore_index=True)
[docs]
def parse_economic_inactivity(file_path: str | Path) -> pd.DataFrame:
"""Parse Table 2.21: Economically Inactive by Sex (Time Series).
Extracts economic inactivity data broken down by sex with historical time series.
Table 2.21 shows data for the same quarter (e.g., Jul-Sep) across multiple years (2012-2025).
Args:
file_path: Path to the quarterly LFS Excel file
Returns:
DataFrame with columns:
- time_period: Quarter label (e.g., "Jul to Sep 2025")
- sex: "Male", "Female", or "All Persons"
- economically_inactive_number: Number of people economically inactive
- economic_inactivity_rate: Percentage economically inactive
Note:
Economically inactive persons are those not in employment and not seeking work
(students, retired, caring for family, long-term sick/disabled, etc.).
This table provides **historical time series** for the same quarter across years,
allowing year-over-year comparisons for the same seasonal period.
Example:
>>> url, year, quarter = get_latest_lfs_publication_url()
>>> path = download_file(url, cache_ttl_hours=90*24)
>>> df = parse_economic_inactivity(path)
>>> 'economic_inactivity_rate' in df.columns
True
"""
wb = load_workbook(file_path, data_only=True)
if "2.21" not in wb.sheetnames:
raise NISRADataNotFoundError("Table 2.21 (Economic Inactivity) not found in file")
sheet = wb["2.21"]
# Table 2.21 is a time series table with two sub-tables:
# Table 2.21a: Numbers (Male, Female, All Persons columns)
# Table 2.21b: Rates (Male, Female, All Persons columns)
# Rows are time periods (Jul to Sep 2012, Jul to Sep 2013, etc.)
# Find header for Table 2.21a (columns: Time period, Male, Female, All Persons)
header_row_a = None
for row_idx, row in enumerate(sheet.iter_rows(min_row=5, max_row=10, values_only=True), start=5):
if row[0] and "Time period" in str(row[0]):
header_row_a = row_idx
break
if not header_row_a:
raise NISRADataNotFoundError("Could not find header row for Table 2.21a")
records = []
# Parse Table 2.21a (numbers) and Table 2.21b (rates) side-by-side
# Data rows start after header
for row in sheet.iter_rows(min_row=header_row_a + 1, max_row=header_row_a + 20, values_only=True):
time_period_a = row[0]
# Stop at empty rows
if not time_period_a:
break
# Table 2.21a columns (0-3): Time period, Male, Female, All Persons
male_number = safe_int(row[1])
female_number = safe_int(row[2])
all_persons_number = safe_int(row[3])
# Table 2.21b columns (5-8): Time period, Male, Female, All Persons
male_rate = safe_float(row[6]) if len(row) > 6 else None
female_rate = safe_float(row[7]) if len(row) > 7 else None
all_persons_rate = safe_float(row[8]) if len(row) > 8 else None
time_period_str = str(time_period_a).strip()
# Add record for Male
if male_number is not None or male_rate is not None:
records.append(
{
"time_period": time_period_str,
"sex": "Male",
"economically_inactive_number": male_number,
"economic_inactivity_rate": male_rate,
}
)
# Add record for Female
if female_number is not None or female_rate is not None:
records.append(
{
"time_period": time_period_str,
"sex": "Female",
"economically_inactive_number": female_number,
"economic_inactivity_rate": female_rate,
}
)
# Add record for All Persons
if all_persons_number is not None or all_persons_rate is not None:
records.append(
{
"time_period": time_period_str,
"sex": "All Persons",
"economically_inactive_number": all_persons_number,
"economic_inactivity_rate": all_persons_rate,
}
)
return pd.DataFrame(records)
[docs]
def get_latest_lfs_publication_url() -> tuple[str, str, str]:
"""Find the latest Labour Force Survey quarterly tables publication.
Scrapes the NISRA Labour Market statistics mother page to find the most recent
"Quarterly Labour Force Survey Tables" publication.
Returns:
Tuple of (excel_file_url, year, quarter)
- excel_file_url: Full URL to the Excel file
- year: Data year as string (e.g., "2025")
- quarter: Quarter string (e.g., "Jul-Sep")
Raises:
NISRADataNotFoundError: If no publication found
Example:
>>> url, year, quarter = get_latest_lfs_publication_url()
>>> url.startswith('https://')
True
"""
from bs4 import BeautifulSoup
# Mother page listing all labour market publications
mother_page = "https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare"
try:
response = session.get(mother_page, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
# Find all links with "Quarterly Labour Force Survey Tables" in text
lfs_links = []
for link in soup.find_all("a", href=True):
link_text = link.get_text(strip=True)
if "Quarterly Labour Force Survey Tables" in link_text:
href = link["href"]
# Make absolute URL if needed
if href.startswith("/"):
href = f"https://www.nisra.gov.uk{href}"
lfs_links.append({"text": link_text, "url": href})
if not lfs_links:
raise NISRADataNotFoundError("No Quarterly Labour Force Survey Tables publications found")
# First link should be the latest (page shows newest first)
latest_pub = lfs_links[0]
logger.info(f"Found latest LFS publication: {latest_pub['text']}")
# Now fetch the publication page to get the Excel file
pub_response = session.get(latest_pub["url"], timeout=30)
pub_response.raise_for_status()
pub_soup = BeautifulSoup(pub_response.content, "html.parser")
# Find Excel file link for quarterly tables
excel_url = None
for link in pub_soup.find_all("a", href=True):
href = link["href"]
if "lmr-labour-force-survey-quarterly-tables" in href.lower() and href.endswith(".xlsx"):
excel_url = f"https://www.nisra.gov.uk{href}" if href.startswith("/") else href
break
if not excel_url:
raise NISRADataNotFoundError(f"No Excel file found on publication page: {latest_pub['url']}")
# Extract quarter and year from filename or page content
# Filename pattern: lmr-labour-force-survey-quarterly-tables-July-September-25.xlsx
filename = excel_url.split("/")[-1]
match = re.search(
r"(January|February|March|April|May|June|July|August|September|October|November|December)-"
r"(January|February|March|April|May|June|July|August|September|October|November|December)-"
r"(\d{2})",
filename,
)
if match:
first_month = match.group(1)
third_month = match.group(2)
year_short = match.group(3)
year = f"20{year_short}"
# Map months to quarter format
month_to_abbrev = {
"January": "Jan",
"February": "Feb",
"March": "Mar",
"April": "Apr",
"May": "May",
"June": "Jun",
"July": "Jul",
"August": "Aug",
"September": "Sep",
"October": "Oct",
"November": "Nov",
"December": "Dec",
}
quarter = f"{month_to_abbrev[first_month]}-{month_to_abbrev[third_month]}"
logger.info(f"Extracted quarter: {quarter} {year}")
return (excel_url, year, quarter)
# Fallback: Try to extract from page content
page_text = pub_soup.get_text()
match = re.search(
r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+to\s+"
r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+"
r"(\d{4})",
page_text,
)
if match:
first_month = match.group(1)
third_month = match.group(2)
year = match.group(3)
month_to_abbrev = {
"January": "Jan",
"February": "Feb",
"March": "Mar",
"April": "Apr",
"May": "May",
"June": "Jun",
"July": "Jul",
"August": "Aug",
"September": "Sep",
"October": "Oct",
"November": "Nov",
"December": "Dec",
}
quarter = f"{month_to_abbrev[first_month]}-{month_to_abbrev[third_month]}"
logger.info(f"Extracted quarter from page content: {quarter} {year}")
return (excel_url, year, quarter)
raise NISRADataNotFoundError("Could not extract quarter and year from publication")
except Exception as e:
raise NISRADataNotFoundError(f"Failed to fetch LFS publication list: {e}") from e
[docs]
def get_latest_employment(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest quarterly employment data by age and sex.
Automatically discovers and downloads the most recent quarterly LFS publication
from the NISRA website.
Args:
force_refresh: If True, always download fresh data ignoring cache
Returns:
DataFrame with employment statistics by age group and sex
Example:
>>> df = get_latest_employment()
>>> 'age_group' in df.columns
True
>>> young_adults = df[df['age_group'] == '25 to 29']
>>> len(young_adults) >= 0
True
"""
# Discover latest publication from NISRA website
excel_url, year, quarter = get_latest_lfs_publication_url()
# Download directly using the discovered URL
logger.info(f"Downloading latest LFS data: {quarter} {year}")
# Use download_file directly with the discovered URL
cache_ttl_hours = 90 * 24 # 90 days for quarterly data
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_employment_by_age_sex(file_path)
[docs]
def get_latest_economic_inactivity(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest quarterly economic inactivity data.
Automatically discovers and downloads the most recent quarterly LFS publication
from the NISRA website.
Args:
force_refresh: If True, always download fresh data ignoring cache
Returns:
DataFrame with economic inactivity statistics by sex
Example:
>>> df = get_latest_economic_inactivity()
>>> 'sex' in df.columns
True
"""
# Discover latest publication from NISRA website
excel_url, year, quarter = get_latest_lfs_publication_url()
# Download directly using the discovered URL
logger.info(f"Downloading latest LFS data: {quarter} {year}")
# Use download_file directly with the discovered URL
cache_ttl_hours = 90 * 24 # 90 days for quarterly data
file_path = download_file(excel_url, cache_ttl_hours=cache_ttl_hours, force_refresh=force_refresh)
return parse_economic_inactivity(file_path)
[docs]
def get_quarterly_data(
year: int, quarter: str, tables: list[str] | None = None, force_refresh: bool = False
) -> dict[str, pd.DataFrame]:
"""Get Labour Force Survey data for a specific quarter.
Args:
year: Year (e.g., 2025)
quarter: Quarter string like "Jul-Sep", "Jan-Mar", etc.
tables: List of table names to parse (default: ['employment', 'economic_inactivity'])
Options: 'employment', 'economic_inactivity'
force_refresh: If True, always download fresh data ignoring cache
Returns:
Dictionary mapping table names to DataFrames:
- 'employment': Employment by age and sex (Table 2.15)
- 'economic_inactivity': Economic inactivity data (Table 2.21)
Example:
>>> data = get_quarterly_data(2025, "Jul-Sep")
>>> sorted(data.keys())
['economic_inactivity', 'employment']
>>> emp_df = data['employment']
>>> 'age_group' in emp_df.columns
True
"""
if tables is None:
tables = ["employment", "economic_inactivity"]
file_path = download_quarterly_lfs(year, quarter, force_refresh=force_refresh)
result = {}
if "employment" in tables:
result["employment"] = parse_employment_by_age_sex(file_path)
if "economic_inactivity" in tables:
result["economic_inactivity"] = parse_economic_inactivity(file_path)
return result
# ============================================================================
# Local Government District Employment Functions
# ============================================================================
[docs]
def get_latest_lgd_employment_url() -> tuple[str, int]:
"""Get the URL of the latest LFS Local Government District tables publication.
Uses known URL pattern to construct the file URL. The LGD tables are published
annually but not listed on the main Labour Market page.
Returns:
Tuple of (excel_url, year)
Raises:
NISRADataNotFoundError: If unable to find the latest publication
Example:
>>> url, year = get_latest_lgd_employment_url()
>>> url.startswith('https://')
True
"""
from datetime import datetime
logger.info("Fetching latest LFS LGD employment publication URL...")
# The LGD tables follow a predictable URL pattern
# We know the publication exists for 2024, try current year first then fall back
current_year = datetime.now().year
years_to_try = [current_year, current_year - 1, 2024] # Try current, last year, and known 2024
for year in years_to_try:
# Construct the expected URL
# Pattern: https://www.nisra.gov.uk/publications/labour-force-survey-tables-local-government-districts-2009-YYYY
pub_url = f"{LFS_BASE_URL}/publications/labour-force-survey-tables-local-government-districts-2009-{year}"
try:
response = session.get(pub_url, timeout=30)
if response.status_code == 200:
# Found the publication page, now find the Excel file
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")
# Find Excel file link
for link in soup.find_all("a", href=True):
href = link["href"]
if ".xlsx" in href.lower() and "labour" in href.lower():
excel_url = href
if not excel_url.startswith("http"):
excel_url = f"{LFS_BASE_URL}{excel_url}"
logger.info(f"Found latest LFS LGD tables: {year} at {excel_url}")
return excel_url, year
except Exception:
continue
# If all attempts fail, return the known 2024 file URL as fallback
fallback_url = (
"https://www.nisra.gov.uk/system/files/statistics/2025-07/"
"Labour%20Force%20Survey%20-%20Tables%20for%20Local%20Government%20Districts%202009%20to%202024.xlsx"
)
logger.warning(f"Could not find latest LGD publication via pattern, using known 2024 file: {fallback_url}")
return fallback_url, 2024
[docs]
def parse_employment_by_lgd(file_path: str | Path, year: int = None) -> pd.DataFrame:
"""Parse Table 1.16a: Employment by Local Government District (ages 16+).
Extracts employment statistics for all 11 Northern Ireland LGDs from the annual
LFS LGD tables file.
Args:
file_path: Path to the LFS LGD tables Excel file
year: Year of the data (if not provided, will be extracted from sheet name)
Returns:
DataFrame with columns:
- year: int
- lgd: str (Local Government District name)
- population_16plus: int (thousands)
- economically_active: int (thousands)
- in_employment: int (thousands)
- full_time_employment: int (thousands)
- part_time_employment: int (thousands)
- economically_inactive: int (thousands)
- economic_activity_rate: float (%)
- employment_rate: float (%)
Example:
>>> url, year = get_latest_lgd_employment_url()
>>> path = download_file(url, cache_ttl_hours=180*24)
>>> df = parse_employment_by_lgd(path, year=year)
>>> 'employment_rate' in df.columns
True
"""
logger.info(f"Parsing LFS LGD employment from: {file_path}")
# If year not provided, try to extract from filename or use the latest sheet
if year is None:
# Try to extract from filename
filename_match = re.search(r"(\d{4})\.xlsx", str(file_path))
if filename_match:
year = int(filename_match.group(1))
else:
# Use the latest year sheet
wb = load_workbook(file_path, data_only=True)
# Find numeric sheet names (years)
year_sheets = [int(s) for s in wb.sheetnames if s.isdigit()]
if year_sheets:
year = max(year_sheets)
else:
raise ValueError("Cannot determine year from filename or sheet names")
# Read Table 1.16a (ages 16+)
# Header is in row 6 (0-indexed), data starts in row 7
# Skip first 6 rows to use row 6 as header
df = pd.read_excel(file_path, sheet_name=str(year), skiprows=6, nrows=12)
# Rename columns to standard names
df.columns = [
"lgd",
"population_16plus",
"economically_active",
"in_employment",
"full_time_employment",
"part_time_employment",
"economically_inactive",
"economic_activity_rate",
"employment_rate",
"notes",
]
# Drop notes column and Total row
df = df.drop(columns=["notes"])
df = df[df["lgd"] != "Total"].reset_index(drop=True)
# Add year column
df["year"] = year
# Reorder columns
df = df[
[
"year",
"lgd",
"population_16plus",
"economically_active",
"in_employment",
"full_time_employment",
"part_time_employment",
"economically_inactive",
"economic_activity_rate",
"employment_rate",
]
]
logger.info(f"Parsed {len(df)} LGD employment records for {year}")
return df
[docs]
def get_latest_employment_by_lgd(force_refresh: bool = False) -> pd.DataFrame:
"""Get the latest Employment by Local Government District data.
Downloads and parses the most recent annual LFS LGD tables publication.
Results are cached for 180 days unless force_refresh=True.
Args:
force_refresh: If True, bypass cache and download fresh data
Returns:
DataFrame with employment statistics for all 11 NI Local Government Districts
Example:
>>> df = get_latest_employment_by_lgd()
>>> 'employment_rate' in df.columns
True
>>> len(df) > 0
True
"""
excel_url, year = get_latest_lgd_employment_url()
# Cache for 180 days (published annually)
file_path = download_file(excel_url, cache_ttl_hours=180 * 24, force_refresh=force_refresh)
return parse_employment_by_lgd(file_path, year=year)
[docs]
def validate_labour_market_data(df: pd.DataFrame) -> bool: # pragma: no cover
"""Validate labour market data integrity.
Args:
df: DataFrame from labour market functions
Returns:
True if validation passes, False otherwise
"""
if df.empty:
logger.warning("Labour market data is empty")
return False
# Check for employment-related columns
employment_indicators = ["employed", "unemployed", "rate", "count", "percentage"]
has_employment_data = any(indicator in " ".join(df.columns).lower() for indicator in employment_indicators)
if not has_employment_data:
logger.warning("No employment indicators found in labour market data")
return False
# Check for reasonable employment rates/percentages
rate_cols = [col for col in df.columns if "rate" in col.lower() or "percentage" in col.lower()]
for col in rate_cols:
if (
col in df.columns
and df[col].dtype in ["float64", "int64"]
and ((df[col] < 0).any() or (df[col] > 100).any())
):
logger.warning(f"Employment rates out of range in column {col}")
return False
return True