Source code for bolster.data_sources.gender_pay_gap

"""UK Gender Pay Gap Reporting Data Source.

Provides access to UK Gender Pay Gap (GPG) reporting data published annually by
the UK Government. All employers with 250+ employees are legally required to
report their gender pay gap figures each year.

Data Source:
    **Download page**: https://gender-pay-gap.service.gov.uk/viewing/download

    Annual CSV files are published for each reporting year, covering all UK employers
    with 250+ employees. Data is available from 2017 to present. Northern Ireland
    employers (identifiable by BT postcodes) are included in the UK-wide dataset.

    The reporting deadline is 4 April each year (for the 12-month snapshot period
    ending 5 April the previous year), so data for year Y covers the snapshot date
    of 5 April Y.

Update Frequency: Annual (April each year)
Geographic Coverage: UK-wide. NI employers identifiable via BT postcode prefix.
Licence: Open Government Licence v3.0

Metrics Provided:
    - Mean and median hourly pay gap (%) between male and female employees
    - Mean and median bonus pay gap (%)
    - Proportion of male/female employees receiving a bonus
    - Pay quartile gender composition (lower, lower-middle, upper-middle, upper)
    - Employer metadata (size band, SIC code, Companies House number)

Example:
    >>> from bolster.data_sources import gender_pay_gap
    >>> df = gender_pay_gap.get_data(year=2024)
    >>> 'employer_name' in df.columns
    True
    >>> ni_df = gender_pay_gap.get_data(year=2024, postcode_prefix="BT")
    >>> len(ni_df) > 0
    True
"""

import logging
from io import StringIO

import pandas as pd

from bolster.utils.web import session


[docs]
logger = logging.getLogger(__name__)


# Base URL for gender pay gap CSV downloads

[docs]
GPG_BASE_URL = "https://gender-pay-gap.service.gov.uk/viewing/download-data/{year}"


# First year data is available

[docs]
FIRST_YEAR = 2017


# Column name mapping from raw CSV to snake_case

[docs]
COLUMN_MAPPING = {
    "EmployerName": "employer_name",
    "EmployerId": "employer_id",
    "Address": "address",
    "PostCode": "postcode",
    "CompanyNumber": "company_number",
    "SicCodes": "sic_codes",
    "DiffMeanHourlyPercent": "diff_mean_hourly_percent",
    "DiffMedianHourlyPercent": "diff_median_hourly_percent",
    "DiffMeanBonusPercent": "diff_mean_bonus_percent",
    "DiffMedianBonusPercent": "diff_median_bonus_percent",
    "MaleBonusPercent": "male_bonus_percent",
    "FemaleBonusPercent": "female_bonus_percent",
    "MaleLowerQuartile": "male_lower_quartile",
    "FemaleLowerQuartile": "female_lower_quartile",
    "MaleLowerMiddleQuartile": "male_lower_middle_quartile",
    "FemaleLowerMiddleQuartile": "female_lower_middle_quartile",
    "MaleUpperMiddleQuartile": "male_upper_middle_quartile",
    "FemaleUpperMiddleQuartile": "female_upper_middle_quartile",
    "MaleTopQuartile": "male_top_quartile",
    "FemaleTopQuartile": "female_top_quartile",
    "CompanyLinkToGPGInfo": "company_link_to_gpg_info",
    "ResponsiblePerson": "responsible_person",
    "EmployerSize": "employer_size",
    "CurrentName": "current_name",
    "SubmittedAfterTheDeadline": "submitted_after_deadline",
    "DueDate": "due_date",
    "DateSubmitted": "date_submitted",
}


# Numeric columns to coerce (some cells contain empty strings)

[docs]
NUMERIC_COLUMNS = [
    "diff_mean_hourly_percent",
    "diff_median_hourly_percent",
    "diff_mean_bonus_percent",
    "diff_median_bonus_percent",
    "male_bonus_percent",
    "female_bonus_percent",
    "male_lower_quartile",
    "female_lower_quartile",
    "male_lower_middle_quartile",
    "female_lower_middle_quartile",
    "male_upper_middle_quartile",
    "female_upper_middle_quartile",
    "male_top_quartile",
    "female_top_quartile",
]




[docs]
class GenderPayGapError(Exception):
    """Base exception for gender pay gap data errors."""




[docs]
class GenderPayGapDataNotFoundError(GenderPayGapError):
    """Raised when data for the requested year is not available."""




[docs]
def get_available_years() -> list[int]:
    """Return the list of reporting years with published data.

    Data is published annually. The first year is 2017; data for the current
    year is available once the April reporting deadline has passed.

    Returns:
        List of years (integers) for which data is available, e.g. [2017, ..., 2024].

    Example:
        >>> years = get_available_years()
        >>> len(years) > 0
        True
    """
    current_year = pd.Timestamp.now().year
    # Data for year Y is typically published in April of year Y+1
    # Be conservative: include up to the previous year
    return list(range(FIRST_YEAR, current_year))




[docs]
def get_data(
    year: int,
    postcode_prefix: str | None = None,
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Download and parse gender pay gap data for a given reporting year.

    Returns UK-wide data by default. Use ``postcode_prefix`` to filter to a
    specific region — e.g. ``"BT"`` for Northern Ireland, ``"EH"`` for Edinburgh,
    ``"M"`` for Manchester. The full dataset is always downloaded first; filtering
    happens in-memory.

    Args:
        year: The reporting year (e.g. 2024 for the snapshot date of 5 April 2024).
              Must be between 2017 and the most recent available year.
        postcode_prefix: If given, only return employers whose postcode starts with
                         this prefix (case-insensitive). ``None`` (default) returns
                         all UK employers. Common values: ``"BT"`` (Northern Ireland),
                         ``"EH"`` (Edinburgh), ``"G"`` (Glasgow), ``"M"`` (Manchester).
        force_refresh: If True, bypass any cached response. Has no effect currently
                       as responses are streamed directly; reserved for future caching.

    Returns:
        DataFrame with one row per employer, columns:

        - employer_name: str
        - employer_id: str
        - address: str
        - postcode: str
        - company_number: str
        - sic_codes: str
        - diff_mean_hourly_percent: float — mean hourly pay gap (positive = men paid more)
        - diff_median_hourly_percent: float
        - diff_mean_bonus_percent: float
        - diff_median_bonus_percent: float
        - male_bonus_percent: float — % of male employees receiving a bonus
        - female_bonus_percent: float
        - male_lower_quartile: float — % of lower pay quartile who are male
        - female_lower_quartile: float
        - male_lower_middle_quartile: float
        - female_lower_middle_quartile: float
        - male_upper_middle_quartile: float
        - female_upper_middle_quartile: float
        - male_top_quartile: float
        - female_top_quartile: float
        - company_link_to_gpg_info: str
        - responsible_person: str
        - employer_size: str — e.g. "250 to 499", "500 to 999", "5000 to 19,999", "20,000 or more"
        - current_name: str
        - submitted_after_deadline: bool
        - due_date: datetime
        - date_submitted: datetime
        - reporting_year: int — the reporting year (same as ``year`` arg)

    Raises:
        GenderPayGapDataNotFoundError: If data for the requested year is not available.
        GenderPayGapError: If the download or parse fails.

    Example:
        >>> df = get_data(year=2024)
        >>> 'employer_name' in df.columns
        True
        >>> ni = get_data(year=2024, postcode_prefix="BT")
        >>> len(ni) > 0
        True
    """
    available = get_available_years()
    if year not in available:
        raise GenderPayGapDataNotFoundError(
            f"Year {year} is not available. Available years: {min(available)}–{max(available)}"
        )

    url = GPG_BASE_URL.format(year=year)
    logger.info(f"Downloading GPG data for {year} from {url}")

    try:
        response = session.get(url, timeout=60)
        response.raise_for_status()
    except Exception as e:
        raise GenderPayGapError(f"Failed to download GPG data for {year}: {e}") from e

    try:
        df = pd.read_csv(StringIO(response.text))
    except Exception as e:
        raise GenderPayGapError(f"Failed to parse GPG CSV for {year}: {e}") from e

    # Rename columns to snake_case
    df = df.rename(columns=COLUMN_MAPPING)

    # Coerce numeric columns (empty strings → NaN)
    for col in NUMERIC_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Parse datetime columns
    for col in ("due_date", "date_submitted"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    # Boolean column
    if "submitted_after_deadline" in df.columns:
        df["submitted_after_deadline"] = df["submitted_after_deadline"].map(
            {"True": True, "False": False, True: True, False: False}
        )

    # Add derived columns
    df["reporting_year"] = year

    if postcode_prefix is not None:
        prefix = postcode_prefix.upper()
        mask = df["postcode"].str.upper().str.startswith(prefix, na=False)
        df = df[mask].copy()
        logger.info(f"Filtered to {len(df)} employers with postcode prefix '{prefix}' for {year}")

    logger.info(f"Loaded {len(df)} employers for reporting year {year}")
    return df




[docs]
def get_all_years(postcode_prefix: str | None = None) -> pd.DataFrame:
    """Download and combine gender pay gap data for all available years.

    Useful for trend analysis across multiple reporting years.

    Args:
        postcode_prefix: If given, filter to employers whose postcode starts with
                         this prefix before combining. See :func:`get_data` for details.

    Returns:
        Combined DataFrame with all years, including a ``reporting_year`` column.
        See :func:`get_data` for full column documentation.

    Example:
        >>> df = get_all_years(postcode_prefix="BT")
        >>> 'reporting_year' in df.columns
        True
    """
    frames = []
    for year in get_available_years():
        try:
            df = get_data(year=year, postcode_prefix=postcode_prefix)
            frames.append(df)
            logger.info(f"Loaded {year}: {len(df)} employers")
        except GenderPayGapError as e:
            logger.warning(f"Skipping year {year}: {e}")

    if not frames:
        raise GenderPayGapError("No data could be loaded for any year")

    return pd.concat(frames, ignore_index=True)




[docs]
def validate_data(df: pd.DataFrame) -> bool:
    """Validate a gender pay gap DataFrame for internal consistency.

    Checks:
    - Required columns are present
    - Pay quartile columns sum to ~100% (male + female = 100 per quartile)
    - Hourly pay gap values are within plausible range (-100% to +100%)

    Args:
        df: DataFrame from :func:`get_data` or :func:`get_all_years`.

    Returns:
        True if validation passes.

    Raises:
        GenderPayGapError: If any validation check fails.

    Example:
        >>> df = get_data(year=2024)
        >>> validate_data(df)
        True
    """
    required_columns = {
        "employer_name",
        "postcode",
        "diff_mean_hourly_percent",
        "diff_median_hourly_percent",
        "employer_size",
        "reporting_year",
    }
    missing = required_columns - set(df.columns)
    if missing:
        raise GenderPayGapError(f"Missing required columns: {missing}")

    # Check quartile columns sum to ~100 (allow rounding tolerance)
    quartile_pairs = [
        ("male_lower_quartile", "female_lower_quartile"),
        ("male_lower_middle_quartile", "female_lower_middle_quartile"),
        ("male_upper_middle_quartile", "female_upper_middle_quartile"),
        ("male_top_quartile", "female_top_quartile"),
    ]
    for male_col, female_col in quartile_pairs:
        if male_col in df.columns and female_col in df.columns:
            totals = df[male_col].fillna(0) + df[female_col].fillna(0)
            # Only check rows where both are non-null
            mask = df[male_col].notna() & df[female_col].notna()
            if mask.any():
                bad = ((totals[mask] - 100).abs() > 1.5).sum()
                if bad > 0:
                    raise GenderPayGapError(f"Quartile columns {male_col}+{female_col} don't sum to 100 for {bad} rows")

    # Hourly pay gap within plausible range.
    # The service accepts extreme negatives (e.g. -757%) from employers with
    # heavily female-dominated lower pay bands; cap check at -1000/+100.
    for col in ("diff_mean_hourly_percent", "diff_median_hourly_percent"):
        if col in df.columns:
            valid = df[col].dropna()
            if len(valid) > 0 and ((valid < -1000) | (valid > 100)).any():
                raise GenderPayGapError(f"Column {col} contains implausible values outside [-1000, 100]")

    logger.info(f"Validation passed for {len(df)} employers")
    return True