Source code for bolster.data_sources.eoni

"""Northern Ireland Electoral Office (EONI) Election Data Integration.

Data Source: The Electoral Office for Northern Ireland provides official election results
and data through their website at https://www.eoni.org.uk. This module accesses NI Assembly
election results from 2003 onwards, including constituency-level results, candidate information,
and vote tallies for all electoral areas in Northern Ireland.

Update Frequency: Electoral data is updated after each election cycle. NI Assembly elections
typically occur every 4-5 years, with the most recent elections in 2022, 2017, and 2016.
Historical data remains static once published, with occasional corrections or clarifications.

Example:
    Retrieve election results for the 2022 NI Assembly election:

        >>> from bolster.data_sources import eoni
        >>> results_2022 = eoni.get_results(2022)
        >>> isinstance(results_2022, dict)
        True
        >>> len(results_2022) > 0
        True
        >>> constituency = next(iter(results_2022))
        >>> isinstance(constituency, str)
        True
        >>> 'candidates' in results_2022[constituency]
        True

The module supports automated ingestion of NI Assembly election results with constituency-level
detail and candidate performance data.

Implementation Status:
✅ 2022, 2017, 2016 elections supported
⏳ 2011, 2007, 2003 elections (planned)
"""

import datetime
import logging
import re
from collections.abc import Iterable
from typing import AnyStr

import pandas as pd
from bs4 import BeautifulSoup

from bolster.utils.web import get_excel_dataframe, session, ua

logger = logging.getLogger(__name__)

# Request headers sent with every EONI request. The `get_page` docstring notes
# that EONI answers 403 unless a 'conventional' browser user agent is presented.
# NOTE(review): the header *value* itself starts with the literal text
# "User-Agent: " -- preserved exactly as found; confirm whether that is intended.
_headers = {
    "user-agent": (
        f"User-Agent: {ua} Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    )
}

# Site root; `get_page` prepends it to site-relative paths.
_base_url = "https://www.eoni.org.uk"
def get_page(path: AnyStr) -> BeautifulSoup:
    """Fetch a page from EONI.org.uk and return it parsed as a BeautifulSoup instance.

    ``path`` is appended directly to the site root (``_base_url``), so it is
    expected to be site-relative and start with ``/``.

    Note:
        EONI is trying to block people from scraping and will return a 403 error
        if you don't pass a 'conventional' user agent, which is why the
        module-level ``_headers`` are sent with the request.

    Any exception raised by ``raise_for_status()`` on the response propagates
    to the caller.

    >>> page = get_page("/Elections/")
    >>> page.find('title').contents[0].strip()
    'Elections | The Electoral Office for Northern Ireland'
    """
    url = _base_url + path
    response = session.get(url, headers=_headers)
    response.raise_for_status()
    return BeautifulSoup(response.content, features="html.parser")
def normalise_constituencies(cons_str: str) -> str:
    """Project an external/unconventional constituency name into the normalised format.

    Constituency names differ in case and in how "and" is written between
    sources. The normalised form is lower case, with every `` & `` (ampersand
    surrounded by single spaces) spelled out as `` and ``.

    >>> normalise_constituencies('Newry & Armagh')
    'newry and armagh'
    """
    lowered = cons_str.lower()
    # Splitting on " & " and re-joining with " and " replaces each occurrence.
    return " and ".join(lowered.split(" & "))
[docs] def get_metadata_from_df( df: pd.DataFrame, ) -> dict[str, int | str | datetime.datetime]: """Extract Ballot metadata from the table header(s) of an XLS formatted result sheet, as output from `get_excel_dataframe`. # TODO this could probably be done better as a `dataclass` Returns: dict of 'stage': int, 'date': datetime 'constituency': str (lower) 'eligible_electorate': int 'votes_polled': int 'number_to_be_elected': int 'invalid_votes': int 'electoral_quota': int """ stage_n_catcher = re.compile(r"^Stage (\d+)") return { "stage": int(re.match(stage_n_catcher, df.columns[5]).group(1)), # should have been just int(df.columns[5].split()[-1])., but someone insisted on messing up 2017 "date": df.columns[10], "constituency": normalise_constituencies(df.iloc[0, 3]), "eligible_electorate": int(df.iloc[1, 3]), "votes_polled": int(df.iloc[2, 3]), "number_to_be_elected": int(df.iloc[1, 6]), "total_valid_votes": int(df.iloc[2, 6]), "invalid_votes": int(df.iloc[1, 9]), "electoral_quota": int(df.iloc[1, 12]), }
def get_candidates_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the candidate name and party columns from a first stage sheet.

    Reads the fixed block at rows 9-28, columns 2-3 (room for 20 candidates).
    Cells equal to 0 are treated as empty, and any row missing either the name
    or the party is dropped.

    Returns:
        DataFrame with columns ``candidate_name`` and ``candidate_party`` and a
        fresh 0-based index.
    """
    candidate_block = df.iloc[9:29, 2:4].set_axis(
        ["candidate_name", "candidate_party"], axis="columns"
    )
    without_zeros = candidate_block.replace(0, None)
    return without_zeros.dropna().reset_index(drop=True)
def get_stage_votes_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the votes from each stage as a mapped column for each stage, i.e. stages 1...N.

    N is the ``stage`` value reported by `get_metadata_from_df` for the sheet.

    Returns:
        DataFrame with one column per stage number (1...N) and one row per
        position in the vote block. Cells equal to 0 are replaced with None
        and rows that are empty in every stage are dropped. An empty DataFrame
        is returned when no stage yields any data.
    """
    final_stage = get_metadata_from_df(df)["stage"]

    # Stages are 1-based: iterate 1..N inclusive. `range(N)` would request the
    # non-existent stage 0 and never reach the final stage N.
    votes_by_stage = {}
    for stage in range(1, final_stage + 1):
        stage_votes = extract_stage_n_votes(df, stage)
        if stage_votes is not None:
            votes_by_stage[stage] = stage_votes

    if not votes_by_stage:
        # pd.concat raises ValueError when there is nothing to concatenate.
        return pd.DataFrame()

    return (
        pd.concat(votes_by_stage)
        .unstack()
        .T.replace(0, None)
        .dropna(how="all")
    )
def get_stage_transfers_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the transfers from each stage as a mapped column for each stage, i.e. stages 2...N.

    N is the ``stage`` value reported by `get_metadata_from_df` for the sheet.
    Stage 1 has no transfers, so the first column is stage 2.

    Returns:
        DataFrame with one column per stage number (2...N) and one row per
        position in the transfer block. Cells equal to 0 are replaced with
        None and rows that are empty in every stage are dropped. An empty
        DataFrame is returned when the count has no stage with transfers.
    """
    final_stage = get_metadata_from_df(df)["stage"]

    # Transfers start at stage 2 and run through the final stage N inclusive.
    # `range(N)` would stop at N - 1 and leave nothing at all when N <= 2.
    transfers_by_stage = {}
    for stage in range(2, final_stage + 1):
        stage_transfers = extract_stage_n_transfers(df, stage)
        if stage_transfers is not None:
            transfers_by_stage[stage] = stage_transfers

    if not transfers_by_stage:
        # pd.concat raises ValueError when there is nothing to concatenate.
        return pd.DataFrame()

    return (
        pd.concat(transfers_by_stage)
        .unstack()
        .T.replace(0, None)
        .dropna(how="all")
    )
[docs] def extract_stage_n_votes(df: pd.DataFrame, n: int) -> pd.Series | None: """Extract the votes from a given stage N. Note: This will include trailing, unaligned `Nones` which must be cleaned up at the Ballot level """ if n == 0: return None if n < 10: row_offset = 9 col_offset = 4 + (2 * (n - 1)) else: row_offset = 55 col_offset = 6 + (2 * (n - 10)) return df.iloc[row_offset : row_offset + 20, col_offset].reset_index(drop=True)
[docs] def extract_stage_n_transfers(df: pd.DataFrame, n: int) -> pd.Series | None: """Extract the votes from a given stage N. Note: This will include trailing, unaligned `Nones` which must be cleaned up at the Ballot level Stage Transfers are associated with the 'next' stage, i.e. stage 1 has no transfers """ if n <= 1: return None if n < 10: row_offset = 9 col_offset = 5 + (2 * (n - 2)) else: row_offset = 55 col_offset = 5 + (2 * (n - 10)) return df.iloc[row_offset : row_offset + 20, col_offset].reset_index(drop=True)
def get_results_from_sheet(sheet_url: AnyStr) -> dict[str, pd.DataFrame | dict]:
    """Download one result sheet and parse it into its component tables.

    The sheet is fetched with `get_excel_dataframe` using the module-level
    request headers, then handed to each of the ``get_*_from_df`` parsers.

    Returns:
        dict with keys ``candidates``, ``stage_votes``, ``stage_transfers``
        (DataFrames) and ``metadata`` (dict).
    """
    sheet = get_excel_dataframe(sheet_url, requests_kwargs={"headers": _headers})

    # Metadata is parsed first, so a bad header fails before the table parsers run.
    metadata = get_metadata_from_df(sheet)

    table_parsers = (
        ("candidates", get_candidates_from_df),
        ("stage_votes", get_stage_votes_from_df),
        ("stage_transfers", get_stage_transfers_from_df),
    )
    parsed = {key: parse(sheet) for key, parse in table_parsers}
    parsed["metadata"] = metadata
    return parsed
[docs] def get_results(year: int) -> dict[str, pd.DataFrame | dict]: """Get election results for a specific year from EONI website.""" results_listing_dir = "/results-data/" results_listing_path = { 2022: "ni-assembly-election-2022-results/", } results = {} results_listing_page = get_page(results_listing_dir + results_listing_path[year]) for sheet_url in find_xls_links_in_page(results_listing_page): data = get_results_from_sheet(sheet_url) results[data["metadata"]["constituency"]] = data return results
def validate_election_results(results: dict[str, dict]) -> bool:  # pragma: no cover
    """Validate election results data integrity.

    A constituency counts as valid when its entry is a dict holding the
    ``candidates``, ``stage_votes`` and ``metadata`` keys and its
    ``candidates`` value is a non-empty DataFrame with a ``candidate_name``
    or ``name`` column. Every failed check is logged as a warning.

    Args:
        results: Dictionary of election results by constituency

    Returns:
        True if validation passes, False otherwise (no results at all, or
        fewer than 80% of the constituencies are valid)
    """
    if not results:
        logger.warning("Election results data is empty")
        return False

    required_keys = {"candidates", "stage_votes", "metadata"}
    name_columns = ("candidate_name", "name")
    valid_constituencies = 0

    for constituency, data in results.items():
        if not isinstance(data, dict):
            logger.warning(f"Invalid data structure for {constituency}")
            continue

        missing = required_keys - set(data.keys())
        if missing:
            logger.warning(f"Missing required keys in {constituency}: {missing}")
            continue

        # Check candidates DataFrame
        candidates = data["candidates"]
        if not isinstance(candidates, pd.DataFrame) or candidates.empty:
            logger.warning(f"Invalid or empty candidates data for {constituency}")
            continue

        if not any(column in candidates.columns for column in name_columns):
            logger.warning(f"Missing candidate names in {constituency}")
            continue

        valid_constituencies += 1

    # At least 80% should be valid
    if valid_constituencies < len(results) * 0.8:
        logger.warning(f"Only {valid_constituencies}/{len(results)} constituencies have valid data")
        return False
    return True