"""Wikipedia Northern Ireland Data Integration.
Data Source: Wikipedia provides publicly edited information about Northern Ireland institutions
and governance through structured tables at https://en.wikipedia.org/wiki/Northern_Ireland_Executive.
This module accesses historical composition data for the Northern Ireland Executive, including
formation dates, dissolution dates, and leadership appointments since devolution began in 1999.
Update Frequency: Wikipedia content is updated continuously by volunteer editors as political
events occur. Executive composition changes are typically reflected within days of official
announcements. The module specifically parses the "Historical composition of the Northern Ireland
Executive" table which maintains a comprehensive record of all executives since devolution.
Example:
Extract NI Executive historical data and analyze political stability:
>>> from bolster.data_sources import wikipedia
>>> executives = wikipedia.get_ni_executive_basic_table()
>>> 'Duration' in executives.columns
True
>>> len(executives) > 0
True
This module provides utilities for analyzing Northern Ireland's political history and executive
stability patterns since the establishment of devolved government.
"""
import datetime
import logging
import dateparser
import numpy as np
import pandas as pd
from bolster.utils.web import session
[docs]
logger = logging.getLogger(__name__)
[docs]
def get_ni_executive_basic_table() -> pd.DataFrame:
"""Get Northern Ireland Executive composition data from Wikipedia.
Extracts historical data from the "Historical composition of the Northern Ireland Executive"
table at: https://en.wikipedia.org/wiki/Northern_Ireland_Executive#Composition_since_devolution
Returns:
DataFrame with Executive index and columns:
- Established: datetime64[ns] - When the executive was formed
- Dissolved: datetime64[ns] - When the executive ended
- Duration: timedelta64[ns] - How long the executive lasted
- Interregnum: timedelta64[ns] - Gap until next executive
Example:
>>> df = get_ni_executive_basic_table()
>>> sorted(df.columns.tolist())
['Dissolved', 'Duration', 'Established', 'Interregnum']
>>> len(df) > 0
True
"""
# Use a custom user agent to avoid Wikipedia 403 errors
# Wikipedia blocks default pandas/urllib user agents
headers = {
"User-Agent": "Bolster Data Science Library/0.3.4 (https://github.com/andrewbolster/bolster; andrew.bolster@gmail.com)"
}
url = "https://en.wikipedia.org/wiki/Northern_Ireland_Executive"
response = session.get(url, headers=headers)
response.raise_for_status()
tables = pd.read_html(pd.io.common.StringIO(response.text))
tables[4].columns = range(len(tables[4].columns))
# Get rid of the nasty multi index
executive_events = tables[4][[0, 1, 2, 3, 4, 5, 6]]
executive_events.columns = [
"Executive",
"Date",
"Event",
"vFM",
"FM",
"vDFM",
"DFM",
]
# Get rid of the comments row at the bottom
executive_events = executive_events[:-1]
# Clean up the 'Executive' as for some reason wikipedians count the caretakers differently.
executive_events["Executive"] = executive_events["Executive"].map(lambda x: x.split("(")[0])
# Use the OFMDFM posts as a proxy for 'active' to flatten out the range of reasons for failure.
executive_events["Active"] = (
executive_events[["vFM", "FM", "vDFM", "DFM"]].replace("Vacant", None).replace(np.nan, None).any(axis=1)
)
executive_durations = executive_events.groupby(["Executive", "Active"])["Date"].first().unstack()
executive_durations.columns = ["Dissolved", "Established"]
executive_durations = executive_durations[reversed(executive_durations.columns)]
executive_durations = executive_durations.map(lambda s: dateparser.parse(s) if isinstance(s, str) else s)
executive_durations["Duration"] = executive_durations.diff(axis=1).iloc[:, -1:]
executive_dissolutions = pd.concat(
[
executive_durations["Dissolved"],
executive_durations["Established"].shift(-1),
],
axis=1,
)
executive_dissolutions = executive_dissolutions.apply(
lambda r: r.Established - r.Dissolved
if not pd.isnull(r.Established)
else datetime.datetime.today().replace(hour=0, minute=0, second=0, microsecond=0) - r.Dissolved,
axis=1,
)
executive_durations["Interregnum"] = executive_dissolutions
# Fix last / most recent
executive_durations.loc[executive_durations.index[-1], "Duration"] = (
datetime.datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
- executive_durations["Established"].iloc[-1]
)
return executive_durations