"""UK Companies House Data Integration.
Data Source: UK Companies House provides comprehensive company registration data through
their bulk download service at http://download.companieshouse.gov.uk/en_output.html.
The service provides complete company information including names, addresses, status,
and registration details for all active and dissolved companies in the UK.
Update Frequency: The Companies House bulk data is updated monthly, typically available
by the first week of each month. The data reflects the state of company registrations
as of the snapshot date.
Example:
Basic usage for querying company data:
>>> from bolster.data_sources import companies_house
>>> farset_companies = list(companies_house.query_basic_company_data(
... companies_house.companies_house_record_might_be_farset
... ))
>>> len(farset_companies) > 0
True
The module provides utilities for downloading and parsing the complete UK company registry,
with built-in filtering capabilities for targeted analysis.
"""
import csv
import logging
from collections.abc import Callable, Iterator
import bs4
from tqdm.auto import tqdm
from bolster import always, dict_concat_safe
from bolster.utils.web import download_extract_zip, session
[docs]
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
def get_basic_company_data_url() -> str:
    """Parse the companies house website to get the current URL for the 'BasicCompanyData'.

    Fetches the bulk-download index page and returns the absolute URL built
    from the first ``<a>`` element whose ``href`` starts with
    ``BasicCompanyDataAsOneFile``.

    Currently uses the 'one file' method but it could be split into the multi
    files for memory efficiency.

    Returns:
        The download URL of the single-file BasicCompanyData archive.

    Raises:
        LookupError: If the page contains no link whose ``href`` starts with
            ``BasicCompanyDataAsOneFile`` (this used to surface as an
            ``UnboundLocalError`` from the unassigned local).
    """
    base_url = "http://download.companieshouse.gov.uk/en_output.html"
    # TODO: Network integration testing - requires active Companies House website
    s = bs4.BeautifulSoup(session.get(base_url).content, features="lxml")  # pragma: no cover
    for a in s.find_all("a"):  # pragma: no cover
        # An anchor without an href gives None here; skip it instead of
        # failing on ``None.startswith``.
        href = a.get("href")  # pragma: no cover
        if href and href.startswith("BasicCompanyDataAsOneFile"):  # pragma: no cover
            # assume first time lucky
            return f"http://download.companieshouse.gov.uk/{href}"  # pragma: no cover
    raise LookupError(  # pragma: no cover
        f"No 'BasicCompanyDataAsOneFile' link found on {base_url}"
    )
[docs]
def query_basic_company_data(query_func: Callable[..., bool] = always) -> Iterator[dict]:
    """Stream the rows of the basic company data that satisfy ``query_func``.

    Looks up the current basic company data URL, iterates over every
    ``(filename, data)`` pair that ``download_extract_zip`` produces for it,
    decodes each item of ``data`` as UTF-8 and parses the result as CSV with
    a header row.

    Args:
        query_func: Predicate called with each parsed row dict; the row is
            yielded when the result is truthy. Defaults to ``always``
            (presumably accepts every row - confirm against ``bolster.always``).

    Yields:
        Each parsed row for which ``query_func(row)`` is truthy.
    """
    # TODO: Network integration testing - requires Companies House data download
    archive_url = get_basic_company_data_url()  # pragma: no cover
    for _member_name, raw_lines in tqdm(download_extract_zip(archive_url)):  # pragma: no cover
        # DictReader needs text lines, so decode lazily as it consumes them.
        text_lines = (raw.decode("utf-8") for raw in raw_lines)  # pragma: no cover
        for record in tqdm(csv.DictReader(text_lines)):  # pragma: no cover
            if not query_func(record):  # pragma: no cover
                continue  # pragma: no cover
            yield record  # pragma: no cover
[docs]
def companies_house_record_might_be_farset(r: dict) -> bool:
    """A heuristic function for working out if a record in the companies house registry *might* be based in Farset Labs.

    Almost certainly incomplete and needs more testing/validation.

    The record's postcode must equal ``bt125gh`` once lower-cased and
    stripped of spaces. The care-of and first two address lines are then
    joined and lower-cased, and the record is accepted when that text
    mentions ``farset``, or mentions ``unit 1`` without mentioning
    ``unit 10``, ``unit 17`` or ``unit 18``.

    Args:
        r: A company record mapping. It must contain the
            ``RegAddress.PostCode`` key; the address keys are read through
            ``dict_concat_safe`` with a default of ``""``.

    Returns:
        True if the record looks like it is registered at Farset Labs,
        False otherwise (including when the postcode value is ``None``).

    Raises:
        KeyError: If ``r`` has no ``RegAddress.PostCode`` key at all.
    """
    # csv.DictReader pads short rows with None; treat that as "no postcode"
    # rather than letting AttributeError abort a scan of the whole registry.
    postcode = r["RegAddress.PostCode"] or ""
    if postcode.lower().replace(" ", "") != "bt125gh":
        return False

    address_line = ",".join(
        map(
            str,
            dict_concat_safe(
                r,
                [
                    "RegAddress.CareOf",
                    "RegAddress.AddressLine1",
                    "RegAddress.AddressLine2",  # This appears to be optional now
                ],
                default="",
            ),
        )
    ).lower()

    if "farset" in address_line:
        return True

    # NOTE(review): "unit 1" is a plain substring test, so only the units
    # listed here are excluded; e.g. "unit 11" or "unit 19" would still match.
    if any(unit in address_line for unit in ("unit 10", "unit 18", "unit 17")):
        return False
    return "unit 1" in address_line
[docs]
def get_companies_house_records_that_might_be_in_farset() -> Iterator[dict]:
    """Yield the Companies House records that pass the Farset Labs heuristic.

    Runs ``query_basic_company_data`` with
    ``companies_house_record_might_be_farset`` as the filter and yields each
    matching record in turn.
    """
    # TODO: Network integration testing - requires Companies House data download
    matches = query_basic_company_data(companies_house_record_might_be_farset)  # pragma: no cover
    for record in matches:  # pragma: no cover
        yield record  # pragma: no cover