diff --git a/.gitignore b/.gitignore
index 12ca52a..fdc36a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -890,5 +890,8 @@ docs/_linkcheck/
 *.xls
 *.xlsx
 
+# Font file
+*.ttf
+
 # ignoring all contents of top level data folder
 data/
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..52c509b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,20 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.8.3
+    hooks:
+      - id: bandit
+        name: bandit - Checks for vulnerabilities
+        args: ["-c", "pyproject.toml"]
+        additional_dependencies: ["bandit[toml]"]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.12
+    hooks:
+      # Run ruff linter.
+      - id: ruff-check
+        types_or: [ python, pyi ]
+        args: [ --fix ]
+      # Run ruff formatter.
+      - id: ruff-format
+        types_or: [ python, pyi ]
\ No newline at end of file
diff --git a/README.md b/README.md
index a893090..2eae8ca 100644
--- a/README.md
+++ b/README.md
@@ -180,7 +180,12 @@ Your file structure should look like the following. Text in red are the folders
 
 [Clicking this link will open the image in a separate window to allow you to zoom in if needed.](https://github.com/user-attachments/assets/196538ad-8df7-4011-a696-8d7744501260)
 
-### 4.3.6 Running the pipeline
+### 4.3.6 Download the required font
+To create the horizontal bar chart, the Open Sans font is required.
+Download the Open Sans font from [Google Fonts](https://fonts.google.com/specimen/Open+Sans).
+Manually save the downloaded font folder inside the Data folder, under `output_data/bar_charts`.
+
+### 4.3.7 Running the pipeline
 The entry point for the pipeline is stored within the package and called `main_pipeline.py`. To run the pipeline, run the following code in the terminal (either in the root directory of the project, or by specifying the path to `main_pipeline.py` from elsewhere).
 
@@ -227,6 +232,4 @@ This project structure is based on the [`govcookiecutter` template project][govc
 
 If you want to help us build and improve `area_classification`, please take a look at our [contributing guidance][contributing].
 
 # 11.0 Contacts
-[ONS Geography inbox](mailto:ons.geography@ons.gov.uk)
-
-
+[ONS Geography inbox](mailto:ons.geography@ons.gov.uk)
\ No newline at end of file
diff --git a/area_classification/__init__.py b/area_classification/__init__.py
index 5202e22..988c50c 100644
--- a/area_classification/__init__.py
+++ b/area_classification/__init__.py
@@ -3,10 +3,8 @@ import sys
 
 from pathlib import Path
 
-#This ensures that when the log prints it includes the time, the level and the message associated
-logging_str = (
-    "%(asctime)s - %(levelname)s - %(message)s"
-)
+# This ensures that when the log prints it includes the time, the level and the associated message
+logging_str = "%(asctime)s - %(levelname)s - %(message)s"
 
 # Get the root directory by going up two levels
 root_dir = Path(__file__).resolve().parents[1]
@@ -15,11 +13,12 @@ os.makedirs(log_dir, exist_ok=True)
 
 log_filepath = os.path.join(log_dir, "running_log.log")
 
-#If level is set to INFO it won't show DEBUG messages. DEBUG includes all messages associated with this repo.
+# If level is set to INFO it won't show DEBUG messages. DEBUG includes all
+# messages associated with this repo.
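+# (Illustrative note, not part of this change: passing level=logging.DEBUG to
+# basicConfig below would also surface the logger.debug(...) calls used in clustering.py.)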
 logging.basicConfig(
     level=logging.INFO,
     format=logging_str,
     handlers=[logging.FileHandler(log_filepath), logging.StreamHandler(sys.stdout)],
 )
 
-logger = logging.getLogger("area_classification")
\ No newline at end of file
+logger = logging.getLogger("area_classification")
diff --git a/area_classification/clustering/clustering.py b/area_classification/clustering/clustering.py
index d003087..81d1302 100644
--- a/area_classification/clustering/clustering.py
+++ b/area_classification/clustering/clustering.py
@@ -1,29 +1,32 @@
 # Note: Supergroup = cluster, group = subcluster, subgroup = subsubcluster.
+import logging
+import os
+from typing import Union
+
+import matplotlib.pyplot as plt
 import pandas as pd
-import numpy as np
-from sklearn.cluster import KMeans
 from clustergram import Clustergram
-import matplotlib.pyplot as plt
-from typing import Union
-import os
-import logging
+from sklearn.cluster import KMeans
+
+from area_classification.utilities.loading_data import load_data
 
 logger = logging.getLogger(__name__)
 
-from area_classification.utilities.load_config import load_config
-from area_classification.utilities.loading_data import load_data
 
-def clustering_wrapper(config: dict,
-                       input_dataframe: Union[pd.DataFrame, str],
-                       number_of_clusters: int,
-                       n_init: int,
-                       output_directory: str,
-                       clustergram_directory: str,
-                       random_seed: int = None) -> pd.DataFrame:
+def clustering_wrapper(
+    config: dict,
+    input_dataframe: Union[pd.DataFrame, str],
+    number_of_clusters: int,
+    n_init: int,
+    output_directory: str,
+    clustergram_directory: str,
+    random_seed: int = None,
+) -> pd.DataFrame:
     """
-    Performs hierarchical K-means clustering on input data, generating supergroups, groups, and subgroups.
-    Saves cluster assignments and clustergram plots to specified directories.
+    Performs hierarchical K-means clustering on input data, generating
+    supergroups, groups, and subgroups. Saves cluster assignments and
+    clustergram plots to specified directories.
 
     Parameters
     ----------
@@ -50,7 +53,7 @@ def clustering_wrapper(config: dict,
     # Create folders to save the outputs into
     os.makedirs(output_directory, exist_ok=True)
     os.makedirs(clustergram_directory, exist_ok=True)
-    
+
     if isinstance(input_dataframe, str):
         # If a file path is provided, load the data from the CSV file
         logger.info(f"Loading data from {input_dataframe}")
@@ -62,8 +65,11 @@ def clustering_wrapper(config: dict,
         variable_df.set_index(variable_df.columns[0], inplace=True)
         missing_values = variable_df.isnull().sum().sum()
         if missing_values > 0:
-            logger.warning(f"Warning: {missing_values} missing values found in input data. Missing values will be replaced with 0.")
-            variable_df.fillna(0, inplace=True)
+            logger.warning(
+                f"Warning: {missing_values} missing values found in input data."
+                + " Missing values will be replaced with 0."
+ ) + variable_df.fillna(0, inplace=True) else: raise ValueError("Input must be a file path (str) or a pandas DataFrame.") @@ -71,110 +77,127 @@ def clustering_wrapper(config: dict, # Ensure number_of_clusters does not exceed the number of data points # for example, will fail if 10 data points attempted to group in 11 clusters if len(variable_df) < number_of_clusters: - logger.warning(f"Warning: Reducing number_of_clusters from {number_of_clusters} to {len(variable_df)}.") - number_of_clusters = len(variable_df) - - # Create a clustergram from all the data to establish number of supergroups (clusters) for K means - create_clustergram(variable_df, - number_of_clusters, - n_init, - save_location=clustergram_directory+"/supergroup_clustergram.png", - random_seed=random_seed) - + logger.warning( + f"Warning: Reducing number_of_clusters from {number_of_clusters} to {len(variable_df)}." + ) + number_of_clusters = len(variable_df) + + # Create a clustergram from all the data to establish number of + # supergroups (clusters) for K means + create_clustergram( + variable_df, + number_of_clusters, + n_init, + save_location=clustergram_directory + "/supergroup_clustergram.png", + random_seed=random_seed, + ) + logger.info("create supergroup clustergrams completed.") # Pause for user input before proceeding input("Press Enter to continue with supergroups creation...") - + # Assign the file path to save the supergroup cluster output - supergroup_output_filepath = output_directory+"/cluster_assignments/supergroups_clustering_output.csv" - + supergroup_output_filepath = ( + output_directory + "/cluster_assignments/supergroups_clustering_output.csv" + ) + # Run the K means clustering to assign supergroups - supergroup_variable_df = run_kmeans(variable_df, - number_of_clusters, - n_init, - supergroup_output_filepath, - random_seed) + supergroup_variable_df = run_kmeans( + variable_df, number_of_clusters, n_init, supergroup_output_filepath, random_seed + ) logger.info("Kmeans run completed.") # Pause for user input before proceeding logger.info(f"Unique clusters at this stage: {supergroup_variable_df['cluster'].unique()}") logger.info("Check that dictionary in config for subsubclustering mapping is correct") input("Press Enter to continue to move onto groups...") - + ###GROUP SECTION ### - # Create a clustergram for each supergroup to establish number of groups (subclusters) for K means - create_subcluster_clustergrams(cluster_variable_df=supergroup_variable_df, - clustergram_directory=clustergram_directory, - number_of_clusters=number_of_clusters, - drop_columns=['cluster'], - cluster_col_name='cluster', - n_init=n_init, - random_seed=random_seed) + # Create a clustergram for each supergroup to establish number of + # groups (subclusters) for K means + create_subcluster_clustergrams( + cluster_variable_df=supergroup_variable_df, + clustergram_directory=clustergram_directory, + number_of_clusters=number_of_clusters, + drop_columns=["cluster"], + cluster_col_name="cluster", + n_init=n_init, + random_seed=random_seed, + ) logger.info("group clustergrams completed.") # Pause for user input before proceeding input("Press Enter to continue with the subcluster numbers below for groups creation...") - + # Run K-means clustering to assign groups (subclusters) using config mapping - grouped_variable_df = run_subclustering(input_df=supergroup_variable_df, - output_location=f"{output_directory}cluster_assignments/group", - drop_columns="cluster", - column_name="subcluster", - cluster_col_name="cluster", - 
cluster_to_numbers = config["subclustering_mapping"], - n_init=n_init, - random_seed=random_seed) + grouped_variable_df = run_subclustering( + input_df=supergroup_variable_df, + output_location=f"{output_directory}cluster_assignments/group", + drop_columns="cluster", + column_name="subcluster", + cluster_col_name="cluster", + cluster_to_numbers=config["subclustering_mapping"], + n_init=n_init, + random_seed=random_seed, + ) logger.info("groups cluster run completed.") # Pause for user input before proceeding input("Press Enter to continue to move onto subgroup...") - ###SUBGROUP SECTION ### - # Create a clustergram for each group to establish number of subgroups (subsubclusters) for K means - create_subcluster_clustergrams(cluster_variable_df=grouped_variable_df, - clustergram_directory=clustergram_directory, - number_of_clusters= number_of_clusters, - drop_columns=['cluster', 'subcluster'], - cluster_col_name='subcluster', - n_init=n_init, - random_seed=random_seed) + ###SUBGROUP SECTION ### + # Create a clustergram for each group to establish number of + # subgroups (subsubclusters) for K means + create_subcluster_clustergrams( + cluster_variable_df=grouped_variable_df, + clustergram_directory=clustergram_directory, + number_of_clusters=number_of_clusters, + drop_columns=["cluster", "subcluster"], + cluster_col_name="subcluster", + n_init=n_init, + random_seed=random_seed, + ) logger.info("subgroup clustergrams completed.") - + # Pause for user input before proceeding logger.info(f"Unique subclusters at this stage: {grouped_variable_df['subcluster'].unique()}") logger.info("Check that dictionary in config for subsubclustering mapping is correct") input("Press Enter to continue with the cluster numbers below for subgroups creation...") # Run K-means clustering to assign subgroups (subsubclusters) using config mapping - subgrouped_variable_df = run_subclustering(input_df=grouped_variable_df, - output_location=f"{output_directory}cluster_assignments/subgroup", - drop_columns=['cluster', 'subcluster'], - column_name="subsubcluster", - cluster_col_name="subcluster", - cluster_to_numbers = config["subsubclustering_mapping"], - n_init=n_init, - random_seed=random_seed) - + subgrouped_variable_df = run_subclustering( + input_df=grouped_variable_df, + output_location=f"{output_directory}cluster_assignments/subgroup", + drop_columns=["cluster", "subcluster"], + column_name="subsubcluster", + cluster_col_name="subcluster", + cluster_to_numbers=config["subsubclustering_mapping"], + n_init=n_init, + random_seed=random_seed, + ) + logger.info("subgroup cluster run completed.") - + logger.info("Final output for supergroup, group and subgroup saved to output directory.") return subgrouped_variable_df + ## Clustergrams # We produce a clustergram plot to assess an appropriate number of clusters for the supergroups. -# Some guidance on interpreting clustergrams and choosing the number of clusters can be found here: +# Some guidance on interpreting clustergrams and choosing the number of clusters can be found here: # [Clustergram](https://clustergram.readthedocs.io/en/stable/notebooks/introduction.html) + def create_clustergram(df, number_of_clusters, n_init, save_location, random_seed=None): """ Create and save a clustergram for evaluating k-means clustering solutions. 
-    The clustergram visualizes clustering stability and helps identify the optimal 
+    The clustergram visualizes clustering stability and helps identify the optimal
     number of clusters by performing the k-means algorithm for a range of cluster numbers.
-    Since k-means is sensitive to initialization, `n_init` determines the number of 
-    times the algorithm runs with different centroid seeds. The final result is the 
+    Since k-means is sensitive to initialization, `n_init` determines the number of
+    times the algorithm runs with different centroid seeds. The final result is the
     best outcome based on inertia/WCSS (within-cluster sum of squares).
 
     Parameters
@@ -184,7 +207,7 @@ def create_clustergram(df, number_of_clusters, n_init, save_location, random_see
     number_of_clusters : int
         The total number of clusters to iterate over.
     n_init : int
-        Number of k-means runs with different initial centroid seeds. 
+        Number of k-means runs with different initial centroid seeds.
         Higher values (e.g., ~1000) improve solution stability but increase runtime.
     save_location : str
         File path to save the clustergram plot.
@@ -193,28 +216,33 @@ def create_clustergram(df, number_of_clusters, n_init, save_location, random_see
     """
     # Validate the number of clusters
     if len(df) < number_of_clusters:
-        logger.warning(f"Warning: Reducing number_of_clusters from {number_of_clusters} to {len(df)} (number of samples).")
+        logger.warning(
+            f"Warning: Reducing number_of_clusters from {number_of_clusters} "
+            + f"to {len(df)} (number of samples)."
+        )
         number_of_clusters = len(df)
 
     # Define the range of clusters to evaluate
     k_range = range(1, number_of_clusters + 1)  # Start from 1 cluster up to number_of_clusters
 
     # Create the clustergram
-    cgram = Clustergram(k_range=k_range, method='kmeans', random_state=random_seed, n_init=n_init)
-    
+    cgram = Clustergram(k_range=k_range, method="kmeans", random_state=random_seed, n_init=n_init)
+
     cgram.fit(df)  # Fit model to data
     cgram.plot()  # Generate plot
     plt.savefig(save_location)  # Save figure
     # plt.show()  # Display plot
 
+
 ## Clusters = supergroup
 # Run kmeans to cluster the geographies in K clusters (supergroups)
 
+
 def run_kmeans(input_df, number_of_clusters, n_init, output_filepath, random_seed=None):
     """
     Run K-means clustering on the input dataset and save the cluster assignments.
 
-    This function applies K-means clustering to the provided dataset, assigns cluster 
+    This function applies K-means clustering to the provided dataset, assigns cluster
     labels to each row, and saves the cluster assignments as a lookup table.
 
     Parameters
@@ -224,8 +252,8 @@ def run_kmeans(input_df, number_of_clusters, n_init, output_filepath, random_see
     number_of_clusters : int
         The number of clusters (K) to create.
     n_init : int
-        Number of times the K-means algorithm runs with different initial centroid seeds. 
-        The best result based on inertia/WCSS is chosen. A higher value (e.g., ~1000) is 
+        Number of times the K-means algorithm runs with different initial centroid seeds.
+        The best result based on inertia/WCSS is chosen. A higher value (e.g., ~1000) is
         recommended for final results, but a lower value can be used for testing.
     output_filepath : str
         Path to save the resulting cluster assignments.
@@ -235,24 +263,29 @@ def run_kmeans(input_df, number_of_clusters, n_init, output_filepath, random_see
     Returns
     -------
     pd.DataFrame
-        The input DataFrame with an added 'cluster' column containing 
+        The input DataFrame with an added 'cluster' column containing
        the assigned cluster for each row.
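+
+    Examples
+    --------
+    Illustrative sketch only - the DataFrame, path and parameter values below
+    are placeholders:
+
+    >>> labelled = run_kmeans(variable_df, number_of_clusters=8, n_init=10,
+    ...                       output_filepath="outputs/supergroups.csv", random_seed=42)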
""" df = input_df.copy() if number_of_clusters > len(df): - logger.warning(f"Warning: Reducing number_of_clusters from {number_of_clusters} to {len(df)} (number of samples).") + logger.warning( + f"Warning: Reducing number_of_clusters from {number_of_clusters}" + + " to {len(df)} (number of samples)." + ) number_of_clusters = len(df) # Initialize the K-means model - kmeans_model = KMeans(n_clusters=number_of_clusters, max_iter=1000, random_state=random_seed, n_init=n_init) - + kmeans_model = KMeans( + n_clusters=number_of_clusters, max_iter=1000, random_state=random_seed, n_init=n_init + ) + # Fit the model and assign clusters - df['cluster'] = kmeans_model.fit_predict(df) + df["cluster"] = kmeans_model.fit_predict(df) # Ensure output directory exists os.makedirs(os.path.dirname(output_filepath), exist_ok=True) # Save the cluster assignments column to a CSV file - df[['cluster']].to_csv(output_filepath) + df[["cluster"]].to_csv(output_filepath) # Show the first few rows of the assigned clusters logger.info(f"K-means clusters:\n{df[['cluster']].head()}") @@ -261,16 +294,26 @@ def run_kmeans(input_df, number_of_clusters, n_init, output_filepath, random_see ## Subclusters = groups and subgroups -# For LAD area classification, supergroup clusters are further split into groups and subgroups by iteratively applying the clustering process. - -def create_subcluster_clustergrams(cluster_variable_df, clustergram_directory, number_of_clusters, drop_columns, cluster_col_name, n_init, random_seed=None): +# For LAD area classification, supergroup clusters are further split into +# groups and subgroups by iteratively applying the clustering process. + + +def create_subcluster_clustergrams( + cluster_variable_df, + clustergram_directory, + number_of_clusters, + drop_columns, + cluster_col_name, + n_init, + random_seed=None, +): """ Generate and save clustergrams for each unique cluster label in the DataFrame. This function iterates through each unique value in the specified cluster column, - filters the DataFrame for that cluster and creates a clustergram for the resulting subset. + filters the DataFrame for that cluster and creates a clustergram for the resulting subset. The clustergram is saved to the specified directory. - + Parameters ---------- cluster_variable_df : pd.DataFrame @@ -299,26 +342,48 @@ def create_subcluster_clustergrams(cluster_variable_df, clustergram_directory, n # Drop the cluster related columns after filtering as they are strings. subcluster_df = filtered_df.drop(columns=drop_columns) logger.info(f"Cluster: {subcluster}, {len(subcluster_df)} geographies in cluster") - + # Define save location - save_location = os.path.join(clustergram_directory, f"subcluster_clustergram_cluster{subcluster}.png") + save_location = os.path.join( + clustergram_directory, f"subcluster_clustergram_cluster{subcluster}.png" + ) logger.info(f"Saving clustergram to {save_location}") if len(subcluster_df) <= 2: # Skip this subcluster if it has insufficient data points (2 or fewer). - # For example when running on number_of_times_k_means_initialised = 1000 the Oxford and Cambridge - # subgroup have an errors when creating the clustergram so skip this subcluster instead. - logger.info(f"Skipping cluster {subcluster} due to insufficient data points ({len(subcluster_df)}).") + # For example when running on number_of_times_k_means_initialised = 1000 the + # Oxford and Cambridge subgroup have an errors when creating the clustergram + # so skip this subcluster instead. 
+            logger.info(
+                f"Skipping cluster {subcluster} due to insufficient"
+                + f" data points ({len(subcluster_df)})."
+            )
             continue
         else:
             # Generate clustergram
-            create_clustergram(subcluster_df, number_of_clusters, n_init=n_init, save_location=save_location, random_seed=random_seed)
-
-
-def run_subclustering(input_df, output_location,drop_columns,column_name, cluster_col_name, cluster_to_numbers, n_init, random_seed = None) -> pd.DataFrame:
+            create_clustergram(
+                subcluster_df,
+                number_of_clusters,
+                n_init=n_init,
+                save_location=save_location,
+                random_seed=random_seed,
+            )
+
+
+def run_subclustering(
+    input_df,
+    output_location,
+    drop_columns,
+    column_name,
+    cluster_col_name,
+    cluster_to_numbers,
+    n_init,
+    random_seed=None,
+) -> pd.DataFrame:
     """
-    Runs subclustering for each supergroup using KMeans and returns a modified DataFrame with subcluster labels.
-    
+    Runs subclustering for each supergroup using KMeans and returns a modified
+    DataFrame with subcluster labels.
+
     Parameters
     ----------
     input_df : pd.DataFrame
@@ -352,31 +417,38 @@ def run_subclustering(input_df, output_location,drop_columns,column_name, cluste
 
     # Iterate over each supergroup and its desired number of subclusters
     for cluster, num_subclusters in cluster_to_numbers.items():
-
-        logger.info(f"Clustering supergroup {cluster} into {cluster_to_numbers[cluster]} subclusters.")
+        logger.info(
+            f"Clustering supergroup {cluster} into {cluster_to_numbers[cluster]} subclusters."
+        )
 
         # Select rows corresponding to the current cluster, drop the cluster column before clustering
         logger.debug(f"input_df shape: {input_df.shape}")
-        cluster_df = input_df.query(f"{cluster_col_name} == @cluster").drop(columns=drop_columns).copy()
+        cluster_df = (
+            input_df.query(f"{cluster_col_name} == @cluster").drop(columns=drop_columns).copy()
+        )
         logger.debug(f"cluster_df shape: {cluster_df.shape}")
 
         # Run KMeans clustering for the selected supergroup
         subcluster_output_df = run_kmeans(
-            cluster_df, 
-            num_subclusters, 
-            n_init=n_init, 
-            output_filepath=output_location+f"/supergroup{cluster}_subclusteroutput.csv",
-            random_seed=random_seed # Use a different random seed for each subclustering to ensure diversity
+            cluster_df,
+            num_subclusters,
+            n_init=n_init,
+            output_filepath=output_location + f"/supergroup{cluster}_subclusteroutput.csv",
+            random_seed=random_seed,
+            # Use a different random seed for each subclustering to ensure diversity
         )
 
-        # Convert subcluster numbers (0, 1, 2, ...) into a more readable format (e.g., '0a', '0b', '0c').
+        # Convert subcluster numbers (0, 1, 2) into a more readable format ('0a', '0b', '0c').
         # The numeric part represents the main cluster; the letter represents the subcluster.
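+        # chr(97) is "a", so e.g. subclusters 0, 1, 2 of supergroup 3 become "3a", "3b", "3c".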
- subcluster_output_df[column_name] = [str(cluster) + chr(97 + i) for i in subcluster_output_df["cluster"]] + subcluster_output_df[column_name] = [ + str(cluster) + chr(97 + i) for i in subcluster_output_df["cluster"] + ] # Update the modified DataFrame with subclustering results df.loc[cluster_df.index, column_name] = subcluster_output_df[column_name] - # Save the cluster outputs one directory up from the output_location - the cluster assignment folder + # Save the cluster outputs one directory up from the output_location - the + # cluster assignment folder if column_name == "subcluster": file_name = "group_clustering_output.csv" elif column_name == "subsubcluster": diff --git a/area_classification/downloading_data/ew_lad_bulk_download.py b/area_classification/downloading_data/ew_lad_bulk_download.py index bf9ef0f..823cbd0 100644 --- a/area_classification/downloading_data/ew_lad_bulk_download.py +++ b/area_classification/downloading_data/ew_lad_bulk_download.py @@ -1,28 +1,29 @@ +import logging import os -from bs4 import BeautifulSoup # Equivalent to rvest for web scraping import re # For string manipulation (similar to stringr) -import pandas as pd # For data manipulation (similar to tidyverse and vroom) -import requests # For making HTTP requests -from zipfile import ZipFile +import tempfile from glob import glob from shutil import rmtree -import tempfile -import logging +from zipfile import ZipFile -from area_classification.utilities.load_config import load_config +import pandas as pd # For data manipulation (similar to tidyverse and vroom) +import requests # For making HTTP requests +from bs4 import BeautifulSoup # Equivalent to rvest for web scraping logger = logging.getLogger(__name__) + def ew_lad_bulk_download(config: dict): """ - Downloads the latest census 2021 data for England and Wales Local Authority Districts (LADs) from Nomis. - Census data is exported in CSV format to output directory specified in the config. + Downloads the latest census 2021 data for England and Wales Local Authority + Districts (LADs) from Nomis. Census data is exported in CSV format to output + directory specified in the config. Parameters ---------- config : dict main config for pipeline - + Returns ------- None @@ -49,27 +50,35 @@ def get_census_table_urls(config: dict) -> list: ------- list list of URLs for census tables that contain Output Areas (OA). 
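+
+    Notes
+    -----
+    Each returned URL has the shape
+    https://www.nomisweb.co.uk/output/census/2021/census2021-<table_id>.zip,
+    matching the nomis_address template below.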
- """ + """ # Read the HTML page from the Nomis Census 2021 bulk download site - html_page = BeautifulSoup(requests.get("https://www.nomisweb.co.uk/sources/census_2021_bulk").content, "html.parser") + html_page = BeautifulSoup( + requests.get("https://www.nomisweb.co.uk/sources/census_2021_bulk", timeout=100).content, + "html.parser", + ) # Extract links to census table ZIP files (excluding 'extra.zip') zip_urls = [ - link['href'] for link in html_page.find_all('a', href=True) - if link['href'].endswith('.zip') and 'extra.zip' not in link['href'] + link["href"] + for link in html_page.find_all("a", href=True) + if link["href"].endswith(".zip") and "extra.zip" not in link["href"] ] # Make zip file names into a full URLs zip_urls = ["https://www.nomisweb.co.uk" + url for url in zip_urls] nomis_address = "https://www.nomisweb.co.uk/output/census/2021/census2021-{table_id}.zip" - no_oa_tables = [nomis_address.format(table_id=code) for code in config["england_and_wales_table_codes_to_remove"]] + no_oa_tables = [ + nomis_address.format(table_id=code) + for code in config["england_and_wales_table_codes_to_remove"] + ] # Remove the tables without Output Areas (OA) zip_urls = list(set(zip_urls) - set(no_oa_tables)) - + return zip_urls + def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: """ Fucntion to download and unzip the census data files, extract the relevant tables, @@ -97,13 +106,13 @@ def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: tmp_dir = tempfile.mkdtemp() # Download the specified zip file - response = requests.get(url) + response = requests.get(url, timeout=100) zip_file_path = os.path.join(tmp_dir, "temp.zip") with open(zip_file_path, "wb") as f: f.write(response.content) # Unzip the file - with ZipFile(zip_file_path, 'r') as zip_ref: + with ZipFile(zip_file_path, "r") as zip_ref: zip_ref.extractall(tmp_dir) # Extract the table name from the URL @@ -128,7 +137,7 @@ def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: if t_name == "ts007a": unit = "Person" else: - # --- Find and read the .txt file in metadata folder for unit --- + # --- Find and read the .txt file in metadata folder for unit --- metadata_txt_files = glob(os.path.join(tmp_dir, "metadata", "*.txt")) if metadata_txt_files: with open(metadata_txt_files[0], "r", encoding="utf-8") as meta_file: @@ -149,12 +158,9 @@ def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: Variable_ID = [f"{t_name}{i:04d}" for i in range(1, len(Full_Name) + 1)] # Create a metadata table - n_list = pd.DataFrame({ - "Variable_ID": Variable_ID, - "Table_ID": t_name, - "Full_Name": Full_Name, - "Unit": unit - }) + n_list = pd.DataFrame( + {"Variable_ID": Variable_ID, "Table_ID": t_name, "Full_Name": Full_Name, "Unit": unit} + ) # Append to the metadata table meta_data_table = pd.concat([meta_data_table, n_list], ignore_index=True) @@ -162,7 +168,7 @@ def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: # Rename the columns in the DataFrame df.columns = Variable_ID # Move row names back to a column - df.reset_index(inplace=True) + df.reset_index(inplace=True) df.rename(columns={"geography code": "LTLA"}, inplace=True) # Write the DataFrame to a CSV file @@ -176,7 +182,6 @@ def download_and_unzip_data(zip_urls: list, config: dict) -> pd.DataFrame: return meta_data_table - def format_and_export_metadata_table(meta_data_table: pd.DataFrame, config: dict): """ Function to format the metadata table and saves it as a CSV to the input 
directory. @@ -193,33 +198,40 @@ def format_and_export_metadata_table(meta_data_table: pd.DataFrame, config: dict -------- pd.DataFrame The formatted metadata table. Also saved as a CSV file in the specified output directory. - """ + """ - # Format the lookup table meta_data_table_full = ( meta_data_table # Extract variable name from 'Full_Name' (text after first colon, up to semicolon if present) - .assign( - Variable_Name=meta_data_table['Full_Name'].str.extract(r':\s*([^;:]+?)(?:;|$)') - ) - + .assign(Variable_Name=meta_data_table["Full_Name"].str.extract(r":\s*([^;:]+?)(?:;|$)")) # Extract table name from 'Full_Name' (text before first colon) - .assign(Table_Name=meta_data_table['Full_Name'].str.split(':', n=1).str[0]) + .assign(Table_Name=meta_data_table["Full_Name"].str.split(":", n=1).str[0]) .assign(Type="Count") + ) - ) - # Specify desired column order - column_order = ["Variable_Name", "Variable_ID", "Table_ID", "Table_Name", "Type", "Unit", "Full_Name"] + column_order = [ + "Variable_Name", + "Variable_ID", + "Table_ID", + "Table_Name", + "Type", + "Unit", + "Full_Name", + ] # Reorder columns (only keep columns that exist in the DataFrame) - meta_data_table_full = meta_data_table_full[[col for col in column_order if col in meta_data_table_full.columns]] - + meta_data_table_full = meta_data_table_full[ + [col for col in column_order if col in meta_data_table_full.columns] + ] + # Ensure input directory exists os.makedirs(os.path.dirname(config["input_directory"]), exist_ok=True) # Write the resulting DataFrame to a CSV file - meta_data_table_full.to_csv(os.path.join(config["input_directory"], "ew_lad_table_metadata.csv"), index=False) + meta_data_table_full.to_csv( + os.path.join(config["input_directory"], "ew_lad_table_metadata.csv"), index=False + ) - return meta_data_table_full \ No newline at end of file + return meta_data_table_full diff --git a/area_classification/downloading_data/ni_lgd_downloading_data.py b/area_classification/downloading_data/ni_lgd_downloading_data.py index c0e143c..6ffca72 100644 --- a/area_classification/downloading_data/ni_lgd_downloading_data.py +++ b/area_classification/downloading_data/ni_lgd_downloading_data.py @@ -1,16 +1,18 @@ -import requests -import pandas as pd -from bs4 import BeautifulSoup import logging +import os from io import BytesIO -import os + +import pandas as pd +import requests +from bs4 import BeautifulSoup logger = logging.getLogger(__name__) -def ni_lgd_download_data(config): + +def ni_lgd_download_data(config): """ Wrapper function to download Northern Ireland Local Government District (LGD) data. - Data and metadata are downloaded and exported to csv files. + Data and metadata are downloaded and exported to csv files. Parameters ---------- @@ -20,15 +22,15 @@ def ni_lgd_download_data(config): Returns ------- None - The function saves the downloaded data and metadata as CSV files in the specified input directory. - """ + The function saves the downloaded data and metadata as CSV files in the + specified input directory. + """ meta_data_table = download_ni_lgd_data(config) format_and_export_ni_metadata_table(meta_data_table, config) reformat_pop_density_ni(config) - def reformat_pop_density_ni(config): """ Function to reformat Northern Ireland Local Government District (LGD) population density data. 
@@ -47,17 +49,19 @@ def reformat_pop_density_ni(config): """ # Load only the first sheet of the Excel file - df = pd.read_excel(config["ni_pop_density_filepath"], sheet_name=0, skiprows=5, header=0, index_col=None) + df = pd.read_excel( + config["ni_pop_density_filepath"], sheet_name=0, skiprows=5, header=0, index_col=None + ) # Remove unnecessary columns by index - df = df.drop(df.columns[[0, 2, 3,5]], axis=1) + df = df.drop(df.columns[[0, 2, 3, 5]], axis=1) # Rename the first remaining column to 'LGD' - df.columns.values[0] = "LGD" + df.columns.values[0] = "LGD" # Convert from per hectare to per km² - df.iloc[:, 1] = df.iloc[:, 1] * 100 - df.columns.values[1] = "population_density" + df.iloc[:, 1] = df.iloc[:, 1] * 100 + df.columns.values[1] = "population_density" # Save to a CSV output_csv_path = os.path.join(config["input_directory"], "./ni_downloads/") @@ -65,12 +69,9 @@ def reformat_pop_density_ni(config): os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) # save to output_csv_path df.to_csv(output_csv_path + "ni_population_density.csv", index=False) - - - -def download_ni_lgd_data(config:dict)-> pd.DataFrame: +def download_ni_lgd_data(config: dict) -> pd.DataFrame: """ Function to download Northern Ireland Local Government District (LGD) data from the NISRA website and format it into a metadata table. @@ -86,8 +87,8 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: pd.DataFrame ni metadata table containing information about the downloaded variables. Columns: Variable_Name, Variable_ID, Table_ID, Table_Name, Type, Unit - """ - + """ + variables = get_available_variables() meta_data_table = pd.DataFrame( columns=[ @@ -97,7 +98,7 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: "Type", ] ) - + for var in variables: t_name = var[0] t_dcode = var[1] @@ -112,7 +113,7 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: # Get metadata (more fields available if needed) meta_url = f"https://build.nisra.gov.uk/en/custom/table.csv-metadata.json?d={t_unit}&v=LGD14&v={t_dcode}&p=1" - r = requests.get(meta_url) + r = requests.get(meta_url, timeout=100) if r.status_code != 200: log_message = ( f"Failed to fetch metadata for variable {t_name} | " @@ -123,7 +124,6 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: continue type = r.json()["tableSchema"]["columns"][4]["titles"] - df = pd.read_csv(BytesIO(data), skiprows=1) df.rename(columns={"Local Government District 2014 Code": "LGD"}, inplace=True) df.set_index("LGD", inplace=True) @@ -132,7 +132,7 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: # Drop the "No code required" column if it exists if "No code required" in df.columns: df.drop(columns=["No code required"], inplace=True) - + # Create a total column, that includes everything except "No code required" df["All " + t_unit] = df.sum(axis=1) # Put the total column first @@ -143,7 +143,7 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: df.columns = var_ids output_csv_path = os.path.join(config["input_directory"], "./ni_downloads/") - + # Ensure output directory exists os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) # Save to csv @@ -167,7 +167,7 @@ def download_ni_lgd_data(config:dict)-> pd.DataFrame: return meta_data_table -def format_and_export_ni_metadata_table(meta_data_table: pd.DataFrame, config:dict): +def format_and_export_ni_metadata_table(meta_data_table: pd.DataFrame, config: dict): """ Formats and exports ni metadata table to csv. 
@@ -181,8 +181,9 @@ def format_and_export_ni_metadata_table(meta_data_table: pd.DataFrame, config:di Returns ------- None - The function saves the formatted metadata table as a CSV file in the specified input directory. - """ + The function saves the formatted metadata table as a CSV file in the + specified input directory. + """ # Rename units to match other scripts meta_data_table["Unit"] = meta_data_table["Unit"].replace( { @@ -202,17 +203,19 @@ def format_and_export_ni_metadata_table(meta_data_table: pd.DataFrame, config:di # Set Type to 'Count' for all tables meta_data_table["Type"] = "Count" - meta_data_table.to_csv(os.path.join(config["input_directory"],"ni_lgd_table_metadata.csv"), index=False) - + meta_data_table.to_csv( + os.path.join(config["input_directory"], "ni_lgd_table_metadata.csv"), index=False + ) def get_available_variables(): """ Fetches available variables from the NISRA dataset metadata page. - This function sends a GET request to the NISRA metadata page for both the PEOPLE and HOUSEHOLD datasets, - parses the HTML content to extract table data, and returns the data as a list of lists. - Each inner list represents a row in the table, containing the text content of each cell. + This function sends a GET request to the NISRA metadata page for both the PEOPLE + and HOUSEHOLD datasets, parses the HTML content to extract table data, and + returns the data as a list of lists. Each inner list represents a row in the + table, containing the text content of each cell. Returns ------- @@ -220,24 +223,24 @@ def get_available_variables(): A list containing rows of table data, where each row is a list of cell values. Each row also includes a column indicating whether the data is for PEOPLE or HOUSEHOLD. """ - + table_data = [] # Fetch and parse PEOPLE dataset metadata table url = "https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE" - response = requests.get(url) + response = requests.get(url, timeout=100) soup = BeautifulSoup(response.content, "html.parser") for row in soup.select("tr"): cells = row.find_all(["td", "th"]) - table_data.append([cell.text.strip() for cell in cells] + ['PEOPLE']) + table_data.append([cell.text.strip() for cell in cells] + ["PEOPLE"]) # Fetch and parse HOUSEHOLD dataset metadata table url = "https://build.nisra.gov.uk/en/metadata/dataset?d=HOUSEHOLD" - response = requests.get(url) + response = requests.get(url, timeout=100) soup = BeautifulSoup(response.content, "html.parser") for row in soup.select("tr"): cells = row.find_all(["td", "th"]) - table_data.append([cell.text.strip() for cell in cells] + ['HOUSEHOLD']) + table_data.append([cell.text.strip() for cell in cells] + ["HOUSEHOLD"]) return table_data @@ -246,16 +249,16 @@ def fetch_data(var_code, var_name, var_unit): """ Fetches data from the Northern Ireland Census 2022 Data Zone. - Constructs a URL based on the provided variable code, variable name, + Constructs a URL based on the provided variable code, variable name, and variable unit, then sends a GET request to fetch the corresponding data in CSV format. Parameters ---------- - var_code (str): + var_code (str): The code of the variable to fetch. - var_name (str): + var_name (str): The name of the variable to fetch. - var_unit (str): + var_unit (str): The unit of the variable to fetch. 
Returns @@ -276,7 +279,7 @@ def fetch_data(var_code, var_name, var_unit): """ url = f"https://build.nisra.gov.uk/en/custom/table.csv?d={var_unit}&v=LGD14&v={var_code}&p=1" - r = requests.get(url) + r = requests.get(url, timeout=100) if r.status_code != 200: log_message = ( @@ -286,4 +289,4 @@ def fetch_data(var_code, var_name, var_unit): ) logger.error(log_message) return None - return r.content \ No newline at end of file + return r.content diff --git a/area_classification/downloading_data/scot_tables_reformatting.py b/area_classification/downloading_data/scot_tables_reformatting.py index a30e87c..6181b41 100644 --- a/area_classification/downloading_data/scot_tables_reformatting.py +++ b/area_classification/downloading_data/scot_tables_reformatting.py @@ -1,27 +1,25 @@ - - -import pandas as pd +import csv +import logging import os import re + import numpy as np -import csv -from functools import reduce -import logging +import pandas as pd logger = logging.getLogger(__name__) -def scot_reformatting_wrapper(scot_input_folder: str, - LAD_lookup_file_path: str, - config: dict): + +def scot_reformatting_wrapper(scot_input_folder: str, LAD_lookup_file_path: str, config: dict): """ - Wrapper function to perform the reformatting of the Scotland tables to be consistent with tables - downloaded for England and Wales and Northern Ireland and extract metadata from Scotland tables - to create a metadata table for Scotland. - + Wrapper function to perform the reformatting of the Scotland tables to be + consistent with tables downloaded for England and Wales and Northern Ireland + and extract metadata from Scotland tables to create a metadata table for + Scotland. + Certain tables have their own re-formatting functions. Note that the functions are hard coded to our scotland tables. - + Parameters ---------- scot_input_folder : str @@ -34,7 +32,8 @@ def scot_reformatting_wrapper(scot_input_folder: str, Returns ------- None - Metadata table is saved as a csv to the specified output path in the config dictionary. + Metadata table is saved as a csv to the specified output path + in the config dictionary. """ # Rename the tables based on their table ID values @@ -49,14 +48,14 @@ def scot_reformatting_wrapper(scot_input_folder: str, "Table_ID", "Type", "Unit", - "Full_Name" + "Full_Name", ] ) # Change the only xlsx (pop_density) in the folder to csv extract_pop_density_table(scot_input_folder) - # Function to extract metadata from files into table. + # Function to extract metadata from files into table. 
# 'metadata' is a list of table_name, table_id and unit variabless metadata = extract_metadata_from_files(scot_input_folder) @@ -64,7 +63,7 @@ def scot_reformatting_wrapper(scot_input_folder: str, replace_ca19_names_with_codes(scot_input_folder, LAD_lookup_file_path, config) # Remove rows with metadata/no data (first 10 and bottom 3 rows) - remove_rows(config, folderpath= config["reformat_scot_input_folder"]) + remove_rows(config, folderpath=config["reformat_scot_input_folder"]) # Reformat specific tables reformat_uv101b(scot_input_folder, LAD_lookup_file_path, config) @@ -78,18 +77,21 @@ def scot_reformatting_wrapper(scot_input_folder: str, # Add to metadata table # Manually add 'UV303a' entries to metadata and variable_names_ids lists - metadata.append({"table_id": "UV303a", "table_name": "Disability by sex by age (20)", "unit": "Person"}) + metadata.append( + {"table_id": "UV303a", "table_name": "Disability by sex by age (20)", "unit": "Person"} + ) variable_names_ids.append((["Disability"], ["UV303a"])) - # Iterate over the metadata dict and variable_names_ids list and add to the metadata table - for (meta, (variable_names, variable_ids)) in zip(metadata, variable_names_ids): + # Iterate over the metadata dict and variable_names_ids list and + # add to the metadata table + for meta, (variable_names, variable_ids) in zip(metadata, variable_names_ids): # Extract table_id, table_name, and unit from the metadata dictionary table_id = meta.get("table_id", "") table_name = meta.get("table_name", "") unit = meta.get("unit", "") # Exclude 'CA19' from variable_names and adjust variable_ids accordingly - if 'CA19' in variable_names: - variable_names = [name for name in variable_names if name != 'CA19'] + if "CA19" in variable_names: + variable_names = [name for name in variable_names if name != "CA19"] meta_data_table = pd.concat( [ meta_data_table, @@ -101,7 +103,7 @@ def scot_reformatting_wrapper(scot_input_folder: str, "Table_Name": table_name, "Unit": unit, } - ) + ), ] ) @@ -109,15 +111,19 @@ def scot_reformatting_wrapper(scot_input_folder: str, meta_data_table["Full_Name"] = ( meta_data_table["Table_Name"] + " - " + meta_data_table["Variable_Name"] ) - meta_data_table = meta_data_table[["Variable_Name", "Variable_ID", "Table_ID", "Table_Name", "Type", "Unit", "Full_Name"]] + meta_data_table = meta_data_table[ + ["Variable_Name", "Variable_ID", "Table_ID", "Table_Name", "Type", "Unit", "Full_Name"] + ] # Manually set Type to 'Count' for all tables - meta_data_table['Type'] = 'Count' + meta_data_table["Type"] = "Count" # Update the type for population density to ratio - meta_data_table.loc[meta_data_table['Variable_ID'] == 'population_density', 'Type'] = 'Ratio' + meta_data_table.loc[meta_data_table["Variable_ID"] == "population_density", "Type"] = "Ratio" # Drop rows where Variable_Name contains 'Unnamed' - meta_data_table = meta_data_table[~meta_data_table["Variable_Name"].str.contains("Unnamed", na=False)] + meta_data_table = meta_data_table[ + ~meta_data_table["Variable_Name"].str.contains("Unnamed", na=False) + ] # Ensure input directory exists os.makedirs(os.path.dirname(config["input_directory"]), exist_ok=True) @@ -132,20 +138,21 @@ def scot_reformatting_wrapper(scot_input_folder: str, return concat_reformatted_tables(config=config).reset_index(drop=False) - def rename_csv_files_by_table_id(scot_input_folder): """ - Renames CSV files in the specified folder based on the Table ID found in their content. 
-    
+    Renames CSV files in the specified folder based on the Table ID
+    found in their content.
+
     Parameters
     ----------
-    scot_input_folder) (str): Path to the folder containing the CSV files.
+    scot_input_folder (str): Path to the folder containing the
+        CSV files.
 
     Returns
     -------
     None
     """
-    
     # Regular expression to extract the Table ID
     table_id_pattern = r"UV\d+\w*"
 
     # Iterate through all files in the folder
     for file_name in os.listdir(scot_input_folder):
         # Process only files with a .csv extension
         if file_name.lower().endswith(".csv"):
             file_path = os.path.join(scot_input_folder, file_name)
-            
+
             # Read the CSV file
             try:
                 # Open and read the file content
-                with open(file_path, 'r', encoding='utf-8') as file:
+                with open(file_path, "r", encoding="utf-8") as file:
                     content = file.read()
-                
+
                 # Search for the Table ID in the file content
                 match = re.search(table_id_pattern, content)
                 if match:
-                    table_id = match.group(0) # Extract the Table ID
-                    
+                    table_id = match.group(0)  # Extract the Table ID
+
                     # Create the new file name
                     new_file_name = f"{table_id}.csv"
                     new_file_path = os.path.join(scot_input_folder, new_file_name)
-                    
+
                     # Rename the file
                     os.rename(file_path, new_file_path)
                     logger.info(f"Renamed '{file_name}' to '{new_file_name}'")
@@ -179,25 +186,27 @@ def rename_csv_files_by_table_id(scot_input_folder):
             except Exception as e:
                 logger.error(f"Error processing file '{file_name}': {e}")
 
-    
+
 # Function to reformat the UV101b CSV file
 def reformat_uv101b(scot_input_folder, LAD_lookup_file_path, config):
     """
-    Function to reformat the UV101b CSV file so it has rows removed and CA codes instead of names.
-    
+    Function to reformat the UV101b CSV file so it has rows removed
+    and CA codes instead of names.
+
     Parameters
     ----------
-    scot_input_folder : str 
+    scot_input_folder : str
         Path to the directory containing the input CSV files.
-    LAD_lookup_file_path : str 
+    LAD_lookup_file_path : str
         Path to the lookup file containing LAD codes and names.
     config : dict
         Configuration dictionary containing paths and file names.
-    
+
     Returns
     -------
     None
-        The function saves the reformatted DataFrame to a new CSV file in the specified output path.
+        The function saves the reformatted DataFrame to a new CSV file
+        in the specified output path.
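+
+    Notes
+    -----
+    Assumes the raw export layout parsed below: a council area name sits two
+    rows above each 'Sex' marker row, with the 'All people' counts on the row
+    that follows the marker.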
""" # Look for UV101b.csv in the directory file_path = os.path.join(scot_input_folder, "UV101b.csv") @@ -206,35 +215,40 @@ def reformat_uv101b(scot_input_folder, LAD_lookup_file_path, config): return # Load the CSV file and skip the first 12 rows - df = pd.read_csv(file_path, skiprows=11, header=None, names=['A', 'B', 'C', 'D', 'E', 'F']) - + df = pd.read_csv(file_path, skiprows=11, header=None, names=["A", "B", "C", "D", "E", "F"]) + # Remove the empty column (F) - df = df.dropna(axis=1, how='all') + df = df.dropna(axis=1, how="all") # List to store results results = [] # Iterate through rows to extract relevant data for index, row in df.iterrows(): - if str(row['A']).strip().lower() == 'sex': + if str(row["A"]).strip().lower() == "sex": # Get the council area name (two rows above the 'sex' row) - council_area = df.iloc[index - 2]['A'] if index - 2 >= 0 else None + council_area = df.iloc[index - 2]["A"] if index - 2 >= 0 else None # Get the 'All people' row (next row after 'sex') all_people_row = df.iloc[index + 1] if index + 1 < len(df) else None - if all_people_row is not None and str(all_people_row['A']).strip().lower() == 'all people': + if ( + all_people_row is not None + and str(all_people_row["A"]).strip().lower() == "all people" + ): # Extract the values from columns C, D, and E - all_people_value = all_people_row['C'] - household_value = all_people_row['D'] - communal_value = all_people_row['E'] - + all_people_value = all_people_row["C"] + household_value = all_people_row["D"] + communal_value = all_people_row["E"] + # Append the extracted values to the results - results.append({ - 'CA19': council_area, - 'All people': all_people_value, - 'Lives in a household': household_value, - 'Lives in a communal establishment': communal_value - }) + results.append( + { + "CA19": council_area, + "All people": all_people_value, + "Lives in a household": household_value, + "Lives in a communal establishment": communal_value, + } + ) # Convert results to a DataFrame if results: @@ -244,11 +258,13 @@ def reformat_uv101b(scot_input_folder, LAD_lookup_file_path, config): # Load the LAD codes and names lookup file lookup_df = pd.read_csv(LAD_lookup_file_path) - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) + lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"])) # Replace council area names with LAD codes - output_df['CA19'] = output_df['CA19'].str.strip().str.lower().map(lookup_dict).fillna(output_df['CA19']) - + output_df["CA19"] = ( + output_df["CA19"].str.strip().str.lower().map(lookup_dict).fillna(output_df["CA19"]) + ) + # Ensure reformat_scot_input_folder exists os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True) # Save the final DataFrame to a new CSV file @@ -260,7 +276,8 @@ def reformat_uv101b(scot_input_folder, LAD_lookup_file_path, config): def reformat_uv103(scot_input_folder, LAD_lookup_file_path, config): """ - Function to reformat the UV103 CSV file so it has rows removed and CA codes instead of names. + Function to reformat the UV103 CSV file so it has rows removed and + CA codes instead of names. Parameters ---------- @@ -270,11 +287,12 @@ def reformat_uv103(scot_input_folder, LAD_lookup_file_path, config): Path to the lookup file containing Counil area (CA) codes and names. config : dict Configuration dictionary containing paths and file names. - + Returns ------- None - The function saves the reformatted DataFrame to a new CSV file in the specified output path. 
+ The function saves the reformatted DataFrame to a new CSV file + in the specified output path. """ # Look for UV103.csv in the directory file_path = os.path.join(scot_input_folder, "UV103.csv") @@ -284,65 +302,64 @@ def reformat_uv103(scot_input_folder, LAD_lookup_file_path, config): # Load the CSV file df = pd.read_csv(file_path, skiprows=11, header=0, on_bad_lines="skip") - # remove the last 3 rows + # remove the last 3 rows df = df.iloc[:-3, :] # Extract the original headers original_headers = df.columns.tolist() - + # Extract council area names and corresponding data reformatted_data = [] # Iterate through the DataFrame in steps of 4 rows - for i in range(0, len(df), 4): + for i in range(0, len(df), 4): # Initialize council_area with a default value for each iteration - council_area = "" + council_area = "" # Clean the value in the first column to remove extra spaces and ensure consistency - current_value = str(df.iloc[i, 0]).strip() + current_value = str(df.iloc[i, 0]).strip() # Check if the current row contains 'Counting' in the first column - if current_value.strip().lower() == 'counting': + if current_value.strip().lower() == "counting": # Ensure the row one row above 'counting' field exists if i - 2 >= 0: - council_area = str(df.iloc[i - 2, 0]).strip() + council_area = str(df.iloc[i - 2, 0]).strip() else: logger.warning(f"Row {i}: Missing council area value (missing expected for row 0)") # The first council_area needs to be added in 'Clackmanshire' if i == 0 and council_area == "": council_area = "Clackmannanshire" - + # Data row is directly below the 'counting' row - data_row_index = i + 1 + data_row_index = i + 1 # Ensure the data row index is within bounds if data_row_index < len(df): # Extract the data row starting from the second column (index 1) data_row = df.iloc[data_row_index, 1:].tolist() # Skip rows where all data columns are blank - if any(pd.notna(value) for value in data_row): + if any(pd.notna(value) for value in data_row): # Append the 'council_area' and the data row to the reformatted data reformatted_data.append([council_area] + data_row) - # Create the new DataFrame reformatted_df = pd.DataFrame(reformatted_data, columns=["CA19"] + original_headers[1:]) # Load the LAD codes and names lookup file lookup_df = pd.read_csv(LAD_lookup_file_path) - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) + lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"])) # Replace council area names with LAD codes - reformatted_df['CA19'] = ( - reformatted_df['CA19'] + reformatted_df["CA19"] = ( + reformatted_df["CA19"] .str.strip() .str.lower() .map(lookup_dict) - .fillna(reformatted_df['CA19']) + .fillna(reformatted_df["CA19"]) ) # Drop the last column 'Unnamed:103' - reformatted_df = reformatted_df.loc[:, ~reformatted_df.columns.str.contains('^Unnamed')] + reformatted_df = reformatted_df.loc[:, ~reformatted_df.columns.str.contains("^Unnamed")] # Ensure reformat_scot_input_folder exists os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True) @@ -368,9 +385,10 @@ def reformat_uv104(scot_input_folder, LAD_lookup_file_path, config): Returns ------- None - The function saves the reformatted DataFrame to a new CSV file in the specified output path. + The function saves the reformatted DataFrame to a new CSV file + in the specified output path. 
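+
+    Notes
+    -----
+    Relies on pd.DataFrame.pivot, which raises a ValueError if any
+    (Council Area 2019, Marital status) pair appears more than once.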
""" - + # Look for UV104.csv in the directory file_path = os.path.join(scot_input_folder, "UV104.csv") if not os.path.exists(file_path): @@ -384,15 +402,19 @@ def reformat_uv104(scot_input_folder, LAD_lookup_file_path, config): df = df.iloc[:-3, :] # Preserve the original order of 'Council Area 2019' and 'Marital Status' - council_area_order = df['Council Area 2019'].unique() - marital_status_order = df['Marital status'].unique() + council_area_order = df["Council Area 2019"].unique() + marital_status_order = df["Marital status"].unique() # Convert columns to Categorical to preserve order - df['Council Area 2019'] = pd.Categorical(df['Council Area 2019'], categories=council_area_order, ordered=True) - df['Marital status'] = pd.Categorical(df['Marital status'], categories=marital_status_order, ordered=True) + df["Council Area 2019"] = pd.Categorical( + df["Council Area 2019"], categories=council_area_order, ordered=True + ) + df["Marital status"] = pd.Categorical( + df["Marital status"], categories=marital_status_order, ordered=True + ) # Sort the DataFrame to ensure the order is preserved - df = df.sort_values(by=['Council Area 2019', 'Marital status']) + df = df.sort_values(by=["Council Area 2019", "Marital status"]) # Pivot the DataFrame pivoted_df = df.pivot(index="Council Area 2019", columns="Marital status", values="Count") @@ -402,19 +424,19 @@ def reformat_uv104(scot_input_folder, LAD_lookup_file_path, config): # Load the LAD codes and names lookup file lookup_df = pd.read_csv(LAD_lookup_file_path) - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) + lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"])) # Replace council area names with LAD codes - pivoted_df['Council Area 2019'] = ( - pivoted_df['Council Area 2019'] + pivoted_df["Council Area 2019"] = ( + pivoted_df["Council Area 2019"] .str.strip() .str.lower() .map(lookup_dict) - .fillna(pivoted_df['Council Area 2019']) + .fillna(pivoted_df["Council Area 2019"]) ) - + # Replace the column name 'Council Area 2019' with 'CA19' - pivoted_df.rename(columns={'Council Area 2019': 'CA19'}, inplace=True) + pivoted_df.rename(columns={"Council Area 2019": "CA19"}, inplace=True) output_file_path = os.path.join(config["reformat_scot_input_folder"], "reformat_UV104.csv") pivoted_df.to_csv(output_file_path, index=False) @@ -437,9 +459,10 @@ def reformat_uv210(scot_input_folder, LAD_lookup_file_path, config): Returns ------- None - The function saves the reformatted DataFrame to a new CSV file in the specified output path. + The function saves the reformatted DataFrame to a new CSV file + in the specified output path. 
""" - + # Look for UV210.csv in the directory file_path = os.path.join(scot_input_folder, "UV210.csv") if not os.path.exists(file_path): @@ -453,46 +476,55 @@ def reformat_uv210(scot_input_folder, LAD_lookup_file_path, config): df = df.iloc[:-3, :] # Preserve the original order of 'Council Area 2019' and 'Marital Status' - council_area_order = df['Council Area 2019'].unique() - english_language_order = df['English language skills - 11 groups, all'].unique() + council_area_order = df["Council Area 2019"].unique() + english_language_order = df["English language skills - 11 groups, all"].unique() # Convert columns to Categorical to preserve order - df['Council Area 2019'] = pd.Categorical(df['Council Area 2019'], categories=council_area_order, ordered=True) - df['English language skills - 11 groups, all'] = pd.Categorical(df['English language skills - 11 groups, all'], categories=english_language_order, ordered=True) + df["Council Area 2019"] = pd.Categorical( + df["Council Area 2019"], categories=council_area_order, ordered=True + ) + df["English language skills - 11 groups, all"] = pd.Categorical( + df["English language skills - 11 groups, all"], + categories=english_language_order, + ordered=True, + ) # Sort the DataFrame to ensure the order is preserved - df = df.sort_values(by=['Council Area 2019', 'English language skills - 11 groups, all']) + df = df.sort_values(by=["Council Area 2019", "English language skills - 11 groups, all"]) # Pivot the DataFrame - pivoted_df = df.pivot(index="Council Area 2019", columns="English language skills - 11 groups, all", values="Count") + pivoted_df = df.pivot( + index="Council Area 2019", columns="English language skills - 11 groups, all", values="Count" + ) # Reset the index to make 'Council Area 2019' a column again pivoted_df.reset_index(inplace=True) # Load the LAD codes and names lookup file lookup_df = pd.read_csv(LAD_lookup_file_path) - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) + lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"])) # Replace council area names with LAD codes - pivoted_df['Council Area 2019'] = ( - pivoted_df['Council Area 2019'] + pivoted_df["Council Area 2019"] = ( + pivoted_df["Council Area 2019"] .str.strip() .str.lower() .map(lookup_dict) - .fillna(pivoted_df['Council Area 2019']) + .fillna(pivoted_df["Council Area 2019"]) ) - + # Replace the column name 'Council Area 2019' with 'CA19' - pivoted_df.rename(columns={'Council Area 2019': 'CA19'}, inplace=True) + pivoted_df.rename(columns={"Council Area 2019": "CA19"}, inplace=True) output_file_path = os.path.join(config["reformat_scot_input_folder"], "reformat_UV210.csv") pivoted_df.to_csv(output_file_path, index=False) + def reformat_migrant_indicator(scot_input_folder, LAD_lookup_file_path, config): """ - Reformat the migrant indicator CSV file to move the last column of the DataFrame which contains total - to be the second column so that it is consistent with other tables. - Replace CA names with codes. + Reformat the migrant indicator CSV file to move the last column of the + DataFrame which contains total to be the second column so that it is + consistent with other tables. Replace CA names with codes. Parameters ---------- @@ -506,7 +538,8 @@ def reformat_migrant_indicator(scot_input_folder, LAD_lookup_file_path, config): Returns ------- None - The function saves the reformatted DataFrame to a new CSV file in the specified output path. 
def reformat_migrant_indicator(scot_input_folder, LAD_lookup_file_path, config):
    """
-    Reformat the migrant indicator CSV file to move the last column of the DataFrame which contains total
-    to be the second column so that it is consistent with other tables.
-    Replace CA names with codes.
+    Reformat the migrant indicator CSV file to move the last column of the
+    DataFrame, which contains the totals, to be the second column so that it is
+    consistent with other tables. Replace CA names with codes.

    Parameters
    ----------
@@ -506,7 +538,8 @@ def reformat_migrant_indicator(scot_input_folder, LAD_lookup_file_path, config):
    Returns
    -------
    None
-        The function saves the reformatted DataFrame to a new CSV file in the specified output path.
+        The function saves the reformatted DataFrame to a new CSV file
+        in the specified output path.
    """
    # Look for migrant_indicator.csv in the directory
    file_path = os.path.join(scot_input_folder, "migrant_indicator.csv")
    if not os.path.exists(file_path):
@@ -518,67 +551,73 @@ def reformat_migrant_indicator(scot_input_folder, LAD_lookup_file_path, config):
    df = pd.read_csv(file_path, skiprows=9, header=None)

    # Remove rows where column A contains specific substrings
-    df = df[~df.iloc[:, 0].str.contains(r'Total|Dataset|INFO|\(c\)', case=False, na=False)]
+    df = df[~df.iloc[:, 0].str.contains(r"Total|Dataset|INFO|\(c\)", case=False, na=False)]

    # Remove columns where all rows except column 1 are blank
-    empty_columns_removed_df = df.dropna(axis=1, how='all')
-
+    empty_columns_removed_df = df.dropna(axis=1, how="all")
+
    # Look to see if there are more than two columns
    columns = list(empty_columns_removed_df.columns)
    if len(columns) < 2:
        # If there are fewer than 2 columns, no change is needed
-        return df
+        return df
    # Identify the last column
-    last_column = columns[-1]
-    # Rearrange the columns to move the last column which it totals to the second position
+    last_column = columns[-1]
+    # Rearrange the columns to move the last column, which holds the totals,
+    # to the second position
    new_order = [columns[0], last_column] + columns[1:-1]
    reformatted_df = empty_columns_removed_df[new_order]
-
    # Remove value from cell A1
    reformatted_df.iloc[0, 0] = ""  # Remove the value in A1 (table name)
-
    if len(reformatted_df) > 1 and reformatted_df.shape[1] > 1:
-        # Move the values from row 1 (index 0) in columns B onward (index 1 onward) to row 2 (index 1)
+        # Move the values from row 1 (index 0) in columns B onward
+        # (index 1 onward) to row 2 (index 1)
        reformatted_df.iloc[1, 1:] = reformatted_df.iloc[0, 1:].values
-
-        # Clear the original values in row 1 (index 0) from column B onward (index 1 onward)
+
+        # Clear the original values in row 1 (index 0) from column B onward
+        # (index 1 onward)
        reformatted_df.iloc[0, 1:] = np.nan
-
        # Drop the first (empty) row and reset the index
        reformatted_df = reformatted_df.drop(index=0).reset_index(drop=True)
-
    # Remove the row with default integer headers if it exists
    reformatted_df.columns = reformatted_df.iloc[0]  # Set the first row as column headers
-    reformatted_df = reformatted_df[1:].reset_index(drop=True)  # Drop the first row and reset the index
+    reformatted_df = reformatted_df[1:].reset_index(
+        drop=True
+    )  # Drop the first row and reset the index

    # Load the LAD codes and names lookup file
    lookup_df = pd.read_csv(LAD_lookup_file_path)
-    lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD']))
+    lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"]))

    # Replace council area names with LAD codes
-    reformatted_df['Council Area 2019'] = (
-        reformatted_df['Council Area 2019']
+    reformatted_df["Council Area 2019"] = (
+        reformatted_df["Council Area 2019"]
        .str.strip()
        .str.lower()
        .map(lookup_dict)
-        .fillna(reformatted_df['Council Area 2019'])
+        .fillna(reformatted_df["Council Area 2019"])
    )

    # Replace the column name 'Council Area 2019' with 'CA19'
-    reformatted_df.rename(columns={'Council Area 2019': 'CA19'}, inplace=True)
+    reformatted_df.rename(columns={"Council Area 2019": "CA19"}, inplace=True)

    # Ensure reformat_scot_input_folder exists
    os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True)

    # Save the new DataFrame to a CSV file
-    output_file_path = os.path.join(config["reformat_scot_input_folder"], "reformat_migrant_indicator.csv")
+    output_file_path = os.path.join(
+        config["reformat_scot_input_folder"], "reformat_migrant_indicator.csv"
+    )
    reformatted_df.to_csv(output_file_path, index=False)
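The column reordering performed above is just a permutation of the column list. A toy illustration of promoting a trailing totals column to second position (column labels are invented):

import pandas as pd

df = pd.DataFrame({"area": ["A", "B"], "x": [1, 2], "y": [3, 4], "total": [4, 6]})

columns = list(df.columns)
# Keep the key column first and move the last (totals) column into second place
new_order = [columns[0], columns[-1]] + columns[1:-1]
print(df[new_order].columns.tolist())  # ['area', 'total', 'x', 'y']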
os.path.join(config["reformat_scot_input_folder"], "reformat_migrant_indicator.csv") + output_file_path = os.path.join( + config["reformat_scot_input_folder"], "reformat_migrant_indicator.csv" + ) reformatted_df.to_csv(output_file_path, index=False) - def extract_pop_density_table(scot_input_folder): """ - Extracts the 'Table 4' sheet from an Excel file, saves it as a CSV, + Extracts the 'Table 4' sheet from an Excel file, saves it as a CSV, and deletes the original Excel file. Parameters @@ -601,11 +640,11 @@ def extract_pop_density_table(scot_input_folder): try: # Load only the sheet named 'Table 4' df = pd.read_excel(population_density_xlsx, sheet_name="Table 4") - + # Save the extracted sheet as a CSV df.to_csv(population_density_csv, index=False) logger.info(f"Sheet 'Table 4' has been saved as {population_density_csv}.") - + # Remove the original Excel file os.remove(population_density_xlsx) logger.info(f"The file {population_density_xlsx} has been deleted from the folder.") @@ -615,9 +654,10 @@ def extract_pop_density_table(scot_input_folder): def reformat_pop_density(scot_input_folder, config): """ - Reformats the population density file by removing unnecessary rows and amending column headers. + Reformats the population density file by removing unnecessary rows and + amending column headers. The output includes CA codes. - + Parameters ---------- scot_input_folder : str @@ -628,7 +668,8 @@ def reformat_pop_density(scot_input_folder, config): Returns ------- None - The function saves the reformatted DataFrame to a new CSV file in the specified output path. + The function saves the reformatted DataFrame to a new CSV file in the + specified output path. """ # Look for population_density.csv in the directory @@ -641,29 +682,31 @@ def reformat_pop_density(scot_input_folder, config): df = pd.read_csv(file_path, skiprows=3, usecols=[0, 1, 2, 3]) # Rename columns using their index - df.columns.values[1] = "CA19" - df.columns.values[3] = "Population density (number of usual residents per square kilometre)" + df.columns.values[1] = "CA19" + df.columns.values[3] = "Population density (number of usual residents per square kilometre)" # Remove the first and third columns by index df = df.drop(df.columns[[0, 2]], axis=1) - # Check if the row contains 'S92000003' and remove the row if true. + # Check if the row contains 'S92000003' and remove the row if true. # 'S92000003' is the whole of Scotland - df = df[df.iloc[:, 0] != 'S92000003'] + df = df[df.iloc[:, 0] != "S92000003"] # Ensure reformat_scot_input_folder exists os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True) - + # Save to a CSV - output_file_path = os.path.join(config["reformat_scot_input_folder"], "reformat_population_density.csv") + output_file_path = os.path.join( + config["reformat_scot_input_folder"], "reformat_population_density.csv" + ) df.to_csv(output_file_path, index=False) - def extract_metadata_from_files(scot_input_folder): """ Extracts metadata from CSV files in the specified input directory. - Special handling is applied for the 'migrant_indicator_percentage.csv' and 'population_density.csv' files. + Special handling is applied for the 'migrant_indicator_percentage.csv' + and 'population_density.csv' files. It also skips the 'reformat_' files as these are repeats of the UV files. 
Parameters @@ -682,7 +725,6 @@ def extract_metadata_from_files(scot_input_folder): la_files = os.listdir(scot_input_folder) metadata = [] # List to store metadata for each file - for file in la_files: # Skip the disability file 'UV303a.csv' if file == "UV303a.csv": @@ -691,23 +733,27 @@ def extract_metadata_from_files(scot_input_folder): if file.startswith("reformat_"): continue # Check for migrant_indicator table and explicitly define its metadata - if file == "migrant_indicator.csv": - metadata.append({ - "table_id": "migrant_indicator", - "table_name": "Migrant Indicator", - "unit": "Person" - }) + if file == "migrant_indicator.csv": + metadata.append( + { + "table_id": "migrant_indicator", + "table_name": "Migrant Indicator", + "unit": "Person", + } + ) continue # Skip further processing for this specific table - + # Check for population_density table and explicitly define its metadata - if file == "population_density.csv": - metadata.append({ - "table_id": "population_density", - "table_name": "Population Density", - "unit": "Persons per square kilometer" - }) + if file == "population_density.csv": + metadata.append( + { + "table_id": "population_density", + "table_name": "Population Density", + "unit": "Persons per square kilometer", + } + ) continue # Skip further processing for this specific table - + t_tab_loc = file # Extract the table id table_id = os.path.splitext(t_tab_loc)[0] @@ -716,64 +762,63 @@ def extract_metadata_from_files(scot_input_folder): with open(os.path.join(scot_input_folder, t_tab_loc), "r") as f: reader = csv.reader(f) rows = list(reader) - + table_name = None # Initialize table_name # Ensure there are at least 4 rows in the file if len(rows) >= 4: # Extract the first column of row 4 - row_4 = rows[3][0] - + row_4 = rows[3][0] + # Special case for UV607.csv if t_tab_loc.lower() == "uv607.csv": # Extract the portion after the second hyphen and ignore further hyphens - parts = row_4.split('-') + parts = row_4.split("-") if len(parts) > 2: # Combine everything after the second hyphen into one string - table_name = '-'.join(parts[2:]).strip() + table_name = "-".join(parts[2:]).strip() # If 'All' is present, extract only the part before 'All' - if 'All' in table_name: - table_name = table_name.split('All', 1)[0].strip() + if "All" in table_name: + table_name = table_name.split("All", 1)[0].strip() else: # Default behavior for other files - parts = row_4.split('-') + parts = row_4.split("-") if len(parts) > 2: # Extract the portion after the second hyphen table_name = parts[2].strip() - + # If there's a third hyphen, extract only the part before it if len(parts) > 3: - table_name = parts[2].split('-', 1)[0].strip() - + table_name = parts[2].split("-", 1)[0].strip() + # If the word 'All' is present, extract only the part before 'All' - if 'All' in table_name: - table_name = table_name.split('All', 1)[0].strip() - + if "All" in table_name: + table_name = table_name.split("All", 1)[0].strip() # Initialize table_includes with a default value table_includes = [] # Ensure there are at least 9 rows in the file if len(rows) > 8: table_includes = rows[8] # Directly point to the 9th row - + # Find the unit of measure unit = "-" if "Households" in table_includes: unit = "Household" elif "Individuals" in table_includes: - unit = "Person" + unit = "Person" # Append the metadata for the current file to the list - metadata.append({ - "table_id": table_id, - "table_name": table_name, - "unit": unit - }) - + metadata.append({"table_id": table_id, "table_name": table_name, "unit": 
unit}) + # Check if metadata list is populated correctly if not metadata: - logger.warning("Warning: Metadata list is empty. No files were processed or metadata extraction failed.") + logger.warning( + "Warning: Metadata list is empty. No files were processed or metadata extraction failed." + ) else: - logger.info(f"Metadata extraction completed successfully. Extracted {len(metadata)} entries.") + logger.info( + f"Metadata extraction completed successfully. Extracted {len(metadata)} entries." + ) for entry in metadata: if not all(key in entry for key in ["table_id", "table_name", "unit"]): logger.warning(f"Warning: Incomplete metadata entry found: {entry}") @@ -781,7 +826,6 @@ def extract_metadata_from_files(scot_input_folder): return metadata - def replace_ca19_names_with_codes(scot_input_folder, LAD_lookup_file_path, config): """ Replace council area names with council area codes in CSV files. @@ -792,85 +836,98 @@ def replace_ca19_names_with_codes(scot_input_folder, LAD_lookup_file_path, confi Path to the directory containing input CSV files. LAD_lookup_file_path : str Path to the lookup CSV file containing council area names and codes. - config : dict + config : dict Configuration dictionary containing paths and file names. Returns ------- None - The function modifies the CSV files in place and saves them to the specified output path in the config dictionary. + The function modifies the CSV files in place and saves them to + the specified output path in the config dictionary. """ # Load the LAD codes and names lookup file lookup_df = pd.read_csv(LAD_lookup_file_path) # Assuming the file has headers - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) # Create a dictionary for lookup (place names -> place codes) - + lookup_dict = dict( + zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"]) + ) # Create a dictionary for lookup (place names -> place codes) # Process each CSV file in the input directory, skipping certain files for file_name in os.listdir(scot_input_folder): - if file_name.lower() in ["uv101b.csv", "uv303a.csv", "uv103.csv", "migrant_indicator.csv", "population_density.csv"]: + if file_name.lower() in [ + "uv101b.csv", + "uv303a.csv", + "uv103.csv", + "migrant_indicator.csv", + "population_density.csv", + ]: continue file_path = os.path.join(scot_input_folder, file_name) - + # Read the input CSV file - if "UV606" in file_name or "UV604" in file_name: + if "UV606" in file_name or "UV604" in file_name: df = pd.read_csv(file_path, header=None, skiprows=10) else: df = pd.read_csv(file_path, header=None, skiprows=9) - # Locate the row where 'Council Area 2019' appears in column 1 if 0 in df.columns: # Ensure column 0 exists in the input DataFrame - council_area_row_index = df[df.iloc[:, 0] == 'Council Area 2019'].index + council_area_row_index = df[df.iloc[:, 0] == "Council Area 2019"].index if not council_area_row_index.empty: # Check if the value exists start_index = council_area_row_index[0] + 1 # Start processing rows below this index - + # Slice the DataFrame to include only rows below the specified cell df_below = df.iloc[start_index:] - + # Strip spaces and convert to lowercase for consistent matching df_below.iloc[:, 0] = df_below.iloc[:, 0].str.strip().str.lower() - + # Replace values in column 0 using the lookup dictionary - df_below.iloc[:, 0] = df_below.iloc[:, 0].map(lookup_dict).fillna(df_below.iloc[:, 0]) # Replace matching values, keep original if no match - + df_below.iloc[:, 0] = ( + df_below.iloc[:, 
0].map(lookup_dict).fillna(df_below.iloc[:, 0])
+                )  # Replace matching values, keep original if no match
+
                # Update the original DataFrame with the modified rows
                df.iloc[start_index:] = df_below
            else:
                logger.info(f"'Council Area 2019' not found in {file_name}. Skipping replacement.")
        else:
            logger.info(f"Column 0 not found in {file_name}. Skipping replacement.")
-
+
        # Ensure reformat_scot_input_folder exists
        os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True)
-
+
        # Save the reformat DataFrame to a new CSV file
-        reformat_file_path = os.path.join(config["reformat_scot_input_folder"], f"reformat_{file_name}")
+        reformat_file_path = os.path.join(
+            config["reformat_scot_input_folder"], f"reformat_{file_name}"
+        )
        df.to_csv(reformat_file_path, index=False, header=False)

-
-
def remove_rows(config, folderpath):
    """
    Processes all CSV files in the input directory that start with 'reformat_'.
-    Modifies the files in place by performing specific preprocessing steps which includ
-    removing the last three rows as there is extra informaiton outside of the data table,
-    renaming column heading 'Council Area 2019' to 'CA19' and resetting the index.
+    Modifies the files in place by performing specific preprocessing steps
+    which include removing the last three rows as there is extra information
+    outside of the data table, renaming column heading 'Council Area 2019' to
+    'CA19' and resetting the index.

    Parameters
    ----------
-    config : dict
+    config : dict
        Configuration dictionary containing paths and file names.
    folderpath: str
-        The folder containing the files to loop through, likely as config["reformat_scot_input_folder"]
+        The folder containing the files to loop through, likely as
+        config["reformat_scot_input_folder"]

    Returns
    -------
    None
        The function modifies the 'reformat_' CSV files in place
    """
-
-    reformat_scot_input_folder = config["reformat_scot_input_folder"] # Store the folder path in a variable
+
+    reformat_scot_input_folder = config[
+        "reformat_scot_input_folder"
+    ]  # Store the folder path in a variable

    for file_name in os.listdir(folderpath):
        # Construct the full file path
@@ -878,48 +935,45 @@ def remove_rows(config, folderpath):
        # Process files starting with "reformat_"
        if file_name.startswith("reformat_"):
-
            try:
                # Read the CSV file
-                df = pd.read_csv(file_path, on_bad_lines='warn', header=None)
-
+                df = pd.read_csv(file_path, on_bad_lines="warn", header=None)
+
                # Remove the last 3 rows
                df = df.iloc[:-3, :]
-
+
                # Replace any cell in the DataFrame that says "Council Area 2019" with "CA19"
                df.replace("Council Area 2019", "CA19", inplace=True)

                # Remove value from cell A1 (table name)
-                df.iloc[0, 0] = ""
-
-                # Move the values from row 1 (index 0) in columns B onward (index 1 onward) to row 2 (index 1)
+                df.iloc[0, 0] = ""
+
+                # Move the values from row 1 (index 0) in columns B
+                # onward (index 1 onward) to row 2 (index 1)
                df.iloc[1, 1:] = df.iloc[0, 1:]
-
+
                # Clear the original values in row 1 (index 0) from column B onward (index 1 onward)
                df.iloc[0, 1:] = np.nan
-
+
                # Drop the first (empty) row and reset the index
                df = df.drop(index=0).reset_index(drop=True)
-
+
                # Check the save location exists
                os.makedirs(os.path.dirname(file_path), exist_ok=True)

                # Save the modified DataFrame back to the same file (edit in place)
                df.to_csv(file_path, index=False, header=False)
-
            except pd.errors.ParserError as e:
                logger.error(f"Error processing {file_name}: {e}")
        else:
            logger.info(f"Skipping non-reformat file: {file_name}")

-
-
-
def 
replace_variable_names_with_codes(config): """ Replace the variable names with the variable ids. - Extract the variable name and variable ids for use in the metadata table. + Extract the variable name and variable ids for use in the metadata table. Parameters ---------- @@ -931,19 +985,19 @@ def replace_variable_names_with_codes(config): Returns ------- variable_names_ids: list - A list of tuples, each containing a list of variable names and a list of variable ids for each processed file. + A list of tuples, each containing a list of variable names and a list + of variable ids for each processed file. """ # Initialize a list to store variable_names and variable_ids for each file - variable_names_ids = [] + variable_names_ids = [] # Iterate through each file in the input directory for file_name in os.listdir(config["reformat_scot_input_folder"]): - if "reformat_" in file_name and file_name.endswith(".csv"): - + if "reformat_" in file_name and file_name.endswith(".csv"): file_path = os.path.join(config["reformat_scot_input_folder"], file_name) - + # Read the CSV file - df = pd.read_csv(file_path, on_bad_lines='warn', header=0) + df = pd.read_csv(file_path, on_bad_lines="warn", header=0) # Create new column names with zero padding, excluding the first column variable_names = df.columns @@ -951,25 +1005,33 @@ def replace_variable_names_with_codes(config): # Check if the file is 'reformat_population_density.csv' if file_name == "reformat_population_density.csv": # Explicitly define variable names and variable IDs - variable_names = ["Population density (number of usual residents per square kilometre)"] + variable_names = [ + "Population density (number of usual residents per square kilometre)" + ] variable_ids = ["population_density"] df.columns = [df.columns[0]] + list(variable_ids) elif file_name == "reformat_migrant_indicator.csv": # For this specific table, keep variable IDs the same as variable names # Replace whitespaces and slashes with underscores in variable IDs - variable_ids = [name.replace(" ", "_").replace("/", "_") for name in variable_names[1:]] # Exclude the first column + variable_ids = [ + name.replace(" ", "_").replace("/", "_") for name in variable_names[1:] + ] # Exclude the first column df.columns = [df.columns[0]] + list(variable_ids) else: # Extract the table_id from the file name after "reformat_" table_id = file_name.split("reformat_")[1].split(".")[0] # Create a list of new column names - variable_ids = [f"{table_id}{str(i).zfill(4)}" for i in range(1, len(variable_names))] + variable_ids = [ + f"{table_id}{str(i).zfill(4)}" for i in range(1, len(variable_names)) + ] - # Replace the existing column names of the df from column B onward with the variable IDs - df.columns = [df.columns[0]] + list(variable_ids) # Keep column A unchanged, replace column B onward + # Replace the existing column names of the df from column B onward + # with the variable IDs + df.columns = [df.columns[0]] + list( + variable_ids + ) # Keep column A unchanged, replace column B onward - # Drop the last column unless the file name is one of the specified files if file_name not in [ "reformat_UV101b.csv", @@ -981,21 +1043,19 @@ def replace_variable_names_with_codes(config): # Ensure QA directory exists os.makedirs(os.path.dirname(config["reformat_scot_input_folder"]), exist_ok=True) - + # Save the modified DataFrame back to the original file path to overwrite it df.to_csv(file_path, index=False, header=True) - # Append the variable_names and variable_ids to the results list 
variable_names_ids.append((variable_names, variable_ids))
        else:
            logger.info(f"Skipping file: {file_name}")
-
+
    # Return the list of results
    return variable_names_ids

-
def concat_reformatted_tables(config):
    """
    Concatenates all CSV files in the QA folder that start with "reformat"
@@ -1003,7 +1063,7 @@ def concat_reformatted_tables(config):

    Parameters
    ----------
-    config : dict
+    config : dict
        Configuration dictionary containing paths and file names.

    Returns
@@ -1013,7 +1073,7 @@ def concat_reformatted_tables(config):
    """

    # Concat reformatted tables
-    folder_path = config["reformat_scot_input_folder"]
+    folder_path = config["reformat_scot_input_folder"]

    # List all files in the folder that start with "reformat"
    files = [f for f in os.listdir(folder_path) if f.startswith("reformat") and f.endswith(".csv")]
@@ -1024,10 +1084,7 @@ def concat_reformatted_tables(config):
    for file in files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)  # Read the CSV file
-
-        # Extract the first column as the join key
-        join_key = df.iloc[:, 0]  # First column
-
+
        if result is None:
            # For the first file, include all columns and set the first column as the index
            result = df.copy()  # Keep all columns, including the join key
@@ -1035,13 +1092,13 @@ def concat_reformatted_tables(config):
        else:
            # For subsequent files, exclude the first column and merge based on the join key
            data = df.set_index(df.columns[0]).iloc[:, :]  # Set the first column as the index
-            result = pd.merge(result, data, left_index=True, right_index=True, how='outer')
+            result = pd.merge(result, data, left_index=True, right_index=True, how="outer")

    # Ensure the input directory exists
    os.makedirs(os.path.dirname(config["input_directory"]), exist_ok=True)
-
+
    # Save the concatenated DataFrame to a new CSV file
    concatenated_file_path = os.path.join(config["input_directory"], "CA19_all_variables.csv")
    result.to_csv(concatenated_file_path, index=True)

-    return result
\ No newline at end of file
+    return result
diff --git a/area_classification/main_pipeline.py b/area_classification/main_pipeline.py
index 05137d8..fcf98a1 100644
--- a/area_classification/main_pipeline.py
+++ b/area_classification/main_pipeline.py
@@ -1,23 +1,22 @@
 import os

-import pandas as pd
-from area_classification.utilities.load_config import load_config
-from area_classification.utilities.loading_data import load_format_data
-from area_classification.downloading_data.ew_lad_bulk_download import ew_lad_bulk_download
-from area_classification.downloading_data.ni_lgd_downloading_data import ni_lgd_download_data
+from area_classification.clustering.clustering import clustering_wrapper
+from area_classification.downloading_data.ew_lad_bulk_download import ew_lad_bulk_download
+from area_classification.downloading_data.ni_lgd_downloading_data import ni_lgd_download_data
 from area_classification.downloading_data.scot_tables_reformatting import scot_reformatting_wrapper
-from area_classification.pre_processing.pre_processing import pre_processing
-from area_classification.pre_processing.drop_variables import check_drop_columns_true
-from area_classification.clustering.clustering import clustering_wrapper
 from area_classification.post_processing.post_processing import post_processing
-from area_classification.pre_processing.prepare_clustering_data import prepare_clustering_data
+from area_classification.pre_processing.drop_variables import check_drop_columns_true
+from area_classification.pre_processing.pre_processing import pre_processing
+from area_classification.pre_processing.prepare_clustering_data import prepare_clustering_data
+from area_classification.utilities.load_config import load_config
+from area_classification.utilities.loading_data import load_format_data
+
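The outer index-merge that concat_reformatted_tables above builds up file by file can be seen in isolation on two tiny frames (values invented); areas present in only one file survive the join with NaN gaps:

import pandas as pd

a = pd.DataFrame({"CA19": ["S1", "S2"], "v1": [1, 2]}).set_index("CA19")
b = pd.DataFrame({"CA19": ["S2", "S3"], "v2": [3, 4]}).set_index("CA19")

# how="outer" keeps every area seen in either file
print(pd.merge(a, b, left_index=True, right_index=True, how="outer"))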
def main_pipeline():
    """
    Main pipeline to process area classification data.

-    This function runs the entire pipeline for creation of the Local Authority District area classification
-    clusters, including downloading, formatting, pre-processing, and clustering.
+    This function runs the entire pipeline for creation of the Local Authority
+    District area classification clusters, including downloading, formatting,
+    pre-processing, and clustering.

    Steps
    -----
@@ -28,7 +27,8 @@ def main_pipeline():
    5. Establish the variables which will be used for clustering (some may be dropped).
    6. Standardise the pre-processed data for clustering.
    7. Perform clustering on the pre-processed data, using variables chosen.
-    8. Reformat the cluster tables, calculate the means of the clustered data and generate radial plots and bar charts.
+    8. Reformat the cluster tables, calculate the means of the clustered data and
+       generate radial plots and bar charts.

    Parameters
    ----------
@@ -36,38 +36,46 @@ def main_pipeline():

    Notes
    -----
-    - The configuration file `area_classification/config.yaml` is loaded to provide all necessary settings.
-    - The clustering step assumes pre-processed data is saved locally and loads it during clustering.
+    - The configuration file `area_classification/config.yaml` is loaded to
+      provide all necessary settings.
+    - The clustering step assumes pre-processed data is saved locally and
+      loads it during clustering.
    """
-    config = load_config('area_classification/config.yaml')
+    config = load_config("area_classification/config.yaml")

    # Step 1: Download England and Wales data and reformat to be processed and combined
    ew_lad_bulk_download(config)
    ew_input_csv_path = os.path.join(config["input_directory"], "./ew_downloads/")
-    ew_df = load_format_data(ew_input_csv_path, config["ew_file_pattern"],config["ew_join_column_name"], config)
+    ew_df = load_format_data(
+        ew_input_csv_path, config["ew_file_pattern"], config["ew_join_column_name"], config
+    )

    # Step 2: Download Northern Ireland data and reformat to be processed and combined
    ni_lgd_download_data(config)
    ni_input_csv_path = os.path.join(config["input_directory"], "./ni_downloads/")
-    ni_df = load_format_data(ni_input_csv_path, config["ni_file_pattern"],config["ni_join_column_name"], config)
-
+    ni_df = load_format_data(
+        ni_input_csv_path, config["ni_file_pattern"], config["ni_join_column_name"], config
+    )
+
    # Step 3: Processing of Scotland data which was manually downloaded
-    scot_df = scot_reformatting_wrapper(config["scot_input_folder"], config["LAD_lookup_file_path"], config)
+    scot_df = scot_reformatting_wrapper(
+        config["scot_input_folder"], config["LAD_lookup_file_path"], config
+    )

    # Step 4: Pre-processing
-    preprocessed_df = pre_processing(ew_df , ni_df, scot_df, config)
+    preprocessed_df = pre_processing(ew_df, ni_df, scot_df, config)

    # Step 5: Choose to drop/not drop
-    # If not running the full 60 variables, update the 'drop_columns' in the config to
+    # If not running the full 60 variables, update the 'drop_columns' in the config to
    # True and change the 'variables_to_drop' in the config
    chosen_clustering_variables = check_drop_columns_true(config, preprocessed_df)

    # Step 6: Standardise pre_clustering data (used in the clustering)
    pre_clustering_data_std_mean = prepare_clustering_data(chosen_clustering_variables)
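prepare_clustering_data itself is not shown in this diff; assuming it applies the usual z-score standardisation ahead of K-means, the transformation would be equivalent to the sketch below (the helper name and the ddof choice are assumptions, not the project's code):

import pandas as pd

def zscore_standardise(df: pd.DataFrame) -> pd.DataFrame:
    """Scale each numeric variable to mean 0 and standard deviation 1."""
    numeric = df.select_dtypes(include="number")
    return (numeric - numeric.mean()) / numeric.std(ddof=0)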
-    # Save the standardised pre clusting data to a new file
+    # Save the standardised pre-clustering data to a new file
    pre_clustering_data_std_mean.to_csv(config["pre_clustering_data_std_mean"], index=False)
-
+
    # Step 7: Clustering
    clustering_output = clustering_wrapper(
        config,
@@ -76,14 +84,17 @@ def main_pipeline():
        n_init=config["number_of_times_k_means_initialised"],
        output_directory=config["output_directory"],
        clustergram_directory=config["clustergram_directory"],
-        random_seed=config["random_seed"]
+        random_seed=config["random_seed"],
    )
-
+
    # Pause so the clustering outputs can be reviewed before continuing
    input("Press Enter to continue to post processing...")
-
+
    # Step 8: Post processing
-    combined_group_means, combined_subgroup_means, uk_std_cluster_means = post_processing(config, clustering_output, chosen_clustering_variables)
+    combined_group_means, combined_subgroup_means, uk_std_cluster_means = post_processing(
+        config, clustering_output, chosen_clustering_variables
+    )
+

if __name__ == "__main__":
-    main_pipeline()
\ No newline at end of file
+    main_pipeline()
diff --git a/area_classification/post_processing/cluster_std_means_to_parent_clusters.py b/area_classification/post_processing/cluster_std_means_to_parent_clusters.py
index 2879c27..48b5de0 100644
--- a/area_classification/post_processing/cluster_std_means_to_parent_clusters.py
+++ b/area_classification/post_processing/cluster_std_means_to_parent_clusters.py
@@ -2,16 +2,19 @@
 # group means standardised to the supergroup mean
 # subgroup means standardised to the group mean

-import pandas as pd
 import os
-from collections import defaultdict

-def cluster_std_means_to_parent_clusters(config, restructured_cluster_table_df, chosen_clustering_variables):
+import pandas as pd
+
+
+def cluster_std_means_to_parent_clusters(
+    config, restructured_cluster_table_df, chosen_clustering_variables
+):
    """
    This function reads the clustering output CSV file and the pre-clustering data CSV file.
-    It creates standardised means of the values in a cluster to their parent cluster. It then
+    It creates standardised means of the values in a cluster to their parent cluster. It then
    saves the standardised means to a new CSV file.
-    Once it has the standardised means, it creates means for each cluster.
+    Once it has the standardised means, it creates means for each cluster.

    Parameters
    ----------
@@ -21,7 +24,7 @@ def cluster_std_means_to_parent_clusters(
        DataFrame containing the pre-clustering data with the chosen clustering variables.
    restructured_cluster_table_df : pd.DataFrame
        DataFrame of cluster assignments. Data will have the following format:
-
+
        LAD_name   | LAD_code  | supergroup| group | subgroup
        -------------------------------------------
        Hartlepool | E06000001 | 1         | 1c    | 1c1

    Returns
    -------
    tuple of pd.DataFrame
-        (combined_group_means, combined_subgroup_means): DataFrames containing standardised means for each group and subgroup.
+        (combined_group_means, combined_subgroup_means): DataFrames containing
+        standardised means for each group and subgroup.
- """ + """ # Merge the two DataFrames on the LAD CODE column merged_df = restructured_cluster_table_df.merge( - chosen_clustering_variables , on="LAD_code", how="left" + chosen_clustering_variables, on="LAD_code", how="left" ) - + # Define the output directory output_directory = config["output_directory"] - + # Define the path for the std_means output folder output_directory = os.path.join(output_directory, "std_means") - + # --- Generate CSVs for supergroups --- # Sort the data by the supergroup column merged_df = merged_df.sort_values(by=["supergroup"]) @@ -52,7 +56,7 @@ def cluster_std_means_to_parent_clusters(config, restructured_cluster_table_df, all_subgroup_means = [] # Group by supergroup and process each supergroup's data - for supergroup, supergroup_data in merged_df.groupby("supergroup"): + for _supergroup, supergroup_data in merged_df.groupby("supergroup"): # Sort the data by 'group' supergroup_data = supergroup_data.sort_values(by=["group"]) # Drop the 'subgroup' column @@ -81,7 +85,7 @@ def cluster_std_means_to_parent_clusters(config, restructured_cluster_table_df, merged_df = merged_df.sort_values(by=["group", "subgroup"]) # Group by group and process each group's data - for group, group_data in merged_df.groupby("group"): + for _group, group_data in merged_df.groupby("group"): # Identify columns starting with 'v' v_columns = [col for col in group_data.columns if col.startswith("v")] @@ -105,16 +109,22 @@ def cluster_std_means_to_parent_clusters(config, restructured_cluster_table_df, combined_subgroup_means = pd.concat(all_subgroup_means, ignore_index=True) # Create the 'parent_std_means' subfolder within 'std_means' - parent_std_means_directory = os.path.join(config["output_directory"], "std_means", "parent_std_means") + parent_std_means_directory = os.path.join( + config["output_directory"], "std_means", "parent_std_means" + ) os.makedirs(parent_std_means_directory, exist_ok=True) # Save the group output file path within the 'parent_std_means' folder - group_output_file_path = os.path.join(parent_std_means_directory, "parent_std_cluster_group_means_output.csv") + group_output_file_path = os.path.join( + parent_std_means_directory, "parent_std_cluster_group_means_output.csv" + ) combined_group_means.to_csv(group_output_file_path, index=False) # Save the subgroup output file path within the 'parent_std_means' folder - subgroup_output_file_path = os.path.join(parent_std_means_directory, "parent_std_cluster_subgroup_means_output.csv") + subgroup_output_file_path = os.path.join( + parent_std_means_directory, "parent_std_cluster_subgroup_means_output.csv" + ) combined_subgroup_means.to_csv(subgroup_output_file_path, index=False) # Return the concatenated DataFrames - return combined_group_means, combined_subgroup_means \ No newline at end of file + return combined_group_means, combined_subgroup_means diff --git a/area_classification/post_processing/cluster_summaries.py b/area_classification/post_processing/cluster_summaries.py index 8f02d4c..9423acc 100644 --- a/area_classification/post_processing/cluster_summaries.py +++ b/area_classification/post_processing/cluster_summaries.py @@ -1,19 +1,22 @@ # Creating print statements about the clusters import logging -logger = logging.getLogger(__name__) -import pandas as pd -import numpy as np import os import re -from area_classification.utilities.load_config import load_config +import numpy as np +import pandas as pd -def cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cluster_means, lookup_file, 
cluster_column): +logger = logging.getLogger(__name__) + + +def cluster_summaries_wrapper( + config, restructured_cluster_table_long, uk_std_cluster_means, lookup_file, cluster_column +): """ Wrapper function to execute a series of cluster summary operations post clustering. - This function calculates the cluster variances, population percentages, + This function calculates the cluster variances, population percentages, cluster summaries, and the identification of key drivers for each cluster. Parameters @@ -21,14 +24,16 @@ def cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cl config : dict main pipeline config dictionary containing output directory. restructured_cluster_table_long : pd.DataFrame - A DataFrame containing detailed information about clusters, including the clustering results - and associated variables. + A DataFrame containing detailed information about clusters, including the + clustering results and associated variables. uk_std_cluster_means : pd.DataFrame - A DataFrame containing the mean standardised values of clustering variables for each cluster. + A DataFrame containing the mean standardised values of clustering variables for + each cluster. lookup_file : str Path to the lookup file used for identifying cluster drivers. cluster_column : str - The name of the column in `restructured_cluster_table_long` that identifies the cluster assignments. + The name of the column in `restructured_cluster_table_long` that identifies the + cluster assignments. Steps ----- @@ -39,50 +44,59 @@ def cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cl Returns ------- None - This function does not return a value. It performs operations that generate summaries + This function does not return a value. It performs operations that generate summaries and insights about the clusters. """ - # Step 1 - Variance: + # Step 1 - Variance: variance_df = calculate_cluster_variance(restructured_cluster_table_long, cluster_column) - # Step 2 - Cluster summaries: - cluster_info = cluster_summary(restructured_cluster_table_long, uk_std_cluster_means, variance_df, cluster_column) + # Step 2 - Cluster summaries: + cluster_info = cluster_summary( + config, restructured_cluster_table_long, uk_std_cluster_means, variance_df, cluster_column + ) + + # Step 3 - Cluster drivers: + identify_cluster_drivers( + uk_std_cluster_means, lookup_file, cluster_info, variance_df, cluster_column, top_n=3 + ) - # Step 3 - Cluster drivers: - identify_cluster_drivers(uk_std_cluster_means, lookup_file, cluster_info, variance_df, cluster_column, top_n=3) + return - return def calculate_cluster_variance(restructured_cluster_table_long, cluster_column): """ - Calculates the variance for all columns starting with 'v' for each cluster, computes the average variance - for each cluster + Calculates the variance for all columns starting with 'v' for each cluster, + computes the average variance for each cluster Parameters ---------- restructured_cluster_table_long : pd.DataFrame - A DataFrame containing the data, including columns for LAD code / names, the cluster allocation at different levels - (supergroup, group, and subgroup) and columns starting with 'v' for which variance will be calculated. + A DataFrame containing the data, including columns for LAD code / names, the cluster + allocation at different levels (supergroup, group, and subgroup) and columns starting + with 'v' for which variance will be calculated. 
cluster_column : str - The name of the column in the DataFrame that contains cluster allocations (likely supergroup, group, and subgroup). + The name of the column in the DataFrame that contains cluster allocations + (likely supergroup, group, and subgroup). Returns ------- pd.DataFrame - A DataFrame containing the cluster allocation column called (supergroup, group or subgroup) the variance of each - 'v' column for each cluster, along with an additional column 'cluster_average_variance' that represents the average - variance of all 'v' columns for each cluster. The cluster column becomes the index for the dataframe. + A DataFrame containing the cluster allocation column called (supergroup, group or subgroup) + the variance of each 'v' column for each cluster, along with an additional column + 'cluster_average_variance' that represents the average variance of all 'v' columns for + each cluster. The cluster column becomes the index for the dataframe. Notes ----- - Variance is calculated using the sample variance formula (degrees of freedom = 1). - - The average variance for each cluster is computed by excluding None values from the variance calculations. + - The average variance for each cluster is computed by excluding None values + from the variance calculations. """ data = restructured_cluster_table_long # Identify columns starting with 'v' - v_columns = [col for col in data.columns if col.startswith('v')] + v_columns = [col for col in data.columns if col.startswith("v")] # Initialize a dictionary to store variances cluster_variances = {} @@ -115,21 +129,26 @@ def calculate_cluster_variance(restructured_cluster_table_long, cluster_column): else: cluster_average_variance[cluster_number] = None - variance_df = pd.DataFrame.from_dict(cluster_variances, orient='index') + variance_df = pd.DataFrame.from_dict(cluster_variances, orient="index") # Make the cluster column the index variance_df.index.name = cluster_column variance_df = variance_df.sort_index() # Add the average variance as an additional column - variance_df['cluster_average_variance'] = variance_df.index.map(cluster_average_variance) + variance_df["cluster_average_variance"] = variance_df.index.map(cluster_average_variance) return variance_df -def cluster_summary(restructured_cluster_table_long, uk_std_cluster_means, variance_df, cluster_column): + +def cluster_summary( + config, restructured_cluster_table_long, uk_std_cluster_means, variance_df, cluster_column +): """ Generate a text summary for each cluster based on various metrics and data sources. Parameters ---------- + config : dict + main pipeline config dictionary containing QA directory. restructured_cluster_table_long : pd.DataFrame A DataFrame containing detailed information about clusters, including columns such as 'supergroup', 'LAD_name', and 'v12'. @@ -140,7 +159,8 @@ def cluster_summary(restructured_cluster_table_long, uk_std_cluster_means, varia A DataFrame containing variance information for clusters, indexed by cluster IDs, with a column 'cluster_average_variance'. cluster_column : str - The name of the column in the DataFrame that contains cluster allocations (likely supergroup, group, and subgroup). + The name of the column in the DataFrame that contains cluster allocations + (likely supergroup, group, and subgroup). 
Returns ---------- @@ -160,54 +180,68 @@ def cluster_summary(restructured_cluster_table_long, uk_std_cluster_means, varia if cluster_column == "supergroup": # Sort clusters in ascending order clusters = sorted(clusters) - # Filter rows where 'hierarchy_level' is the same as the cluster_column specified and convert 'cluster' column to integers - filtered_df = ( - uk_std_cluster_means.loc[uk_std_cluster_means['hierarchy_level'] == cluster_column] - .assign(cluster=lambda df: pd.to_numeric(df['cluster'], errors='coerce').astype(int)) - ) + # Filter rows where 'hierarchy_level' is the same as the cluster_column specified + # and convert 'cluster' column to integers + filtered_df = uk_std_cluster_means.loc[ + uk_std_cluster_means["hierarchy_level"] == cluster_column + ].assign(cluster=lambda df: pd.to_numeric(df["cluster"], errors="coerce").astype(int)) elif cluster_column in ["group", "subgroup"]: # Sort based on the numeric part - clusters = sorted(clusters, key=lambda x: int(''.join(filter(str.isdigit, str(x))))) - # Filter rows where 'hierarchy_level' is the same as the cluster_column specified and ensure 'cluster' column is treated as strings - filtered_df = ( - uk_std_cluster_means.loc[uk_std_cluster_means['hierarchy_level'] == cluster_column] - .assign(cluster=lambda df: df['cluster'].astype(str)) - ) + clusters = sorted(clusters, key=lambda x: int("".join(filter(str.isdigit, str(x))))) + # Filter rows where 'hierarchy_level' is the same as the cluster_column specified and + # ensure 'cluster' column is treated as strings + filtered_df = uk_std_cluster_means.loc[ + uk_std_cluster_means["hierarchy_level"] == cluster_column + ].assign(cluster=lambda df: df["cluster"].astype(str)) + filtered_df_QA = os.path.join(config["qa_directory"], "filtered_df") + os.makedirs(filtered_df_QA, exist_ok=True) + filtered_df.to_csv(f"{filtered_df_QA}/filtered_df.csv", index=False) # Initialize a list to store outputs for all clusters cluster_info = [] # Iterate through each cluster for cluster in clusters: - # Filter rows for the current cluster - cluster_data = restructured_cluster_table_long[restructured_cluster_table_long[cluster_column] == cluster] + cluster_data = restructured_cluster_table_long[ + restructured_cluster_table_long[cluster_column] == cluster + ] # Number of local authorities in the current cluster - num_local_authorities = cluster_data['LAD_name'].nunique() + num_local_authorities = cluster_data["LAD_name"].nunique() # Total number of unique local authorities in the dataset - total_local_authorities = restructured_cluster_table_long['LAD_name'].nunique() - + total_local_authorities = restructured_cluster_table_long["LAD_name"].nunique() + # Percentage of local authorities in the current cluster percentage_local_authorities = (num_local_authorities / total_local_authorities) * 100 - + # Find example areas from the restructured_cluster_table_long table - example_areas = restructured_cluster_table_long[restructured_cluster_table_long[cluster_column] == cluster] + example_areas = restructured_cluster_table_long[ + restructured_cluster_table_long[cluster_column] == cluster + ] if not example_areas.empty: - area_names = example_areas['LAD_name'].sample(n=min(3, len(example_areas)), random_state=42).tolist() + area_names = ( + example_areas["LAD_name"] + .sample(n=min(3, len(example_areas)), random_state=42) + .tolist() + ) else: area_names = ["No area found"] # Print the summary for the cluster # Combine the print statements into a single string output = ( - f"Cluster {cluster} contains 
{num_local_authorities} local authorities which is {percentage_local_authorities:.2f}% of UK local authorities. " + f"Cluster {cluster} contains {num_local_authorities} local authorities " + f"which is {percentage_local_authorities:.2f}% of UK local authorities. " ) # Check if the cluster exists in the DataFrame if cluster in variance_df.index: - cluster_avg_variance = variance_df.loc[cluster, 'cluster_average_variance'] - output += f"The average variance for cluster {cluster} is {cluster_avg_variance:.3f}. Example areas: {', '.join(area_names)}" + cluster_avg_variance = variance_df.loc[cluster, "cluster_average_variance"] + output += ( + f"The average variance for cluster {cluster} is {cluster_avg_variance:.3f}. " + f"Example areas: {', '.join(area_names)}" + ) else: output += f"Cluster {cluster} not found in the DataFrame.\n" @@ -216,17 +250,20 @@ def cluster_summary(restructured_cluster_table_long, uk_std_cluster_means, varia return cluster_info -def identify_cluster_drivers(uk_std_cluster_means, lookup_file, cluster_info, variance_df, cluster_column, top_n=5): + +def identify_cluster_drivers( + uk_std_cluster_means, lookup_file, cluster_info, variance_df, cluster_column, top_n=5 +): """ Identifies the key variables that differentiate each cluster from others by analyzing the mean values of variables within a cluster compared to the mean values across all other clusters. The function also maps variable names using a lookup file and provides detailed descriptions of the differences for each cluster. - + Parameters ---------- uk_std_cluster_means : pd.DataFrame - A DataFrame where rows represent clusters and columns represent the mean values + A DataFrame where rows represent clusters and columns represent the mean values of variables for each cluster. lookup_file : str Path to a CSV file containing a lookup table with columns @@ -252,140 +289,186 @@ def identify_cluster_drivers(uk_std_cluster_means, lookup_file, cluster_info, va detailed descriptions and variance values. 
""" - # Filter the uk_std_cluster_means_output to include only the top row and rows with the specified cluster_column - # in the hierarchy_level column - if 'hierarchy_level' not in uk_std_cluster_means.columns: + # Filter the uk_std_cluster_means_output to include only the top row and rows + # with the specified cluster_column in the hierarchy_level column + if "hierarchy_level" not in uk_std_cluster_means.columns: raise ValueError("Means table must contain a 'hierarchy_level' column.") - - uk_std_cluster_means = pd.concat([ - uk_std_cluster_means[uk_std_cluster_means['hierarchy_level'] == cluster_column] - ]) + + uk_std_cluster_means = pd.concat( + [uk_std_cluster_means[uk_std_cluster_means["hierarchy_level"] == cluster_column]] + ) # Remove the hierarchy_level column - uk_std_cluster_means = uk_std_cluster_means.drop(columns=['hierarchy_level']) + uk_std_cluster_means = uk_std_cluster_means.drop(columns=["hierarchy_level"]) # Load the lookup file lookup_df = pd.read_csv(lookup_file) - + # Ensure the lookup file has the required columns - if 'new_code' not in lookup_df.columns or 'variable_name' not in lookup_df.columns: + if "new_code" not in lookup_df.columns or "variable_name" not in lookup_df.columns: raise ValueError("Lookup file must contain 'new_code' and 'variable_name' columns.") - + # Create a mapping dictionary for variable names with new_code in brackets code_to_variable = { - row['new_code']: f"{row['variable_name']} ({row['new_code']})" + row["new_code"]: f"{row['variable_name']} ({row['new_code']})" for _, row in lookup_df.iterrows() } - + # Replace column names in the uk_std_cluster_means uk_std_cluster_means = uk_std_cluster_means.rename(columns=code_to_variable) - for index, row in uk_std_cluster_means.iterrows(): - + for _index, row in uk_std_cluster_means.iterrows(): # Use the value in the 'cluster' column as the cluster_number # cluster_number = int(row['cluster']) - cluster_number = row['cluster'] - - # Create a Pandas Series of the mean values of all numeric columns in uk_std_cluster_means, excluding rows where the cluster column equals cluster_number. - other_clusters_means = uk_std_cluster_means[uk_std_cluster_means['cluster'] != cluster_number].select_dtypes(include='number').mean() + cluster_number = row["cluster"] + + # Create a Pandas Series of the mean values of all numeric columns in uk_std_cluster_means, + # excluding rows where the cluster column equals cluster_number. 
+        other_clusters_means = (
+            uk_std_cluster_means[uk_std_cluster_means["cluster"] != cluster_number]
+            .select_dtypes(include="number")
+            .mean()
+        )

-        # Calculate the difference between the cluster's values and the other clusters' means (which excludes the row of the current cluster_number)
-        differences = row.drop('cluster') - other_clusters_means
+        # Calculate the difference between the cluster's values and the other clusters' means
+        # (which excludes the row of the current cluster_number)
+        differences = row.drop("cluster") - other_clusters_means

        # Sort variables by the absolute difference in descending order
-        # The variable at the top of the list will then have the greatest difference between the current cluster and the other clusters
+        # The variable at the top of the list will then have the greatest difference between the
+        # current cluster and the other clusters
        sorted_differences = differences.abs().sort_values(ascending=False)

        # Select the top N variables with the greatest difference
        variables_with_greatest_difference = sorted_differences.head(top_n)
-
+
        # Print the results for the cluster
        print(f"Cluster {cluster_number}")
        if cluster_number is not None:
            for output in cluster_info:
                if f"Cluster {cluster_number}" in output:
                    print(output)
-            print(f"""Values in the brackets below are the difference between the mean of the variable for this cluster
-            compared with the mean of the other clusters combined. The population of cluster {cluster_number} has a:""")
+            print(
+                "Values in the brackets below are the difference between the mean of the "
+                "variable for this cluster compared with the mean of the other clusters "
+                f"combined. The population of cluster {cluster_number} has a:"
+            )

        for variable in variables_with_greatest_difference.index:
            # Remove anything in brackets from the variable name
-            variable_name = re.sub(r'\(.*?\)', '', variable).strip()
+            variable_name = re.sub(r"\(.*?\)", "", variable).strip()

            # Determine if the difference is "higher" or "lower"
-            if differences[variable] > 0:
-                difference_status = "higher"
-            else:
-                difference_status = "lower"
-
+            difference_status = "higher" if differences[variable] > 0 else "lower"
+
            # Extract the "V" followed by two digits using regex
-            match = re.search(r'v\d{2}', variable)
+            match = re.search(r"v\d{2}", variable)
            if match:
                v_code = match.group(0)  # Extracted code (e.g., "v22")
-
+
                # Find the first row in the lookup table where the code matches
-                domain_value = lookup_df.loc[lookup_df['new_code'].str.contains(v_code, na=False), 'domain'].head(1)
-
+                domain_value = lookup_df.loc[
+                    lookup_df["new_code"].str.contains(v_code, na=False), "domain"
+                ].head(1)
+
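# [Illustrative aside, not part of the patch] The driver-ranking logic above reduces
# to sorting mean differences by absolute size; on made-up numbers:

import pandas as pd

row = pd.Series({"v01": 1.2, "v02": -0.1, "v03": 0.4})
other_means = pd.Series({"v01": 0.2, "v02": 0.3, "v03": 0.5})

differences = row - other_means
# Largest absolute gaps first: the variables that most set this cluster apart
print(differences.abs().sort_values(ascending=False).head(2).index.tolist())  # ['v01', 'v02']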
households comprised of {variable_name}" + if "Household composition" in table_name_value + else "proportion of people who live in a communal establishment" + if "Residency type" in table_name_value + else "proportion of people whose address one year ago is the " + + "same as the address of enumeration" + if "Migrant Indicator" in table_name_value + else f"proportion of people who are {variable_name}" + if "Age structure" in table_name_value + or "Legal partnership status" in table_name_value + else f"proportion of people with a country of birth in {variable_name}" + if "Country of birth" in table_name_value + else f"{variable_name}" + if "Population density" in table_name_value + else f"people {variable_name}" ), "Labour Market": lambda table_name_value, variable_name: ( - f"proportion of people working jobs which are {variable_name}" if "hours worked" in table_name_value else - "proportion of full-time students" if "NS-SeC" in table_name_value else - f"proportion of people who work in {variable_name.lstrip('0123456789. ').strip()}" if "occupation" in table_name_value else - f"proportion of people who work in {variable_name}" + f"proportion of people working jobs which are {variable_name}" + if "hours worked" in table_name_value + else "proportion of full-time students" + if "NS-SeC" in table_name_value + else "proportion of people who work in " + + "{variable_name.lstrip('0123456789. ').strip()}" + if "occupation" in table_name_value + else f"proportion of people who work in {variable_name}" ), "Health, Disability and Unpaid Care": lambda table_name_value, variable_name: ( - variable_name if "Disability" in table_name_value else - f"proportion of people who provide unpaid care" if "Provision of unpaid care" in table_name_value else - f"proportion of people {variable_name}" + variable_name + if "Disability" in table_name_value + else "proportion of people who provide unpaid care" + if "Provision of unpaid care" in table_name_value + else f"proportion of people {variable_name}" ), "Housing": lambda table_name_value, variable_name: ( - f"proportion of people who live in a flat" if "Accommodation type" in table_name_value and "flat" in variable_name.lower() else - f"proportion of people living in a {variable_name}" if "Accommodation type" in table_name_value else - f"proportion of dwellings which are {variable_name}" if "Occupancy rating for rooms" in table_name_value else - f"proportion of people who own {variable_name}" if "Car or van availability" in table_name_value else - f"proportion of people living in {variable_name} accommodation" if "Tenure" in table_name_value else - f"proportion of people {variable_name}" + "proportion of people who live in a flat" + if "Accommodation type" in table_name_value + and "flat" in variable_name.lower() + else f"proportion of people living in a {variable_name}" + if "Accommodation type" in table_name_value + else f"proportion of dwellings which are {variable_name}" + if "Occupancy rating for rooms" in table_name_value + else f"proportion of people who own {variable_name}" + if "Car or van availability" in table_name_value + else f"proportion of people living in {variable_name} accommodation" + if "Tenure" in table_name_value + else f"proportion of people {variable_name}" ), - "Ethnicity, Identity, Language and Religion": lambda table_name_value, variable_name: ( - f"proportion of people who are {variable_name}" if "Ethnic group" in table_name_value else - f"proportion of households where all household members have the same ethnic group" if "Multiple 
ethnic group" in table_name_value else - f"proportion of whose religion is {variable_name}" if "Religion" in table_name_value else - f"proportion of people who {variable_name}" if "Proficient in English" in table_name_value else - f"proportion of people {variable_name}" + "Ethnicity, Identity, Language and Religion": lambda table_name_value, + variable_name: ( + f"proportion of people who are {variable_name}" + if "Ethnic group" in table_name_value + else "proportion of households where all household members" + + " have the same ethnic group" + if "Multiple ethnic group" in table_name_value + else f"proportion of whose religion is {variable_name}" + if "Religion" in table_name_value + else f"proportion of people who {variable_name}" + if "Proficient in English" in table_name_value + else f"proportion of people {variable_name}" ), "Education": lambda table_name_value, variable_name: ( - f"proportion of people whose highest level of qualification is {variable_name}" - ) + "proportion of people whose highest level of " + + "qualification is {variable_name}" + ), } # Check if a match is found and print the domain-specific message and variance value if not domain_value.empty: domain = domain_value.iloc[0] # Retrieve the table_name value for the specific variable - table_name_value = lookup_df.loc[lookup_df['new_code'].str.contains(v_code, na=False), 'table_name'].head(1).iloc[0] - + table_name_value = ( + lookup_df.loc[ + lookup_df["new_code"].str.contains(v_code, na=False), "table_name" + ] + .head(1) + .iloc[0] + ) + # Convert cluster_number to string to match the index type cluster_number_str = str(cluster_number) - #cluster_number_int = int(cluster_number_str) # if running through main hash this! + # if running through main hash the line below! + # cluster_number_int = int(cluster_number_str) - variance_value = variance_df.loc[cluster_number_str, v_code] # if running through main un hash - # variance_value = variance_df.loc[cluster_number_int, v_code] # if running through main hash this! + variance_value = variance_df.loc[ + cluster_number_str, v_code + ] # if running through main un hash / if running through main hash this! + # variance_value = variance_df.loc[cluster_number_int, v_code] # Generate the specific message based on the domain logic if domain in domain_logic: specific_message = domain_logic[domain](table_name_value, variable_name) - message = f"• {difference_status} ({differences[variable]:.3f}) {specific_message}. Variance:{variance_value:.3f} ({domain} domain)" + message = ( + f"• {difference_status} ({differences[variable]:.3f}) " + f"{specific_message}. " + f"Variance:{variance_value:.3f} ({domain} domain)" + ) print(message) else: # Default message for unrecognized domains message = f"Domain {domain} not recognized for variable {variable_name}." 
- print("-" * 40) \ No newline at end of file + print("-" * 40) diff --git a/area_classification/post_processing/cluster_table_restructure.py b/area_classification/post_processing/cluster_table_restructure.py index d5990d9..456bbfe 100644 --- a/area_classification/post_processing/cluster_table_restructure.py +++ b/area_classification/post_processing/cluster_table_restructure.py @@ -1,14 +1,21 @@ # Restructuring of cluster assignments table import os + import pandas as pd + from area_classification.utilities.load_config import load_config -config = load_config('area_classification/config.yaml') -def cluster_table_restructure(config, clustering_output, split_column, keep_column, standardised_data): +config = load_config("area_classification/config.yaml") + + +def cluster_table_restructure( + config, clustering_output, split_column, keep_column, standardised_data +): """ - Using the cluster output column one (LAD_codes) is kept, but column two containing cluster codes are - separated out into seperate columns for supergroup, group, and subgroup. The final character in the - subgroup column is then converted to a number (a=1, b=2, c=3, etc.). + Using the cluster output column one (LAD_codes) is kept, but column two containing + cluster codes are separated out into seperate columns for supergroup, group, and + subgroup. The final character in the subgroup column is then converted to a number + (a=1, b=2, c=3, etc.). Parameters ---------- @@ -19,28 +26,31 @@ def cluster_table_restructure(config, clustering_output, split_column, keep_colu keep_column : str The column header which will be kept in the final output clustering_output : pd.DataFrame - DataFrame of cluster assignments which have been output from running the clustering algorithm. + DataFrame of cluster assignments which have been output from running the + clustering algorithm. Data will have the following format: - - LAD_code | subsub cluster + + LAD_code | subsub cluster ---------------------------- S12000005 | 1ca - + Returns ------- tuple of pd.DataFrame (restructured_cluster_table, restructured_cluster_table_long) - - restructured_cluster_table: DataFrame with LAD_code, supergroup, group, subgroup, and LAD_name. - - restructured_cluster_table_long: Merged DataFrame with standardised data for summaries. + - restructured_cluster_table: DataFrame with LAD_code, supergroup, group, + subgroup, and LAD_name. + - restructured_cluster_table_long: Merged DataFrame with standardised + data for summaries. 
""" # Reset the LAD_codes column so it is no longer an index and can be used to merge a table df = clustering_output.reset_index() - - # Change the cluster number 0 to 6 (Python indexes to 0, but for cluster number we need 1 to 6) - for col in ['cluster', 'subcluster', 'subsubcluster']: + + # Change the cluster number 0 to 6 (Python indexes to 0, but for cluster number we need 1 to 6) + for col in ["cluster", "subcluster", "subsubcluster"]: if col in df.columns: - df[col] = df[col].astype(str).str.replace(r'^0', '6', regex=True) + df[col] = df[col].astype(str).str.replace(r"^0", "6", regex=True) # Check if the specified columns exist if keep_column not in df.columns: @@ -60,14 +70,22 @@ def cluster_table_restructure(config, clustering_output, split_column, keep_colu def convert_final_char_to_number(value): if pd.notna(value) and len(value) > 0: final_char = value[-1].lower() - if 'a' <= final_char <= 'z': # Check if it's a letter - return value[:-1] + str(ord(final_char) - ord('a') + 1) + if "a" <= final_char <= "z": # Check if it's a letter + return value[:-1] + str(ord(final_char) - ord("a") + 1) return value subgroup = subgroup.apply(convert_final_char_to_number) # Combine the kept column with the processed columns - restructured_cluster_table = pd.concat([kept_data, supergroup.rename('supergroup'), group.rename('group'), subgroup.rename('subgroup')], axis=1) + restructured_cluster_table = pd.concat( + [ + kept_data, + supergroup.rename("supergroup"), + group.rename("group"), + subgroup.rename("subgroup"), + ], + axis=1, + ) # Load the LAD lookup file into a DataFrame lad_lookup_file_path = config["LAD_lookup_file_path"] @@ -75,29 +93,32 @@ def convert_final_char_to_number(value): # Merge with LAD names from the lookup file restructured_cluster_table = restructured_cluster_table.merge( - lad_lookup[['LAD22CD', 'LAD22NM']], - left_on='LAD_code', - right_on='LAD22CD', - how='left' + lad_lookup[["LAD22CD", "LAD22NM"]], left_on="LAD_code", right_on="LAD22CD", how="left" ) # Drop the LAD22CD column after the join if it's no longer needed - restructured_cluster_table = restructured_cluster_table.drop(columns=['LAD22CD']) + restructured_cluster_table = restructured_cluster_table.drop(columns=["LAD22CD"]) # Rename the LAD22NM column to LAD_name - restructured_cluster_table = restructured_cluster_table.rename(columns={'LAD22NM': 'LAD_name'}) + restructured_cluster_table = restructured_cluster_table.rename(columns={"LAD22NM": "LAD_name"}) # Move the LAD_name column to the first position - columns = ['LAD_name'] + [col for col in restructured_cluster_table.columns if col != 'LAD_name'] + columns = ["LAD_name"] + [col for col in restructured_cluster_table.columns if col != "LAD_name"] restructured_cluster_table = restructured_cluster_table[columns] # Save the resulting DataFrame to a new file - output_file = os.path.join(config["output_directory"], f"cluster_assignments/restructured_subclustering_output.csv") + output_file = os.path.join( + config["output_directory"], "cluster_assignments/restructured_subclustering_output.csv" + ) restructured_cluster_table.to_csv(output_file, index=False) # Create and save out restructured long table (for use in summaries) - restructured_cluster_table_long = pd.merge(restructured_cluster_table, standardised_data, on='LAD_code', how='inner') - output_file_long = os.path.join(config["output_directory"], f"cluster_assignments/restructured_subclustering_output_long.csv") + restructured_cluster_table_long = pd.merge( + restructured_cluster_table, standardised_data, 
on="LAD_code", how="inner" + ) + output_file_long = os.path.join( + config["output_directory"], "cluster_assignments/restructured_subclustering_output_long.csv" + ) restructured_cluster_table_long.to_csv(output_file_long, index=False) - return restructured_cluster_table, restructured_cluster_table_long \ No newline at end of file + return restructured_cluster_table, restructured_cluster_table_long diff --git a/area_classification/post_processing/cluster_variables_mean.py b/area_classification/post_processing/cluster_variables_mean.py index 934b94b..cacb53c 100644 --- a/area_classification/post_processing/cluster_variables_mean.py +++ b/area_classification/post_processing/cluster_variables_mean.py @@ -1,30 +1,32 @@ # Cluster variables mean averages -import pandas as pd import os +import pandas as pd + + def cluster_variable_means(config, restructured_cluster_table, standardised_data): """ - Calculates the mean of each variable for different hierarchical clusters (supergroup, group, subgroup), - and outputs the results in a structured format. - + Calculates the mean of each variable for different hierarchical clusters + (supergroup, group, subgroup), and outputs the results in a structured format. + Parameters ---------- config : dict Configuration dictionary containing the filepath and name to the cluster data - + restructured_cluster_table_df : pd.DataFrame DataFrame of cluster assignments. Data will have the following format: - + LAD_name | LAD_code | supergroup| group | subgroup ------------------------------------------- Hartlepool | E06000001 | 1 | 1c | 1c1 - + standardised_data : pd.DataFrame DataFrame containing standardised variable values for each LAD_code. Returns ------- - pd.DataFrame + pd.DataFrame Dataframe containing the mean of each variable for each cluster, structured as: Cluster_code | Hierarchy_level | variable_name | variable_mean ----------------------------------------------------------------- @@ -32,27 +34,41 @@ def cluster_variable_means(config, restructured_cluster_table, standardised_data 1a | group | TS001 | 120.0 1a1 | subgroup | TS001 | 130.0 """ - + # Merge cluster results with standardised means census data merged_data = pd.merge(restructured_cluster_table, standardised_data, on="LAD_code", how="left") - # Reshape from wide to long format to create one variable_name column (rather than 61 columns, one for each) - long_data = pd.melt(merged_data, id_vars=["LAD_code", "LAD_name", "supergroup", "group", "subgroup"], - var_name="variable_name", value_name="variable_value") - - # Reshape to even longer by making one hierarchy_level column (rather than supergroup, group, subgroup) - long_data = pd.melt(long_data, id_vars=["LAD_code", "LAD_name", "variable_name", "variable_value"], - value_vars=["supergroup", "group", "subgroup"], - var_name="hierarchy_level", value_name="cluster") - + # Reshape from wide to long format to create one variable_name column + # (rather than 61 columns, one for each) + long_data = pd.melt( + merged_data, + id_vars=["LAD_code", "LAD_name", "supergroup", "group", "subgroup"], + var_name="variable_name", + value_name="variable_value", + ) + + # Reshape to even longer by making one hierarchy_level column (rather than + # supergroup, group, subgroup) + long_data = pd.melt( + long_data, + id_vars=["LAD_code", "LAD_name", "variable_name", "variable_value"], + value_vars=["supergroup", "group", "subgroup"], + var_name="hierarchy_level", + value_name="cluster", + ) + # Group by cluster and variable, and calculate the mean - uk_std_cluster_means 
= long_data.groupby(["variable_name", "hierarchy_level", "cluster"]).mean("variable_value").reset_index() + uk_std_cluster_means = ( + long_data.groupby(["variable_name", "hierarchy_level", "cluster"]) + .mean("variable_value") + .reset_index() + ) # Pivot the data to create the wide format - uk_std_cluster_means = uk_std_cluster_means.pivot(index=["cluster", "hierarchy_level"], - columns="variable_name", - values="variable_value").reset_index() - + uk_std_cluster_means = uk_std_cluster_means.pivot( + index=["cluster", "hierarchy_level"], columns="variable_name", values="variable_value" + ).reset_index() + # Create the 'std_means' folder in the output directory std_means_directory = os.path.join(config["output_directory"], "std_means") os.makedirs(std_means_directory, exist_ok=True) @@ -66,5 +82,5 @@ def cluster_variable_means(config, restructured_cluster_table, standardised_data # Save the output as a CSV file uk_std_cluster_means.to_csv(output_file_path, index=False) - - return uk_std_cluster_means \ No newline at end of file + + return uk_std_cluster_means
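cluster_variable_means above rests on a double melt, a grouped mean, and a pivot back to wide format. A self-contained sketch of that reshape on toy data (the column names mirror the pipeline; the values are made up):

import pandas as pd

# Toy stand-in for the merged cluster table used by cluster_variable_means.
merged = pd.DataFrame(
    {
        "LAD_code": ["E06000001", "E06000002"],
        "LAD_name": ["Hartlepool", "Middlesbrough"],
        "supergroup": ["1", "1"],
        "group": ["1a", "1b"],
        "subgroup": ["1a1", "1b1"],
        "v01": [10.0, 20.0],
        "v02": [1.0, 3.0],
    }
)

# Melt 1: one row per (area, variable).
long_data = pd.melt(
    merged,
    id_vars=["LAD_code", "LAD_name", "supergroup", "group", "subgroup"],
    var_name="variable_name",
    value_name="variable_value",
)

# Melt 2: one row per (area, variable, hierarchy level).
long_data = pd.melt(
    long_data,
    id_vars=["LAD_code", "LAD_name", "variable_name", "variable_value"],
    value_vars=["supergroup", "group", "subgroup"],
    var_name="hierarchy_level",
    value_name="cluster",
)

# Mean per (variable, level, cluster), then pivot back to one column per variable.
# Selecting the value column explicitly avoids relying on numeric_only coercion.
means = (
    long_data.groupby(["variable_name", "hierarchy_level", "cluster"])["variable_value"]
    .mean()
    .reset_index()
)
wide = means.pivot(
    index=["cluster", "hierarchy_level"], columns="variable_name", values="variable_value"
).reset_index()
print(wide)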
diff --git a/area_classification/post_processing/create_radial_plots.py b/area_classification/post_processing/create_radial_plots.py index 6d8291f..2fd3d1c 100644 --- a/area_classification/post_processing/create_radial_plots.py +++ b/area_classification/post_processing/create_radial_plots.py @@ -1,16 +1,21 @@ # Creation of radial plots import os + +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt -def create_radial_plots_wrapper(config, uk_std_cluster_means, combined_group_means, combined_subgroup_means): + +def create_radial_plots_wrapper( + config, uk_std_cluster_means, combined_group_means, combined_subgroup_means +): """ Wrapper function to create radial plots for UK clusters and parent clusters. - Radial plots for parent clusters represent difference from parent cluster standardised means. - Radial plots for the area classification clusters to represent difference from UK standardised means. + Radial plots for parent clusters represent difference from parent cluster + standardised means. Radial plots for the area classification clusters + represent difference from UK standardised means.
Parameters ---------- @@ -25,16 +30,22 @@ def create_radial_plots_wrapper(config, uk_std_cluster_means, combined_group_mea """ # Create radial plots for supergroups, groups and subgroups against UK - create_radial_plots(config, uk_std_cluster_means, level="UK", domain_colours = config['domain_colours']) - + create_radial_plots( + config, uk_std_cluster_means, level="UK", domain_colours=config["domain_colours"] + ) + # Create radial plots for groups against their parent (groups) - create_radial_plots(config, combined_group_means, level="group", domain_colours = config['domain_colours']) + create_radial_plots( + config, combined_group_means, level="group", domain_colours=config["domain_colours"] + ) # Create radial plots for subgroups against their parent (groups) - create_radial_plots(config, combined_subgroup_means, level="subgroup", domain_colours = config['domain_colours']) + create_radial_plots( + config, combined_subgroup_means, level="subgroup", domain_colours=config["domain_colours"] + ) # Create legends - legend_creation (config, config['domain_colours']) + legend_creation(config, config["domain_colours"]) def create_radial_plots(config, dataframe, level, domain_colours): @@ -44,18 +55,18 @@ def create_radial_plots(config, dataframe, level, domain_colours): Parameters ---------- config : dict - Configuration dictionary containing settings for the plotting. + Configuration dictionary containing settings for the plotting. dataframe : DataFrame The input DataFrame (either combined_group_means or combined_subgroup_means). level : str Either 'group' or 'subgroup' to indicate the type of data. domain_colours : Dictionary - The list of colours used for the domains. + Mapping of domain names to the colours used for the domains. """ # Load lookup for variable labels and domains - lookup = pd.read_csv(config['select_variables_lookup']) - label_dict = lookup.set_index('new_code')['radial_plot_label'].to_dict() - domain_dict = lookup.set_index('new_code')['domain'].to_dict() + lookup = pd.read_csv(config["select_variables_lookup"]) + label_dict = lookup.set_index("new_code")["radial_plot_label"].to_dict() + domain_dict = lookup.set_index("new_code")["domain"].to_dict() # Output directories parent_dir = os.path.join(config["radial_plot_directory"], "parent_cluster_radial_plots") @@ -77,31 +88,45 @@ def create_radial_plots(config, dataframe, level, domain_colours): ax.set_xticks([]) ax.set_yticks([]) ax.set_ylim(-3, 4.3) - # Make the outer polar axis border (spine) transparent - ax.spines['polar'].set_visible(False) + # Make the outer polar axis border (spine) transparent + ax.spines["polar"].set_visible(False) # Draw colored domain segments for i, col in enumerate(feature_cols): domain = domain_dict.get(col, "Other") color = domain_colours.get(domain, "black") - ax.fill([ - angles[i], angles[i+1], angles[i+1], angles[i] - # Set where the colored segments start (3.25) and end (3.5) on the y-axis - ], [3.35, 3.35, 3.6, 3.6], color=color) + ax.fill( + [ + angles[i], + angles[i + 1], + angles[i + 1], + angles[i], + # Set where the colored segments start (3.35) and end (3.6) on the y-axis + ], + [3.35, 3.35, 3.6, 3.6], + color=color, + ) # Draw grid and radial lines for angle in angles[:-1]: - ax.plot([angle, angle], [-3, 3], color='grey', linewidth=0.8, linestyle='solid') + ax.plot([angle, angle], [-3, 3], color="grey", linewidth=0.8, linestyle="solid") # set the scale for y axis ax.set_yticks([-3, -2, -1, 0, 1, 2, 3]) - ax.yaxis.set_tick_params(width=1.0, color='grey', size=5, labelsize=16) - ax.grid(color='grey', 
linestyle='solid', linewidth=1.0, alpha=0.4) + ax.yaxis.set_tick_params(width=1.0, color="grey", size=5, labelsize=16) + ax.grid(color="grey", linestyle="solid", linewidth=1.0, alpha=0.4) # User-defined radii for top 6 and bottom 6 labels (edit these as needed) top_label_radii = [4.6, 4.6, 4.4, 4.4, 4.2, 4.2] # Closest to top (angle=0), then next, etc. - top_tick_ends = [r - 0.25 for r in top_label_radii] # [5.0, 5.0, 4.5, 4.5, 4.0, 4.0] - bottom_label_radii = [4.6, 4.6, 4.4, 4.4, 4.2, 4.2] # Closest to bottom (angle=pi), then next, etc. - bottom_tick_ends = [r - 0.25 for r in bottom_label_radii] # [5.0, 5.0, 4.5, 4.5, 4.0, 4.0] + top_tick_ends = [r - 0.25 for r in top_label_radii] # [4.35, 4.35, 4.15, 4.15, 3.95, 3.95] + bottom_label_radii = [ + 4.6, + 4.6, + 4.4, + 4.4, + 4.2, + 4.2, + ] # Closest to bottom (angle=pi), then next, etc. + bottom_tick_ends = [r - 0.25 for r in bottom_label_radii] # [4.35, 4.35, 4.15, 4.15, 3.95, 3.95] default_label_radius = 4.0 default_tick_end = 3.8 @@ -111,11 +136,18 @@ def create_radial_plots(config, dataframe, level, domain_colours): top_indices = set(idx for idx, _ in top_indices_sorted) # Bottom 6: closest to 3π/2 (270°), but exclude any already in top_indices bottom_candidates = [item for item in angle_label_indices if item[0] not in top_indices] - bottom_indices_sorted = sorted(bottom_candidates, key=lambda x: abs(x[1] - (3 * np.pi / 2)))[:6] + bottom_indices_sorted = sorted(bottom_candidates, key=lambda x: abs(x[1] - (3 * np.pi / 2)))[ + :6 + ] # Build maps for radii/tick_end - top_map = {idx: (r, t) for (idx, _), r, t in zip(top_indices_sorted, top_label_radii, top_tick_ends)} - bottom_map = {idx: (r, t) for (idx, _), r, t in zip(bottom_indices_sorted, bottom_label_radii, bottom_tick_ends)} + top_map = { + idx: (r, t) for (idx, _), r, t in zip(top_indices_sorted, top_label_radii, top_tick_ends) + } + bottom_map = { + idx: (r, t) + for (idx, _), r, t in zip(bottom_indices_sorted, bottom_label_radii, bottom_tick_ends) + } for i, col in enumerate(feature_cols): angle = angles[i] @@ -129,26 +161,54 @@ def create_radial_plots(config, dataframe, level, domain_colours): tick_end = default_tick_end tick_start = 3.35 - ax.plot([angle, angle], [tick_start, tick_end], color='black', linewidth=2.0, zorder=10) - ha = 'left' if 0 <= angle < np.pi / 2 or 3 * np.pi / 2 <= angle < 2 * np.pi else 'right' - ax.text(angle, label_radius, label, fontsize=18, color="black", ha=ha, va='center', zorder=11) + ax.plot([angle, angle], [tick_start, tick_end], color="black", linewidth=2.0, zorder=10) + ha = "left" if 0 <= angle < np.pi / 2 or 3 * np.pi / 2 <= angle < 2 * np.pi else "right" + ax.text( + angle, label_radius, label, fontsize=18, color="black", ha=ha, va="center", zorder=11 + ) # Plot the data line and set title/filename # colour of the plotted line is blue for groups, green for subgroups and black for UK if level == "group": - ax.plot(angles, [0] * len(angles), color='black', linewidth=1.0, linestyle='solid', label='Zero Line') - ax.plot(angles, values, color='blue', linewidth=1.5, linestyle='solid') - ax.set_title(f"{row[level]} {level} (supergroup mean)", size=26, pad=80, weight='bold') + ax.plot( + angles, + [0] * len(angles), + color="black", + linewidth=1.0, + linestyle="solid", + label="Zero Line", + ) + ax.plot(angles, values, color="blue", linewidth=1.5, linestyle="solid") + ax.set_title(f"{row[level]} {level} (supergroup mean)", size=26, pad=80, weight="bold") plot_path = os.path.join(parent_dir, f"{row[level]}_{level}.png") elif level == "subgroup": - 
ax.plot(angles, [0] * len(angles), color='blue', linewidth=1.0, linestyle='solid', label='Zero Line') - ax.plot(angles, values, color='green', linewidth=1.5, linestyle='solid') - ax.set_title(f"{row[level]} {level} (group mean)", size=26, pad=80, weight='bold') + ax.plot( + angles, + [0] * len(angles), + color="blue", + linewidth=1.0, + linestyle="solid", + label="Zero Line", + ) + ax.plot(angles, values, color="green", linewidth=1.5, linestyle="solid") + ax.set_title(f"{row[level]} {level} (group mean)", size=26, pad=80, weight="bold") plot_path = os.path.join(parent_dir, f"{row[level]}_{level}.png") elif level == "UK": - ax.plot(angles, [0] * len(angles), color='red', linewidth=1.0, linestyle='solid', label='Zero Line') - ax.plot(angles, values, color='black', linewidth=1.5, linestyle='solid') - ax.set_title(f"{row['cluster']} {row['hierarchy_level']} (UK mean)", size=26, pad=80, weight='bold') + ax.plot( + angles, + [0] * len(angles), + color="red", + linewidth=1.0, + linestyle="solid", + label="Zero Line", + ) + ax.plot(angles, values, color="black", linewidth=1.5, linestyle="solid") + ax.set_title( + f"{row['cluster']} {row['hierarchy_level']} (UK mean)", + size=26, + pad=80, + weight="bold", + ) plot_path = os.path.join(uk_dir, f"{row['cluster']}_{row['hierarchy_level']}.png") plt.savefig(plot_path) @@ -159,7 +219,8 @@ def create_radial_plots(config, dataframe, level, domain_colours): elif level in ["group", "subgroup"]: print(f"Parent cluster radial plots saved in: {parent_dir}") -def legend_creation (config, domain_colours): + +def legend_creation(config, domain_colours): """ Function to create pngs of the legend for both the radial plot domains and the lines on the radial plots. @@ -169,37 +230,37 @@ def legend_creation (config, domain_colours): config : dict config dictionary containing filepaths. domain_colours : Dictionary - The list of colours used for the domains. + Mapping of domain names to the colours used for the domains. 
""" # Create the domain legend fig, ax = plt.subplots(figsize=(1, 3)) - ax.axis('off') + ax.axis("off") y = 0.1 for domain, color in domain_colours.items(): - ax.scatter(0.013, y, s=300, color=color, marker='s') - ax.text(0.0134, y, domain, va='center', fontsize=14) + ax.scatter(0.013, y, s=300, color=color, marker="s") + ax.text(0.0134, y, domain, va="center", fontsize=14) y -= 0.13 domain_colour_filepath = os.path.join(config["radial_plot_directory"], "Domain key.png") - plt.savefig(domain_colour_filepath, bbox_inches='tight', dpi=150) + plt.savefig(domain_colour_filepath, bbox_inches="tight", dpi=150) plt.close(fig) - #Create the line legend + # Create the line legend # Define the line types and their colours line_info = [ ("UK mean", "#FA0000"), ("Group mean", "blue"), ("Subgroup mean", "green"), - ("Supergroup mean", "black") + ("Supergroup mean", "black"), ] fig, ax = plt.subplots(figsize=(2, 2)) - ax.axis('off') + ax.axis("off") y_positions = [0.8, 0.6, 0.4, 0.2] for (label, color), y in zip(line_info, y_positions): ax.plot([0.1, 0.11], [y, y], color=color, linewidth=6) - ax.text(0.111, y, label, va='center', fontsize=14) + ax.text(0.111, y, label, va="center", fontsize=14) line_colour_filepath = os.path.join(config["radial_plot_directory"], "Line colour.png") - plt.savefig(line_colour_filepath, bbox_inches='tight', dpi=200) - plt.close(fig) \ No newline at end of file + plt.savefig(line_colour_filepath, bbox_inches="tight", dpi=200) + plt.close(fig) diff --git a/area_classification/post_processing/horizontal_bar_chart.py b/area_classification/post_processing/horizontal_bar_chart.py index 1943c8c..de7b417 100644 --- a/area_classification/post_processing/horizontal_bar_chart.py +++ b/area_classification/post_processing/horizontal_bar_chart.py @@ -1,14 +1,24 @@ # This script creates horizontal bar charts -import matplotlib.pyplot as plt -from matplotlib.patches import Rectangle -from matplotlib import gridspec -import matplotlib.gridspec as gridspec import os +import textwrap + +import matplotlib.font_manager as fm +import matplotlib.pyplot as plt import pandas as pd -from area_classification.utilities.load_config import load_config +from matplotlib import gridspec + +# Download the Open Sans font from https://fonts.google.com/specimen/Open+Sans +# Manually save the downloaded font to font_path: +font_path = "data/output_data/bar_charts/Open_Sans/OpenSans-VariableFont_wdth,wght.ttf" +# Add the font to matplotlib +fm.fontManager.addfont(font_path) +plt.rcParams["font.family"] = "Open Sans" + -def create_bar_charts_wrapper(config, uk_std_cluster_means, combined_group_means, combined_subgroup_means): +def create_bar_charts_wrapper( + config, uk_std_cluster_means, combined_group_means, combined_subgroup_means +): """ Wrapper function to create horizional bar charts. 
@@ -29,105 +39,14 @@ def create_bar_charts_wrapper(config, uk_std_cluster_means, combined_group_means Created bar charts are saved to the 'bar_charts' folder """ - - # Create horizontal bar charts for supergroups, groups and subgroups against UK - horizontal_bar_charts(config, uk_std_cluster_means, level="UK") - - # Create horizontal bar charts for groups and subgroups against their parent (groups) - horizontal_bar_charts(config, combined_group_means, level="group") - horizontal_bar_charts(config, combined_subgroup_means, level="subgroup") - # Create small multiples for supergroups, groups and subgroups against UK - small_multiples(config, uk_std_cluster_means, level = "UK", domain_col = "domain") + small_multiples(config, uk_std_cluster_means, level="UK", domain_col="domain") # Create small multiples for groups and subgroups against their parent (groups) - small_multiples(config, combined_group_means, level = "group", domain_col = "domain") - small_multiples(config, combined_subgroup_means, level = "subgroup", domain_col = "domain") - - - -def horizontal_bar_charts(config, dataframe, level): - """ - Create horizontal bar charts for a given dataframe. - - Parameters - ---------- - config : dict - Configuration dictionary containing settings for the plotting. - dataframe : DataFrame - The input DataFrame (either combined_group_means or combined_subgroup_means). - level : str - Either 'group' or 'subgroup' to indicate the type of data. - - """ - # Load lookup for variable labels and domains - lookup = pd.read_csv(config['select_variables_lookup']) - label_dict = lookup.set_index('new_code')['radial_plot_label'].to_dict() - domain_dict = lookup.set_index('new_code')['domain'].to_dict() - - v01_index = dataframe.columns.get_loc("v01") - categories = list(dataframe.columns[v01_index:]) - category_domains = {cat: domain_dict.get(cat, None) for cat in categories} - # Use label_dict to get y-axis labels - y_labels = [label_dict.get(cat, cat) for cat in categories] + small_multiples(config, combined_group_means, level="group", domain_col="domain") + small_multiples(config, combined_subgroup_means, level="subgroup", domain_col="domain") - # Output directories - bar_parent_dir = os.path.join(config["bar_chart_directory"], "parent_cluster_bar_charts") - bar_uk_dir = os.path.join(config["bar_chart_directory"], "uk_bar_charts") - os.makedirs(bar_parent_dir, exist_ok=True) - os.makedirs(bar_uk_dir, exist_ok=True) - - for idx, row in dataframe.iterrows(): - values = row[categories].tolist() - - fig = plt.figure(figsize=(12, 8)) - gs = gridspec.GridSpec(1, 2, width_ratios=[0.3, 5], wspace=0.05) - - # Color strip axis - ax2 = fig.add_subplot(gs[0]) - ax2.set_ylim(-0.5, len(categories) - 0.5) - ax2.set_xlim(0, 1) - ax2.axis('off') - - # Draw colored strips for each domain group - current_domain = None - start_idx = 0 - for i, cat in enumerate(categories + [None]): - domain = category_domains.get(cat) if cat else None - if domain != current_domain: - if current_domain is not None and current_domain in config['domain_colours']: - ax2.add_patch(Rectangle( - (0.7, start_idx - 0.4), - 0.9, - i - start_idx, - facecolor=config['domain_colours'][current_domain], - edgecolor='none' - )) - current_domain = domain - start_idx = i - - # Main bar chart axis - ax = fig.add_subplot(gs[1], sharey=ax2) - ax.set_xlim(-3, 3) - ax.barh(y_labels, values, color='#206095') - ax.axvline(0, color='grey', linewidth=2, linestyle='--', zorder=2) - ax.tick_params(axis='y', pad=30) # Move y-axis labels further left - 
ax.set_xlabel('Values') - plt.tight_layout() - - # Plot the data line and set title/filename - if level == "group": - ax.set_title(f"{row[level]} {level} (supergroup mean)", size=26, pad=80, weight='bold') - plot_path = os.path.join(bar_parent_dir, f"{row[level]}_{level}.png") - elif level == "subgroup": - ax.set_title(f"{row[level]} {level} (group mean)", size=26, pad=80, weight='bold') - plot_path = os.path.join(bar_parent_dir, f"{row[level]}_{level}.png") - elif level == "UK": - ax.set_title(f"{row['cluster']} {row['hierarchy_level']} (UK mean)", size=26, pad=80, weight='bold') - plot_path = os.path.join(bar_uk_dir, f"{row['cluster']}_{row['hierarchy_level']}.png") - - # Save the plot - fig.savefig(plot_path, bbox_inches='tight', dpi=150) - plt.close(fig) + # Save the data used in the bar charts into a data table + bar_chart_data_table() def small_multiples(config, dataframe, level, domain_col): @@ -138,51 +57,65 @@ def small_multiples(config, dataframe, level, domain_col): Parameters ---------- config : dict - Configuration dictionary containing settings for the plotting. + Configuration dictionary containing settings for the plotting. dataframe : DataFrame The input DataFrame (either combined_group_means or combined_subgroup_means). level : str Either 'group' or 'subgroup' to indicate the type of data. domain_col : str - The list of colours used for the domains. + The lookup column containing each variable's domain. """ + # Load the lookup that maps cluster codes to cluster names + name_lookup = pd.read_csv("./data/output_data/Name_lookup.csv") + # Output directories - small_multiples_parent_dir = os.path.join(config["bar_chart_directory"], "parent_cluster_small_multiples") + small_multiples_parent_dir = os.path.join( + config["bar_chart_directory"], "parent_cluster_small_multiples" + ) small_multiples_uk_dir = os.path.join(config["bar_chart_directory"], "uk_small_multiples") os.makedirs(small_multiples_parent_dir, exist_ok=True) os.makedirs(small_multiples_uk_dir, exist_ok=True) v01_index = dataframe.columns.get_loc("v01") categories = list(dataframe.columns[v01_index:]) - lookup = pd.read_csv(config['select_variables_lookup']) - domain_dict = lookup.set_index('new_code')[domain_col].to_dict() - label_dict = lookup.set_index('new_code')['radial_plot_label'].to_dict() + lookup = pd.read_csv(config["select_variables_lookup"]) + domain_dict = lookup.set_index("new_code")[domain_col].to_dict() + label_dict = lookup.set_index("new_code")["radial_plot_label"].to_dict() desired_order = [ - "Demography and Migration", - "Labour Market", - "Ethnicity, Identity, Language and Religion", - "Housing", - "Health, Disability and Unpaid Care", - "Education" + "Demography and Migration", + "Labour Market", + "Ethnicity, Identity, Language and Religion", + "Housing", + "Health, Disability and Unpaid Care", + "Education", ] - + # Identify grouping column - if level == "group": - grouped = dataframe.groupby(level) - elif level == "subgroup": + if level == "group" or level == "subgroup": grouped = dataframe.groupby(level) elif level == "UK": grouped = dataframe.groupby("cluster") else: raise ValueError(f"Unknown level: {level}") - + for group_name, group_df in grouped: - fig = plt.figure(figsize=(18, 10)) - # Adjust the height of the small multiples (top row, middle row, bottom row) - gs = gridspec.GridSpec( 3, 2, height_ratios=[1.5, 1, 0.25], ) - axes = [fig.add_subplot(gs[i, j]) for i in range(3) for j in range(2)] + fig = plt.figure(figsize=(4, 20)) # 4 inches wide × 150 dpi = 600px + # fig = 
plt.figure(figsize=(18, 10)) + + # Calculate number of bars for each domain + num_bars_per_domain = [] + for domain in desired_order: + domain_cats = [cat for cat in categories if domain_dict.get(cat) == domain] + num_bars_per_domain.append(len(domain_cats)) + + # Set height ratios: 0.09 per bar (no minimum) + height_ratios = [0.09 * n for n in num_bars_per_domain] + + # Create GridSpec with dynamic height ratios + gs = gridspec.GridSpec(6, 1, height_ratios=height_ratios) + axes = [fig.add_subplot(gs[i, 0]) for i in range(6)] for i, domain in enumerate(desired_order): domain_cats = [cat for cat in categories if domain_dict.get(cat) == domain] @@ -190,23 +123,132 @@ def small_multiples(config, dataframe, level, domain_col): axes[i].set_visible(False) continue means = group_df[domain_cats].mean() - bar_colors = [config['domain_colours'].get(domain, '#206095')] * len(domain_cats) + axes[i].set_axisbelow(True) + axes[i].grid(axis="x", color="lightgrey", linestyle="-", linewidth=1) + # bar_colors = [config["domain_colours"].get(domain, "#206095")] * len(domain_cats) + bar_colors = ["#206095" if val >= 0 else "#f66068" for val in means] y_labels = [label_dict.get(cat, cat) for cat in domain_cats] - axes[i].barh(y_labels, means, color=bar_colors) - axes[i].set_title(domain) - axes[i].axvline(0, color='grey', linewidth=2, linestyle='--', zorder=2) + + # Split y-axis labels at 31 characters + def split_label(label, width=31): + return "\n".join(textwrap.wrap(label, width=width)) + + y_labels = [split_label(label, 31) for label in y_labels] + + # Horizontal gridlines + for y in range(len(y_labels)): + # 2pt dash, 2pt gap, butt cap + axes[i].axhline( + y, + color="#D9D9D9", + linewidth=1, + linestyle=(0, (2, 2)), + solid_capstyle="butt", + zorder=1, + ) + + # Make bars equal thickness + axes[i].barh(y_labels, means, color=bar_colors, height=0.6) + + # Remove outline box + for spine in axes[i].spines.values(): + spine.set_visible(False) + + # Subtitle formatting + axes[i].set_title(domain, fontsize=14, color="#414042", fontweight="bold", loc="left") + + # Vertical line + axes[i].axvline(0, color="#B3B3B3", linewidth=1.5, linestyle="-", zorder=2) + + # Set label formatting + # axes[i].set_xlabel("Value") + axes[i].set_yticks(range(len(y_labels))) + axes[i].set_yticklabels(y_labels, color="#414042", fontsize=8) + + # Remove x and y axis tick marks + axes[i].tick_params(axis="y", length=0) + axes[i].tick_params(axis="x", length=0, labelsize=8) + axes[i].tick_params(axis="x", colors="#707071") + # Set axis limits axes[i].set_xlim(-3, 3) - axes[i].set_xlabel('Value') - axes[i].set_yticklabels(y_labels, fontsize=8) + # Find the row where cluster_code == group_name + row = name_lookup[name_lookup["cluster_code"] == group_name] + # Take the cluster_name value for that row + cluster_name = row["cluster_name"].iloc[0] if level == "group": - plt.suptitle(f"{group_name} {level} (supergroup mean)", fontsize=16, weight='bold') - plot_path = os.path.join(small_multiples_parent_dir, f"{group_name}_{level}.png") + # plt.suptitle(f"{group_name} {level} (supergroup mean)", fontsize=16, weight="bold") + plot_path = os.path.join( + small_multiples_parent_dir, + f"Group {group_name} - {cluster_name} characteristics.png", + ) if level == "subgroup": - plt.suptitle(f"{group_name} {level} (group mean)", fontsize=16, weight='bold') - plot_path = os.path.join(small_multiples_parent_dir, f"{group_name}_{level}.png") + # plt.suptitle(f"{group_name} {level} (group mean)", fontsize=16, weight="bold") + 
plot_path = os.path.join( + small_multiples_parent_dir, + f"Subgroup {group_name} - {cluster_name} characteristics.png", + ) elif level == "UK": - plt.suptitle(f"{group_name} (UK mean)", fontsize=16, weight='bold') - plot_path = os.path.join(small_multiples_uk_dir, f"{group_name}.png") + # plt.suptitle(f"{group_name} (UK mean)", fontsize=16, weight="bold") + plot_path = os.path.join( + small_multiples_uk_dir, + f"Supergroup {group_name} - {cluster_name} characteristics.png", + ) plt.tight_layout(rect=[0, 0, 1, 0.95]) - plt.savefig(plot_path, bbox_inches='tight', dpi=150) - plt.close(fig) \ No newline at end of file + plt.subplots_adjust(left=0.18, right=1, top=0.98, bottom=0.05) + plt.savefig(plot_path, bbox_inches="tight", dpi=150) + plt.close(fig) + + +def bar_chart_data_table(): + """ + Export cluster means data used in bar charts to an Excel file with multiple sheets. + + Reads three CSV files containing cluster means data, filters the UK standard cluster means + to include only the Supergroup data, and combines this with the other two CSVs to + create an Excel file with three sheets: 'Supergroups', 'Groups', and 'Subgroups'. + + Parameters + ---------- + None + + Returns + ------- + None + Writes 'bar_chart_data_table.xlsx' to the bar_charts output folder with three sheets: + - 'Supergroups': Filtered UK standard cluster means (only 'Supergroup' rows) + - 'Groups': Parent cluster group means + - 'Subgroups': Parent cluster subgroup means + """ + # Read the CSV files + lookup = pd.read_csv("./data/lookups/UK_selected_codes_lookup.csv") + uk_std = pd.read_csv("./data/output_data/std_means/uk_std_means/uk_std_cluster_means_output.csv") + group_means = pd.read_csv( + "./data/output_data/std_means/parent_std_means/parent_std_cluster_group_means_output.csv" + ) + subgroup_means = pd.read_csv( + "./data/output_data/std_means/parent_std_means/parent_std_cluster_subgroup_means_output.csv" + ) + + # Filter rows where 'hierarchy_level' contains 'supergroup' + uk_supergroups = uk_std[uk_std["hierarchy_level"].str.contains("supergroup", na=False)] + # Remove the 'hierarchy_level' column and rename 'cluster' to 'supergroups' + uk_supergroups = uk_supergroups.drop(columns=["hierarchy_level"]) + uk_supergroups = uk_supergroups.rename(columns={"cluster": "supergroups"}) + + rename_dict = dict(zip(lookup["new_code"], lookup["radial_plot_label"])) + + # Rename columns in all three DataFrames + uk_supergroups = uk_supergroups.rename(columns=rename_dict) + group_means = group_means.rename(columns=rename_dict) + subgroup_means = subgroup_means.rename(columns=rename_dict) + + # Round all numeric columns to 3 decimal places + uk_supergroups = uk_supergroups.round(3) + group_means = group_means.round(3) + subgroup_means = subgroup_means.round(3) + + # Write to Excel with three tabs + with pd.ExcelWriter("data/output_data/bar_charts/bar_chart_data_table.xlsx") as writer: + uk_supergroups.to_excel(writer, sheet_name="Supergroups", index=False) + group_means.to_excel(writer, sheet_name="Groups", index=False) + subgroup_means.to_excel(writer, sheet_name="Subgroups", index=False)
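bar_chart_data_table above relies on pandas' multi-sheet Excel export. The core pattern in isolation (illustrative frames and values; writing .xlsx requires an engine such as openpyxl to be installed):

import pandas as pd

# Illustrative stand-ins for the three cluster-means tables; the real ones are much wider.
supergroups = pd.DataFrame({"supergroups": ["1", "2"], "v01": [0.1234, -0.5678]})
groups = pd.DataFrame({"group": ["1a", "1b"], "v01": [0.4567, 0.0123]})
subgroups = pd.DataFrame({"subgroup": ["1a1", "1b1"], "v01": [0.7891, -0.2345]})

# One ExcelWriter context, one sheet per DataFrame.
with pd.ExcelWriter("bar_chart_data_table.xlsx") as writer:
    supergroups.round(3).to_excel(writer, sheet_name="Supergroups", index=False)
    groups.round(3).to_excel(writer, sheet_name="Groups", index=False)
    subgroups.round(3).to_excel(writer, sheet_name="Subgroups", index=False)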
diff --git a/area_classification/post_processing/post_processing.py b/area_classification/post_processing/post_processing.py index 9479f9e..81ac7c7 100644 --- a/area_classification/post_processing/post_processing.py +++ b/area_classification/post_processing/post_processing.py @@ -1,21 +1,21 @@ # Post clustering wrapper -import os -import pandas as pd -from area_classification.utilities.load_config import load_config -from area_classification.post_processing.cluster_table_restructure import cluster_table_restructure +from area_classification.post_processing.cluster_std_means_to_parent_clusters import ( + cluster_std_means_to_parent_clusters, +) +from area_classification.post_processing.cluster_summaries import cluster_summaries_wrapper +from area_classification.post_processing.cluster_table_restructure import cluster_table_restructure from area_classification.post_processing.cluster_variables_mean import cluster_variable_means -from area_classification.post_processing.cluster_std_means_to_parent_clusters import cluster_std_means_to_parent_clusters from area_classification.post_processing.create_radial_plots import create_radial_plots_wrapper -from area_classification.post_processing.cluster_summaries import cluster_summaries_wrapper +from area_classification.post_processing.horizontal_bar_chart import create_bar_charts_wrapper from area_classification.pre_processing.prepare_clustering_data import standardise_data -from area_classification.post_processing.horizontal_bar_chart import create_bar_charts_wrapper + def post_processing(config, clustering_output, chosen_clustering_variables): """ - Wrapper function to standardise the data and restructure the table created when clustering. + Wrapper function to standardise the data and restructure the table created when clustering. Calculates means of each cluster, based on the restructured table and standardised data. Creates radial plots and drafts cluster summaries. - + Parameters ---------- config : dict @@ -28,19 +28,23 @@ def post_processing(config, clustering_output, chosen_clustering_variables): Returns ------- tuple of pd.DataFrame - (combined_group_means, combined_subgroup_means): DataFrames containing means for group and subgroup clusters. + (combined_group_means, combined_subgroup_means): DataFrames containing means + for group and subgroup clusters. 
""" # Run the standardise_data function on chosen_clustering_variables standardised_data = standardise_data(chosen_clustering_variables) - # Step 1: Restructure the cluster table to have separate columns for supergroup, group and subgroup + # Step 1: Restructure the cluster table to have separate columns for + # supergroup, group and subgroup restructured_cluster_table, restructured_cluster_table_long = cluster_table_restructure( - config, clustering_output, config["split_column"], config["keep_column"], standardised_data + config, clustering_output, config["split_column"], config["keep_column"], standardised_data ) - # Step 2: Calculate means for each variable for each cluster - uk_std_cluster_means = cluster_variable_means(config, restructured_cluster_table, standardised_data) + # Step 2: Calculate means for each variable for each cluster + uk_std_cluster_means = cluster_variable_means( + config, restructured_cluster_table, standardised_data + ) # Step 3: Run cluster_std_means_to_parent_clusters and capture the returned means combined_group_means, combined_subgroup_means = cluster_std_means_to_parent_clusters( @@ -48,15 +52,37 @@ def post_processing(config, clustering_output, chosen_clustering_variables): ) # Step 4: Create radial plots for the clusters using the combined means - create_radial_plots_wrapper(config, uk_std_cluster_means, combined_group_means, combined_subgroup_means) + create_radial_plots_wrapper( + config, uk_std_cluster_means, combined_group_means, combined_subgroup_means + ) # Step 5: Data visualisation - create_bar_charts_wrapper(config, uk_std_cluster_means, combined_group_means, combined_subgroup_means) + create_bar_charts_wrapper( + config, uk_std_cluster_means, combined_group_means, combined_subgroup_means + ) # Step 6: Draft cluster summaries - cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cluster_means, config["select_variables_lookup"], cluster_column = 'supergroup') - cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cluster_means, config["select_variables_lookup"], cluster_column = 'group') - cluster_summaries_wrapper(config, restructured_cluster_table_long, uk_std_cluster_means, config["select_variables_lookup"], cluster_column = 'subgroup') - + cluster_summaries_wrapper( + config, + restructured_cluster_table_long, + uk_std_cluster_means, + config["select_variables_lookup"], + cluster_column="supergroup", + ) + cluster_summaries_wrapper( + config, + restructured_cluster_table_long, + uk_std_cluster_means, + config["select_variables_lookup"], + cluster_column="group", + ) + cluster_summaries_wrapper( + config, + restructured_cluster_table_long, + uk_std_cluster_means, + config["select_variables_lookup"], + cluster_column="subgroup", + ) + # Return the combined means for further use if needed - return combined_group_means, combined_subgroup_means, uk_std_cluster_means \ No newline at end of file + return combined_group_means, combined_subgroup_means, uk_std_cluster_means diff --git a/area_classification/pre_processing/aggregating_variables.py b/area_classification/pre_processing/aggregating_variables.py index 59c6a8a..c24b12a 100644 --- a/area_classification/pre_processing/aggregating_variables.py +++ b/area_classification/pre_processing/aggregating_variables.py @@ -1,9 +1,9 @@ -import pandas as pd -import os import logging +import os logger = logging.getLogger(__name__) + def aggregating_variables(df_temp, aggregation_configs, config): """ This function aggregates specified columns in a temporary DataFrame 
and adds the aggregated @@ -11,13 +11,13 @@ def aggregating_variables(df_temp, aggregation_configs, config): Parameters ---------- - df_temp : pd.DataFrame + df_temp : pd.DataFrame The temporary DataFrame to update. aggregation_configs : list of dict A list of dictionaries, one each for EW, NI and Scot. Each dictionary contains: - 'col_names' (list): List of column codes to aggregate. - 'new_col_name' (str): Name of the new column to create. - For example: + For example: cars_2_or_more: [ts0450004, ts0450005] config : dict A dictionary containing user configuration settings, including the QA path. @@ -31,28 +31,29 @@ def aggregating_variables(df_temp, aggregation_configs, config): col_names = aggregation_configs[key] new_col_name = key - # Check if all columns in col_names exist in df_temp + # Check if all columns in col_names exist in df_temp missing_cols = [col for col in col_names if col not in df_temp.columns] if missing_cols: - logger.warning(f"Warning: Missing columns {missing_cols} in DataFrame. Skipping aggregation for {new_col_name}.") + logger.warning( + f"Warning: Missing columns {missing_cols} in DataFrame. " + + f"Skipping aggregation for {new_col_name}." + ) continue # Add the new column by summing the specified columns df_temp[new_col_name] = df_temp[col_names].sum(axis=1) # Extract the header of column 1 (the country area code type e.g. LTLA, LGD or CA19) - if not df_temp.empty: - # Convert to string for use in the file name - country_lad_code = str(df_temp.columns[0]) - else: - # Handle empty DataFrame case - country_lad_code = "N/A" + # Convert to string for use in the file name; "N/A" handles an empty DataFrame + country_lad_code = str(df_temp.columns[0]) if not df_temp.empty else "N/A" # Ensure QA directory exists os.makedirs(os.path.dirname(config["qa_directory"]), exist_ok=True) # Save to data QA folder with country area code type in the file name - output_file_path = f"{config['qa_directory']}preprocessing_aggregated_all_variables_{country_lad_code}.csv" + output_file_path = ( + f"{config['qa_directory']}preprocessing_aggregated_all_variables_{country_lad_code}.csv" + ) df_temp.to_csv(output_file_path, index=False) - - return df_temp \ No newline at end of file + + return df_temp diff --git a/area_classification/pre_processing/convert_to_percentages.py b/area_classification/pre_processing/convert_to_percentages.py index 81477f2..4f7821f 100644 --- a/area_classification/pre_processing/convert_to_percentages.py +++ b/area_classification/pre_processing/convert_to_percentages.py @@ -1,37 +1,37 @@ -import pandas as pd - def convert_to_percentages(raw_totals_df): """ Converts raw totals DataFrame to percentages by dividing variable columns by their corresponding '_total' columns and multiplying by 100. - + Parameters ---------- raw_totals_df : pd.DataFrame - Input DataFrame with area codes as the first column followed by columns like 'v01', 'v01_total'. - Values are raw counts. - + Input DataFrame with area codes as the first column followed by columns like + 'v01', 'v01_total'. Values are raw counts. + Returns ------- pd.DataFrame - DataFrame with area codes in the first column followed by percentage values for + DataFrame with area codes in the first column followed by percentage values for each variable from v1 to v60 (excluding '_total' columns). 
""" - + # Create a copy of the DataFrame to store percentages percentages_df = raw_totals_df.copy() - + # Iterate over columns to calculate percentages for col in raw_totals_df.columns: - if col.endswith('_total'): + if col.endswith("_total"): # Get the base column name (e.g., 'v01' from 'v01_total') - base_col = col.replace('_total', '') - + base_col = col.replace("_total", "") + if base_col in raw_totals_df.columns: # Calculate percentage and update the base column percentages_df[base_col] = (raw_totals_df[base_col] / raw_totals_df[col]) * 100 - + # Drop all '_total' columns - percentages_df = percentages_df[[col for col in percentages_df.columns if not col.endswith('_total')]] - - return percentages_df \ No newline at end of file + percentages_df = percentages_df[ + [col for col in percentages_df.columns if not col.endswith("_total")] + ] + + return percentages_df diff --git a/area_classification/pre_processing/drop_variables.py b/area_classification/pre_processing/drop_variables.py index e9b0c69..52c8793 100644 --- a/area_classification/pre_processing/drop_variables.py +++ b/area_classification/pre_processing/drop_variables.py @@ -1,12 +1,10 @@ # This script which does not include all 60 variables in the clustering. -import pandas as pd -import yaml def check_drop_columns_true(config, preprocessed_df): """ - This function checks if the 'drop_columns' key in the config is set to - True. If it is, it calls the drop_variables_pre_clustering function to + This function checks if the 'drop_columns' key in the config is set to + True. If it is, it calls the drop_variables_pre_clustering function to drop specified columns from the preprocessed input table. Parameters @@ -19,19 +17,19 @@ def check_drop_columns_true(config, preprocessed_df): Returns ------- pd.DataFrame - DataFrame with specified columns dropped if 'drop_columns' is True; + DataFrame with specified columns dropped if 'drop_columns' is True; therwise, the original DataFrame. """ - + # Check if 'drop_columns' is set to True in the config if config["drop_columns"]: - return drop_variables_pre_clustering(config, preprocessed_df, - config.get('variables_to_drop', [])) - else: + return drop_variables_pre_clustering( + config, preprocessed_df, config.get("variables_to_drop", []) + ) + else: return preprocessed_df - def drop_variables_pre_clustering(config, preprocessed_df, variables_to_drop): """ Duplicates the preprocessed input table, removes columns listed in 'variables_to_drop', @@ -40,7 +38,8 @@ def drop_variables_pre_clustering(config, preprocessed_df, variables_to_drop): Parameters ---------- config : dict - Configuration dictionary containing the output file path under 'pre_clustering_data_filtered'. + Configuration dictionary containing the output file path under + 'pre_clustering_data_filtered'. preprocessed_df : pd.DataFrame DataFrame containing the preprocessed table. variables_to_drop : list @@ -50,17 +49,14 @@ def drop_variables_pre_clustering(config, preprocessed_df, variables_to_drop): ------- pd.DataFrame The filtered DataFrame with specified columns removed. 
- """ + """ # Duplicate the preprocessed input table processed_input_table = preprocessed_df.copy() - + # Drop the specified columns - pre_clustering_filtered = processed_input_table.drop(columns=variables_to_drop, errors='ignore') + pre_clustering_filtered = processed_input_table.drop(columns=variables_to_drop, errors="ignore") - # Save the filtered table as a new CSV file + # Save the filtered table as a new CSV file pre_clustering_filtered.to_csv(config["pre_clustering_data_filtered"], index=False) return pre_clustering_filtered - - - \ No newline at end of file diff --git a/area_classification/pre_processing/pre_processing.py b/area_classification/pre_processing/pre_processing.py index 0461853..7a023fd 100644 --- a/area_classification/pre_processing/pre_processing.py +++ b/area_classification/pre_processing/pre_processing.py @@ -1,16 +1,20 @@ -import pandas as pd import os -from area_classification.utilities.load_config import load_config -from area_classification.pre_processing.standardised_illness_ratio import sir_processing + +import pandas as pd + from area_classification.pre_processing.aggregating_variables import aggregating_variables -from area_classification.pre_processing.select_variables import select_variables -from area_classification.pre_processing.select_totals_columns import select_totals_columns from area_classification.pre_processing.convert_to_percentages import convert_to_percentages +from area_classification.pre_processing.select_totals_columns import select_totals_columns +from area_classification.pre_processing.select_variables import select_variables +from area_classification.pre_processing.standardised_illness_ratio import sir_processing +from area_classification.utilities.load_config import load_config -# Assume that the data has been loaded and is in a pandas dataframe (e.g. ran NI / EW bulks and downloaded Scot) + +# Assume that the data has been loaded and is in a pandas dataframe (e.g. ran NI / EW bulks +# and downloaded Scot) def pre_processing(ew_df, ni_df, scot_df, config): """ - Processes census data from England, Wales, Northern Ireland, and Scotland to ensure + Processes census data from England, Wales, Northern Ireland, and Scotland to ensure consistency of datasets before being fed into clustering algorithm. Parameters @@ -27,7 +31,7 @@ def pre_processing(ew_df, ni_df, scot_df, config): Returns ------- pd.DataFrame - Combined and pre-processed DataFrame containing data for England, Wales, Northern Ireland, + Combined and pre-processed DataFrame containing data for England, Wales, Northern Ireland, and Scotland, for the 60 specific variables required for area classification clustering. Notes @@ -49,7 +53,7 @@ def pre_processing(ew_df, ni_df, scot_df, config): If there are issues with data merging or transformations. 
""" - aggregation_config = load_config('area_classification/aggregation_setup.yaml') + aggregation_config = load_config("area_classification/aggregation_setup.yaml") select_variables_lookup = pd.read_csv(config["select_variables_lookup"]) dfs = {"ew": ew_df, "ni": ni_df, "scot": scot_df} @@ -59,34 +63,45 @@ def pre_processing(ew_df, ni_df, scot_df, config): for key in dfs: # Make the key to extract the information from config file join_column_name = key + "_join_column_name" - exclude_form_code_key = key + "_excluded_form_code" + # exclude_form_code_key = key + "_excluded_form_code" df_temp = dfs[key] # Aggregate variables which need to be combined categories - aggregation_configs = aggregation_config[key + '_file_configs'] + aggregation_configs = aggregation_config[key + "_file_configs"] df_temp = aggregating_variables(df_temp, aggregation_configs, config) # Join SIR column into the main DataFrame - df_temp = pd.merge(df_temp,sir_output_df[["area_code","SIR"]],how = "left", left_on = config[join_column_name], right_on = "area_code").drop(columns=["area_code"]) - + df_temp = pd.merge( + df_temp, + sir_output_df[["area_code", "SIR"]], + how="left", + left_on=config[join_column_name], + right_on="area_code", + ).drop(columns=["area_code"]) + # Check cases where SIR is NaN and try to match with sir_output_df - # This is a workaround for cases where the area code in the main df does not match exactly with the area code in the sir_output_df - # Occurs where Area code is combined for small areas + # This is a workaround for cases where the area code in the main df does not + # match exactly with the area code in the sir_output_df + # Occurs where Area code is combined for small areas for idx, row in df_temp[df_temp["SIR"].isna()].iterrows(): area_code = row[config[join_column_name]] - match_in_sir = sir_output_df[sir_output_df["area_code"].str.contains(str(area_code), na=False)] + match_in_sir = sir_output_df[ + sir_output_df["area_code"].str.contains(str(area_code), na=False) + ] if not match_in_sir.empty: df_temp.at[idx, "SIR"] = match_in_sir["SIR"].values[0] # Select the 60 variables a used in previous itterations of the area classification country_variables_lookup = select_variables_lookup[select_variables_lookup["country"] == key] df_temp = select_variables(df_temp, country_variables_lookup) - df_temp.rename(columns={config[join_column_name]: "LAD_code"},inplace=True) + df_temp.rename(columns={config[join_column_name]: "LAD_code"}, inplace=True) # Write the DataFrame to a CSV file os.makedirs(os.path.dirname(config["qa_directory"]), exist_ok=True) - output_csv_path = os.path.join(config["qa_directory"], f"preprocessing_{key}_selected_variables.csv") + output_csv_path = os.path.join( + config["qa_directory"], f"preprocessing_{key}_selected_variables.csv" + ) df_temp.to_csv(output_csv_path, index=False) # overwriting original df with processed df @@ -97,8 +112,8 @@ def pre_processing(ew_df, ni_df, scot_df, config): # Convert counts to percentages preprocessed_df = convert_to_percentages(raw_totals_df) - + # Save pre-clustering data (unstandardized, used in cluster means) preprocessed_df.to_csv(config["pre_clustering_data"], index=False) - return preprocessed_df \ No newline at end of file + return preprocessed_df diff --git a/area_classification/pre_processing/prepare_clustering_data.py b/area_classification/pre_processing/prepare_clustering_data.py index c91f767..7238a9c 100644 --- a/area_classification/pre_processing/prepare_clustering_data.py +++ 
diff --git a/area_classification/pre_processing/prepare_clustering_data.py b/area_classification/pre_processing/prepare_clustering_data.py index c91f767..7238a9c 100644 --- a/area_classification/pre_processing/prepare_clustering_data.py +++ b/area_classification/pre_processing/prepare_clustering_data.py @@ -1,4 +1,3 @@ -import pandas as pd import numpy as np @@ -46,17 +45,17 @@ def standardise_data(dataframe): A DataFrame with standardised numeric columns. """ standardised_data = dataframe.copy() - # Skip the first column (e.g., area codes) - for column in dataframe.columns[1:]: + # Skip the first column (e.g., area codes) + for column in dataframe.columns[1:]: mean = dataframe[column].mean() # Use population standard deviation - std = dataframe[column].std(ddof=0) + std = dataframe[column].std(ddof=0) # Avoid division by zero - if std != 0: + if std != 0: standardised_data[column] = (dataframe[column] - mean) / std else: # If std is 0, set standardised values to 0 - standardised_data[column] = 0 + standardised_data[column] = 0 return standardised_data @@ -97,4 +96,4 @@ def apply_min_max_scaling(dataframe): scaled_data.iloc[:, 1:] = (scaled_data.iloc[:, 1:] - scaled_data.iloc[:, 1:].min()) / ( scaled_data.iloc[:, 1:].max() - scaled_data.iloc[:, 1:].min() ) - return scaled_data \ No newline at end of file + return scaled_data diff --git a/area_classification/pre_processing/select_totals_columns.py b/area_classification/pre_processing/select_totals_columns.py index 199f2c2..186966d 100644 --- a/area_classification/pre_processing/select_totals_columns.py +++ b/area_classification/pre_processing/select_totals_columns.py @@ -1,14 +1,16 @@ +import logging import os + import pandas as pd -import logging logger = logging.getLogger(__name__) + def select_totals_columns(config, inputs_folder): """ Extracts select files for England and Wales (ew), Northern Ireland (ni), and Scotland (scot), - matches variable columns with their corresponding totals using a lookup file, and appends the - totals to the select files. The processed files are then concatenated into a single DataFrame + matches variable columns with their corresponding totals using a lookup file, and appends the + totals to the select files. The processed files are then concatenated into a single DataFrame and saved to an output file. This is used to calculate percentages later in the pipeline. Parameters ---------- config : dict Configuration dictionary containing paths and settings inputs_folder : str - Path to the folder containing the select files and aggregated output tables for each + Path to the folder containing the select files and aggregated output tables for each country (ew, ni, scot). - - select files contain the area codes and raw counts for only the variables from v1 to v60. - - aggregated output tables contain the area codes and raw counts and totals for every + - select files contain the area codes and raw counts for only the variables + from v1 to v60. + - aggregated output tables contain the area codes and raw counts and totals for every variable using variables codes like ts, ni and uv. Codes ending '001' are the totals. Returns ------- - pd.DataFrame - A new DataFrame with the area codes in the first column followed by raw count values for each + pd.DataFrame + A new DataFrame with the area codes in the first column followed by raw count values for each variable from v1 to v60 and the total respondents to the question relating to that variable. 
""" # Load the lookup file - lookup_file = config["select_variables_lookup"] + lookup_file = config["select_variables_lookup"] lookup_df = pd.read_csv(lookup_file) # Filter out rows where 'new_code' is 'v12' or 'v33' (population density and SIR) @@ -45,21 +48,29 @@ def select_totals_columns(config, inputs_folder): # Loop through all files in the inputs folder for file_name in os.listdir(inputs_folder): - if file_name.endswith("_selected_variables.csv"): # Process only files ending with '_selected_variables.csv' + if file_name.endswith( + "_selected_variables.csv" + ): # Process only files ending with '_selected_variables.csv' # Determine the country and corresponding aggregated file based on the file name if "preprocessing_ew_selected_variables.csv" in file_name: country = "ew" - agg_file = os.path.join(inputs_folder, "preprocessing_aggregated_all_variables_LTLA.csv") + agg_file = os.path.join( + inputs_folder, "preprocessing_aggregated_all_variables_LTLA.csv" + ) # Decapitalize the table_ID_with_suffix column for England and Wales lookup_df["table_ID_with_suffix"] = lookup_df["table_ID_with_suffix"].str.lower() elif "preprocessing_ni_selected_variables.csv" in file_name: country = "ni" - agg_file = os.path.join(inputs_folder, "preprocessing_aggregated_all_variables_LGD.csv") + agg_file = os.path.join( + inputs_folder, "preprocessing_aggregated_all_variables_LGD.csv" + ) # Decapitalize the table_ID_with_suffix column for Northern Ireland lookup_df["table_ID_with_suffix"] = lookup_df["table_ID_with_suffix"].str.lower() elif "preprocessing_scot_selected_variables" in file_name: country = "scot" - agg_file = os.path.join(inputs_folder, "preprocessing_aggregated_all_variables_CA19.csv") + agg_file = os.path.join( + inputs_folder, "preprocessing_aggregated_all_variables_CA19.csv" + ) # Do not decapitalize the table_ID_with_suffix column for Scotland lookup_df["table_ID_with_suffix"] = lookup_df["table_ID"].astype(str) + "0001" else: @@ -76,31 +87,40 @@ def select_totals_columns(config, inputs_folder): # Iterate through each variable column in the select file for variable in select_df.columns[1:]: # Skip the first column - # Ignore 'v12' and 'v33' columns as these are already ratios, don't need to be percentages - if variable in ["v12", "v33"]: + # Ignore 'v12' and 'v33' columns as these are already ratios, + # don't need to be percentages + if variable in ["v12", "v33"]: continue # Only process variable columns - if variable.startswith("v"): + if variable.startswith("v"): # Special case for v19 in Scotland if country == "scot" and variable == "v19": # Directly set the total column to 'Total' - total_column = "Total" + total_column = "Total" else: # Find the corresponding total column in the lookup - match = country_lookup_df.loc[country_lookup_df["new_code"] == variable, "table_ID_with_suffix"] + match = country_lookup_df.loc[ + country_lookup_df["new_code"] == variable, "table_ID_with_suffix" + ] if not match.empty: - total_column = match.values[0] # Get the matching total column name (e.g., ts0010001) + total_column = match.values[ + 0 + ] # Get the matching total column name (e.g., ts0010001) else: - logger.warning(f"Warning: No match found for variable '{variable}' in lookup_df for {country}.") + logger.warning( + f"Warning: No match found for variable '{variable}' " + + "in lookup_df for {country}." 
+ ) continue - - + # Check if the total column exists in the aggregated variables file if total_column in agg_df.columns: # Add the total column to the select DataFrame select_df[f"{variable}_total"] = agg_df[total_column] else: - logger.warning(f"Warning: Total column '{total_column}' not found in agg file.") + logger.warning( + f"Warning: Total column '{total_column}' not found in agg file." + ) # Append the processed DataFrame to the list processed_dfs.append(select_df) @@ -116,7 +136,9 @@ def select_totals_columns(config, inputs_folder): raw_totals_df = raw_totals_df[reordered_columns] # Save the concatenated DataFrame to the output file - output_file = os.path.join(config["qa_directory"], "preprocessing_selected_variables_raw_totals.csv") + output_file = os.path.join( + config["qa_directory"], "preprocessing_selected_variables_raw_totals.csv" + ) raw_totals_df.to_csv(output_file, index=False) return raw_totals_df diff --git a/area_classification/pre_processing/select_variables.py b/area_classification/pre_processing/select_variables.py index 989d6dc..6444ac9 100644 --- a/area_classification/pre_processing/select_variables.py +++ b/area_classification/pre_processing/select_variables.py @@ -1,15 +1,14 @@ -import pandas as pd -import re -import os import logging +import re logger = logging.getLogger(__name__) + def select_variables(df_temp, lookup_df): """ Selects specific columns from a main DataFrame based on a lookup table and returns a new DataFrame with only the specified columns. It also takes - the variable_codes (which start either TS, ni or UV, based on the country) + the variable_codes (which start either TS, ni or UV, based on the country) and converts these all into new_codes which all start 'v'. Parameters @@ -17,20 +16,21 @@ def select_variables(df_temp, lookup_df): df_temp : pd.DataFrame The temp DataFrame containing all data. lookup_df : pd.DataFrame - DataFrame containing 'variable_code' and 'new_code' columns to select and rename columns. - + DataFrame containing 'variable_code' and 'new_code' columns to select and + rename columns. + Returns ------- - pd.DataFrame - A new DataFrame with only the specified columns, with area codes in the first column followed by - raw count values for each variable from v1 to v60. - + pd.DataFrame + A new DataFrame with only the specified columns, with area codes in the + first column followed by raw count values for each variable from v1 to v60. 
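The `select_df[f"{variable}_total"] = agg_df[total_column]` assignment above aligns on the DataFrame index rather than on the area code, so it assumes the select file and the aggregated file share row order. A toy illustration (column names invented):

```python
import pandas as pd

# Select file and aggregated file, assumed to share index/row order.
select_df = pd.DataFrame({"geography_code": ["E06000001", "E06000002"], "v01": [45, 70]})
agg_df = pd.DataFrame({"geography_code": ["E06000001", "E06000002"], "ts0010001": [1000, 1500]})

# The lookup says v01's total lives in column 'ts0010001'.
select_df["v01_total"] = agg_df["ts0010001"]
print(select_df["v01_total"].tolist())  # [1000, 1500]
```

If the two files could ever disagree on row order, merging on the area code column would be the safer pattern.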
+ """ # Extract the columns to select and their new names - selected_columns = lookup_df['variable_code'].dropna().tolist() - new_code = dict(zip(lookup_df['variable_code'], lookup_df['new_code'])) - + selected_columns = lookup_df["variable_code"].dropna().tolist() + new_code = dict(zip(lookup_df["variable_code"], lookup_df["new_code"])) + # Check for missing columns and log them valid_columns = [] for col in selected_columns: @@ -38,30 +38,31 @@ def select_variables(df_temp, lookup_df): logger.warning(f"Warning: Column '{col}' is missing in the temp DataFrame.") else: valid_columns.append(col) - + # Ensure the first column (area codes) of df_temp is included as the first column first_column = df_temp.columns[0] if first_column not in valid_columns: valid_columns.insert(0, first_column) - + logger.info(f"Columns to be selected: {valid_columns}") - + # Filter the main DataFrame to include only the valid columns # Rename the columns based on lookup table (V codes) filtered_df = df_temp[valid_columns].copy().rename(columns=new_code) # Keep the first column (area codes) in place and reorder the remaining columns first_column = filtered_df.columns[0] - remaining_columns = filtered_df.columns[1:] + remaining_columns = filtered_df.columns[1:] # Combine the first column (area codes) with the reordered remaining columns - ordered_columns = [first_column] + sorted(remaining_columns) + ordered_columns = [first_column] + sorted(remaining_columns) filtered_df = filtered_df[ordered_columns] return filtered_df + # Order the remaining columns based on the numeric value following 'v' def extract_numeric_value(col_name): - match = re.search(r'v(\d+)', col_name) + match = re.search(r"v(\d+)", col_name) # Default to infinity if no match - return int(match.group(1)) if match else float('inf') \ No newline at end of file + return int(match.group(1)) if match else float("inf") diff --git a/area_classification/pre_processing/standardised_illness_ratio.py b/area_classification/pre_processing/standardised_illness_ratio.py index 7b65964..bb11280 100644 --- a/area_classification/pre_processing/standardised_illness_ratio.py +++ b/area_classification/pre_processing/standardised_illness_ratio.py @@ -1,16 +1,15 @@ # Expected input # area_code | age_group | total_disabled | total_population -#--------------------------------------------------------------- +# --------------------------------------------------------------- # E06000001 | 0-14 | 50 | 1000 # | | | # ^^ Areas code for all of UK, age groups ='<15 and >=65' and '15_64' -import pandas as pd -import os import logging +import os -logger = logging.getLogger(__name__) +import pandas as pd from area_classification.utilities.disability_age_group_conversion import ( convert_disability_age_group_england_wales, @@ -18,13 +17,16 @@ convert_disability_age_group_scotland, ) +logger = logging.getLogger(__name__) + + def sir_processing(config): """ Process disability data to calculate the Standardised Illness Ratio (SIR) for each area code. - This function first checks if the required disability data files are present, if missing it runs a function - to generate them. When all required files are present, it combines them into one dataframe to then calculate - the SIR for each area code. + This function first checks if the required disability data files are present, if missing + it runs a function to generate them. When all required files are present, it combines them + into one dataframe to then calculate the SIR for each area code. 
Parameters ----------- @@ -41,7 +43,7 @@ def sir_processing(config): required_files = [ config["england_wales_disability_file"], config["ni_disability_file"], - config["scotland_disability_file"] + config["scotland_disability_file"], ] # Create a list of the missing files missing_files = [] @@ -53,41 +55,61 @@ def sir_processing(config): # If each file is in the missing list, then run the conversion function to create that file if config["england_wales_disability_file"] in missing_files: - logger.warning(f"Warning: The file {config['england_wales_disability_file']} was not found in the input directory.") - convert_disability_age_group_england_wales(config["input_directory"] + config["england_wales_disability_input"], config) + logger.warning( + f"Warning: The file {config['england_wales_disability_file']} " + + "was not found in the input directory." + ) + convert_disability_age_group_england_wales( + config["input_directory"] + config["england_wales_disability_input"], config + ) if config["ni_disability_file"] in missing_files: - logger.warning(f"Warning: The file {config['ni_disability_file']} was not found in the input directory.") - convert_disability_age_group_northern_ireland(config["input_directory"] + config["ni_disability_input"], config) + logger.warning( + f"Warning: The file {config['ni_disability_file']} was not found in the input directory." + ) + convert_disability_age_group_northern_ireland( + config["input_directory"] + config["ni_disability_input"], config + ) if config["scotland_disability_file"] in missing_files: - convert_disability_age_group_scotland(config["input_directory"] + config["scotland_disability_input"], config) - logger.warning(f"Warning: The file {config['scotland_disability_file']} was not found in the input directory.") + convert_disability_age_group_scotland( + config["input_directory"] + config["scotland_disability_input"], config + ) + logger.warning( + f"Warning: The file {config['scotland_disability_file']} was" + + " not found in the input directory." + ) logger.warning(f"Warning: The following files were not found: {missing_files}") # Load the files for all three and then combine into a single dataframe - ew_disability_df = pd.read_csv(config["input_directory"]+config["england_wales_disability_file"]) - ni_disability_df = pd.read_csv(config["input_directory"]+config["ni_disability_file"]) - scotland_disability_df = pd.read_csv(config["input_directory"]+config["scotland_disability_file"]) - combined_disability_df = pd.concat( - [ew_disability_df, ni_disability_df, scotland_disability_df]) + ew_disability_df = pd.read_csv( + config["input_directory"] + config["england_wales_disability_file"] + ) + ni_disability_df = pd.read_csv(config["input_directory"] + config["ni_disability_file"]) + scotland_disability_df = pd.read_csv( + config["input_directory"] + config["scotland_disability_file"] + ) + combined_disability_df = pd.concat([ew_disability_df, ni_disability_df, scotland_disability_df]) # Perform the SIR calculation on the combined df sir_output_df = SIR_calculation(combined_disability_df, config) return sir_output_df + def SIR_calculation(df: pd.DataFrame, config: dict) -> pd.DataFrame: """ - Calculate the Standardised Illness Ratio (SIR) for a given DataFrame containing disability data. - This is calculated as a ratio of the 'disability_count' (observed disability) and 'exp_ill_all' (expected - disability count) based on national proportions. The SIR is calculated for each area code.
+ Calculate the Standardised Illness Ratio (SIR) for a given DataFrame containing disability + data. This is calculated as a ratio of the 'disability_count' (observed disability) and + 'exp_ill_all' (expected disability count) based on national proportions. The SIR is + calculated for each area code. At the end it runs the QA function which saves the file into the QA directory. Parameters ---------- df : DataFrame - DataFrame containing columns 'Area_Code', 'local_authority', 'age_group', 'Count', 'Population' - + DataFrame containing columns 'Area_Code', 'local_authority', 'age_group', + 'Count', 'Population' + Returns ------- pd.DataFrame @@ -100,63 +122,79 @@ def SIR_calculation(df: pd.DataFrame, config: dict) -> pd.DataFrame: """ # Calculate proportion of ill or disabled people for each age group at the national level - df_nat_summary = df.groupby('age_group').agg( - sum_population=('total_population', 'sum'), - sum_disability_count=('total_disabled', 'sum')).reset_index() - + df_nat_summary = ( + df.groupby("age_group") + .agg( + sum_population=("total_population", "sum"), + sum_disability_count=("total_disabled", "sum"), + ) + .reset_index() + ) + # Create national proportions for each age group by dividing the disability count by population - df_nat_summary['nat_prop'] = df_nat_summary['sum_disability_count'] / df_nat_summary['sum_population'] + df_nat_summary["nat_prop"] = ( + df_nat_summary["sum_disability_count"] / df_nat_summary["sum_population"] + ) # Join the national proportions back to the original DataFrame - df = df.merge(df_nat_summary[['age_group', 'nat_prop']], on='age_group', how='left', suffixes=('', '_nat')) + df = df.merge( + df_nat_summary[["age_group", "nat_prop"]], on="age_group", how="left", suffixes=("", "_nat") + ) - # Calculate exp_ill for each age group - df['exp_ill'] = df['nat_prop'] * df['total_population'] + # Calculate exp_ill for each age group + df["exp_ill"] = df["nat_prop"] * df["total_population"] # Sum exp ill and disability count (across age groups, to get one value per area code) - df_all = df.groupby(['area_code']).agg(exp_ill_all=('exp_ill', 'sum'), - disability_count=('total_disabled', 'sum')).reset_index() + df_all = ( + df.groupby(["area_code"]) + .agg(exp_ill_all=("exp_ill", "sum"), disability_count=("total_disabled", "sum")) + .reset_index() + ) # Calculate SIR for each Area Code - df_all['SIR'] = df_all.apply(lambda row: round((row['disability_count'] / row['exp_ill_all']) * 100, 4), axis=1) + df_all["SIR"] = df_all.apply( + lambda row: round((row["disability_count"] / row["exp_ill_all"]) * 100, 4), axis=1 + ) # QA check the SIR dataframe before returning sir_qa_checks(df_all, config) logger.info(f"SIR_DF_ALL: {df_all.head()}") return df_all + def sir_qa_checks(df: pd.DataFrame, config: dict) -> None: """ - Perform quality assurance (QA) checks on the SIR DataFrame and saves the table out as a CSV + Performs quality assurance (QA) checks on the SIR DataFrame and saves the table out as a CSV into the QA directory. - - This function check the data type is 'int64', checks the spatial distribution of SIR values, - and creates a summary of the SIR for each country. It then ensures the QA directory exists before + + This function checks the data type is 'int64', checks the spatial distribution of SIR values, + and creates a summary of the SIR for each country. It then ensures the QA directory exists before saving the output. - + Parameters ---------- df : DataFrame DataFrame containing SIR values.
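A hand-check of the SIR arithmetic in `SIR_calculation` above, using invented figures for a single area with two age groups:

```python
# National proportion of residents reporting illness, per age group (invented).
nat_prop = {"<15 and >=65": 0.20, "15-64": 0.10}

# One area's population and observed disabled counts per age group (invented).
pop = {"<15 and >=65": 1000, "15-64": 4000}
obs = {"<15 and >=65": 260, "15-64": 400}

# Expected illness applies the national rates to the local age structure.
exp_ill_all = sum(nat_prop[g] * pop[g] for g in pop)  # 200 + 400 = 600
disability_count = sum(obs.values())                  # 660

sir = round(disability_count / exp_ill_all * 100, 4)
print(sir)  # 110.0 -> 10% more illness than the age structure predicts
```

A value of 100 means illness exactly in line with national rates after age standardisation.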
- + Returns ------- None """ # Check if disability_count is int - assert df['disability_count'].dtype == 'int64', "Disability count should be of type int64" + if df["disability_count"].dtype != "int64": + logger.warning("Disability count should be of type int64") # Check expected spatial distribution logger.info("SIR values distribution:") - logger.info(df['SIR'].describe()) + logger.info(df["SIR"].describe()) - for country_code_starts_with in [["E", "W"], ['S'], ['N']]: + for country_code_starts_with in [["E", "W"], ["S"], ["N"]]: df_subset = df[df["area_code"].str.startswith(tuple(country_code_starts_with))] - logger.info(df_subset['SIR'].describe()) + logger.info(df_subset["SIR"].describe()) # Ensure QA directory exists os.makedirs(os.path.dirname(config["qa_directory"]), exist_ok=True) # Save to data QA folder output_file_path = config["qa_directory"] + "sir_calculation_qa_output.csv" - df.to_csv(output_file_path, index=False) \ No newline at end of file + df.to_csv(output_file_path, index=False) diff --git a/area_classification/utilities/disability_age_group_conversion.py b/area_classification/utilities/disability_age_group_conversion.py index 78ec073..26a1729 100644 --- a/area_classification/utilities/disability_age_group_conversion.py +++ b/area_classification/utilities/disability_age_group_conversion.py @@ -1,9 +1,12 @@ -import pandas as pd from pathlib import Path +import pandas as pd + + def define_age_bands_and_bools(df, lower_age_band_col="lower_age_band"): - """ - Function to define age bands and their corresponding boolean conditions based on the lower age band column. + """ + Function to define age bands and their corresponding boolean conditions based on + the lower age band column. Parameters ---------- @@ -16,22 +19,21 @@ def define_age_bands_and_bools(df, lower_age_band_col="lower_age_band"): ------- dict Dictionary with age band names as keys and boolean conditions as values. - """ - age_band_names_and_bools = { - "<15 and >=65": (df[lower_age_band_col]<15)|(df[lower_age_band_col]>=65), - "15-64": (df[lower_age_band_col]>=15) & (df[lower_age_band_col]<65), - } - return age_band_names_and_bools - - - -def convert_disability_age_group_scotland(filepath:str, config: dict) -> pd.DataFrame: + """ + age_band_names_and_bools = { + "<15 and >=65": (df[lower_age_band_col] < 15) | (df[lower_age_band_col] >= 65), + "15-64": (df[lower_age_band_col] >= 15) & (df[lower_age_band_col] < 65), + } + return age_band_names_and_bools + + +def convert_disability_age_group_scotland(filepath: str, config: dict) -> pd.DataFrame: """ Function to convert disability age group data from Scotland into a standard format, iterating based on council areas. - As mentioned in the main README for this repo, disability data for Scotland needs to be downloaded manually - from the Scotland's Census Flexible Table Builder (UV303a) and saved into the 'data/inputs/scot_downloads folder. - The file should be named 'UV303a.csv'. + As mentioned in the main README for this repo, disability data for Scotland needs to + be downloaded manually from the Scotland's Census Flexible Table Builder (UV303a) and + saved into the 'data/inputs/scot_downloads' folder. The file should be named 'UV303a.csv'. Output is written to a csv file in the input_directory @@ -47,13 +49,13 @@ def convert_disability_age_group_scotland(filepath:str, config: dict) -> pd.Data pd.DataFrame disability data combined into two age groups: "<15 and >=65" and "15-64". Columns: council_area, age_group, total_population, total_disabled.
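The masks returned by `define_age_bands_and_bools` are ordinary boolean Series, so they can be checked in isolation. A small sketch with toy values, assuming the package is importable:

```python
import pandas as pd

from area_classification.utilities.disability_age_group_conversion import (
    define_age_bands_and_bools,
)

df = pd.DataFrame({"lower_age_band": [0, 15, 50, 65, 80]})
bands = define_age_bands_and_bools(df)

# The two bands partition the rows: dependants vs working age.
print(bands["<15 and >=65"].tolist())  # [True, False, False, True, True]
print(bands["15-64"].tolist())         # [False, True, True, False, False]
```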
- """ + """ # Read the CSV file # Specify number of columns to read from the CSV n = 6 - df = pd.read_csv(filepath, skiprows=10, header=1 , usecols=range(n)) - + df = pd.read_csv(filepath, skiprows=10, header=1, usecols=range(n)) + df.columns = ["A", "B", "C", "D", "E", "F"] # Initialize an empty DataFrame to store results @@ -61,60 +63,69 @@ def convert_disability_age_group_scotland(filepath:str, config: dict) -> pd.Data # Iterate through rows to extract relevant data for index, row in df.iterrows(): - if str(row.iloc[0]).strip().lower() == 'sex': + if str(row.iloc[0]).strip().lower() == "sex": # Get the council area name if index == 0: - # If the index is 0, set council_area to "Clackmannanshire" (as this CA was removed in skip rows reformat) + # If the index is 0, set council_area to "Clackmannanshire" (as this CA was + # removed in skip rows reformat) council_area = "Clackmannanshire" # Set to "Clackmannanshire" for index 1 else: # If it's not the first one, instead get the area name from two rows above - council_area = df.iloc[index - 2, 0] if index - 2 >= 0 else None # Get value from two rows above + council_area = ( + df.iloc[index - 2, 0] if index - 2 >= 0 else None + ) # Get value from two rows above - # Ensure council_area is not None before proceeding if council_area is None: raise ValueError(f"Council area could not be determined at row {index}.") - + # Process the current council area sex_row_index = index - + # Keep only the 21 rows after the 'Sex' row (all people rows) council_df = df.iloc[sex_row_index + 1 : sex_row_index + 22].copy() - + # Rename columns for clarity - council_df = council_df.rename(columns={'A': "Sex", 'B': "age_band"}) - + council_df = council_df.rename(columns={"A": "Sex", "B": "age_band"}) + # Extract 'age_band' column age_band_list = council_df["age_band"].tolist()[1:] - + # Extract the first number from each age band string - first_element_list = [int(s.split()[0]) if isinstance(s, str) and len(s.split()) > 0 else '' for s in age_band_list] - + first_element_list = [ + int(s.split()[0]) if isinstance(s, str) and len(s.split()) > 0 else "" + for s in age_band_list + ] + # Map each age band to its lower boundary mapping_dictionary = dict(zip(age_band_list, first_element_list)) council_df["lower_age_band"] = council_df["age_band"].map(mapping_dictionary) - + # Select columns to convert to numeric - columns_to_convert = council_df.columns[2:] # Select all columns starting from the 3rd column onward + columns_to_convert = council_df.columns[ + 2: + ] # Select all columns starting from the 3rd column onward # Convert the selected columns to numeric for col in columns_to_convert: council_df[col] = pd.to_numeric(council_df[col], errors="coerce") - + # Call the function to define age bands and conditions - age_band_names_and_bools = define_age_bands_and_bools(council_df, lower_age_band_col="lower_age_band") + age_band_names_and_bools = define_age_bands_and_bools( + council_df, lower_age_band_col="lower_age_band" + ) # Define columns that contain 'limited a' in their name limited_a_cols = ["D", "E"] - + for age_band_name, condition in age_band_names_and_bools.items(): new_row = { "CA19": council_area, "age_group": age_band_name, "total_population": council_df.loc[condition, "C"].sum(), - "total_disabled": council_df.loc[condition, limited_a_cols].sum(axis=1).sum() + "total_disabled": council_df.loc[condition, limited_a_cols].sum(axis=1).sum(), } - if 'result_df' not in locals(): + if "result_df" not in locals(): result_df = pd.DataFrame([new_row]) else: 
result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True) @@ -122,24 +133,27 @@ def convert_disability_age_group_scotland(filepath:str, config: dict) -> pd.Data # Load the LAD codes and names lookup file lookup_file_path = config["LAD_lookup_file_path"] lookup_df = pd.read_csv(lookup_file_path) - lookup_dict = dict(zip(lookup_df['LAD22NM'].str.lower().str.strip(), lookup_df['LAD22CD'])) + lookup_dict = dict(zip(lookup_df["LAD22NM"].str.lower().str.strip(), lookup_df["LAD22CD"])) # Replace council area names with LAD codes - result_df["CA19"] = result_df["CA19"].str.strip().str.lower().map(lookup_dict).fillna(result_df["CA19"]) + result_df["CA19"] = ( + result_df["CA19"].str.strip().str.lower().map(lookup_dict).fillna(result_df["CA19"]) + ) - result_df.rename(columns={'CA19': 'area_code'}, inplace=True) + result_df.rename(columns={"CA19": "area_code"}, inplace=True) output_path = Path(config["input_directory"]) / "scot_disability_age_group.csv" result_df.to_csv(output_path, index=False) return result_df + def convert_disability_age_group_england_wales(filepath: str, config: dict) -> pd.DataFrame: """ Function to convert disability age group data from England and Wales into a standard format. - As mentioned in the main README for this repo, disability data for England and Wales needs to be downloaded - manually from the Office for National Statistics (ONS) website as it is not available in the bulk download. - It should have been manually saved into the 'data/inputs/ew_downloads folder. The file name should be - 'disabilitycensus2021.xlsx'. + As mentioned in the main README for this repo, disability data for England and Wales needs + to be downloaded manually from the Office for National Statistics (ONS) website as it is not + available in the bulk download. It should have been manually saved into the + 'data/inputs/ew_downloads' folder. The file name should be 'disabilitycensus2021.xlsx'. Output is written to a csv file in the input_directory @@ -155,16 +169,27 @@ def convert_disability_age_group_england_wales(filepath: str, config: dict) -> p pd.DataFrame disability data combined into two age groups: "<15 and >=65" and "15-64". Columns: local_authority, area_code, age_group, total_population, total_disabled.
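One behaviour of the `map(...).fillna(...)` chain above that is easy to miss: council areas that fail to match the lookup silently keep their original name rather than raising. A sketch (the lookup entry is invented):

```python
import pandas as pd

# Hypothetical lookup: lower-cased council area name -> LAD code.
lookup_dict = {"city of edinburgh": "S12000036"}

names = pd.Series(["City of Edinburgh ", "Somewhere Unmatched"])
codes = names.str.strip().str.lower().map(lookup_dict).fillna(names)

print(codes.tolist())  # ['S12000036', 'Somewhere Unmatched']
```

Logging a warning when any value survives unmapped might be worth considering, since a stray name where a code is expected will only surface downstream.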
- """ + """ df_ew = pd.read_excel(filepath, sheet_name="Table 6", skiprows=4) # Filter for persons and age bands; exclude gender breakdown - df_ew = df_ew.loc[(df_ew["Sex"]=="Persons")&(df_ew["Category"] == "Two category")] - df_ew = df_ew[["Year", "Local Authority", "Area Code", "Category", "Disability status", "Age","Count","Population"]] + df_ew = df_ew.loc[(df_ew["Sex"] == "Persons") & (df_ew["Category"] == "Two category")] + df_ew = df_ew[ + [ + "Year", + "Local Authority", + "Area Code", + "Category", + "Disability status", + "Age", + "Count", + "Population", + ] + ] df_ew = df_ew.rename(columns={"Local Authority": "local_authority", "Area Code": "area_code"}) - df_ew["Count"] = df_ew["Count"].replace({'[c]': 0, '[x]': 0}) - df_ew["Population"] = df_ew["Population"].replace({'[c]': 0, '[x]': 0}) + df_ew["Count"] = df_ew["Count"].replace({"[c]": 0, "[x]": 0}) + df_ew["Population"] = df_ew["Population"].replace({"[c]": 0, "[x]": 0}) # Extract the first integer from the Age column for comparison - df_ew["lower_age_band"] = df_ew["Age"].str.extract(r'(\d+)').astype(float) + df_ew["lower_age_band"] = df_ew["Age"].str.extract(r"(\d+)").astype(float) age_band_names_and_bools = define_age_bands_and_bools(df_ew, lower_age_band_col="lower_age_band") for age_band_name, condition in age_band_names_and_bools.items(): @@ -178,27 +203,33 @@ def convert_disability_age_group_england_wales(filepath: str, config: dict) -> p "local_authority": geo_name, "age_group": age_band_name, "total_disabled": group_df.loc[ - (group_df["age_group"] == age_band_name) & - (group_df["Disability status"] == "Disabled"), - "Count" - ].astype(int).sum(), - "total_population": group_df.loc[(group_df["age_group"] == age_band_name) , "Count"].sum() + (group_df["age_group"] == age_band_name) + & (group_df["Disability status"] == "Disabled"), + "Count", + ] + .astype(int) + .sum(), + "total_population": group_df.loc[ + (group_df["age_group"] == age_band_name), "Count" + ].sum(), } + print(condition) result_df_list.append(new_row) result_df = pd.DataFrame(result_df_list) output_path = Path(config["input_directory"]) / "ew_disability_age_group.csv" result_df.to_csv(output_path, index=False) - + return result_df -def convert_disability_age_group_northern_ireland(filepath:str, config:dict) -> pd.DataFrame: + +def convert_disability_age_group_northern_ireland(filepath: str, config: dict) -> pd.DataFrame: """ function to convert disability age group data from Northern Ireland into a standard format. - As mentioned in the main README for this repo, disability data for Northern Ireland needs to be downloaded - manually from the Northern Ireland Statistics and Research Agency (NISRA) website as it is not available in - the bulk download. It should have been manually saved into the 'data/inputs/ni_downloads folder. The file - should be named 'census-2021-ms-d02.xlsx'. + As mentioned in the main README for this repo, disability data for Northern Ireland needs to + be downloaded manually from the Northern Ireland Statistics and Research Agency (NISRA) + website as it is not available in the bulk download. It should have been manually saved + into the 'data/inputs/ni_downloads folder. The file should be named 'census-2021-ms-d02.xlsx'. Output is written to a csv file in the input_directory Parameters @@ -207,38 +238,42 @@ def convert_disability_age_group_northern_ireland(filepath:str, config:dict) -> filepath to the excel file containing the disability age group data. config : str Configuration dictionary containing paths and file names. 
- + Returns ------- pd.DataFrame disability data combined into two age groups: "<15 and >=65" and "15-64". Columns: lgd_code, lgd, age_group, total_population, total_disabled. - """ + """ # Read and preprocess the Excel file ni_df = pd.read_excel(filepath, sheet_name="LGD", skiprows=8).iloc[0:-14] - ni_df.columns = ni_df.columns.str.replace('\n', '').str.lower() + ni_df.columns = ni_df.columns.str.replace("\n", "").str.lower() ni_df.columns = ni_df.columns.str.replace("usual residents aged ", "", regex=False) ni_df.columns = ni_df.columns.str.replace(r":\s*day-to-day activities\s*", " ", regex=True) # Reshape the DataFrame to long format ni_long_df = ni_df.melt( - id_vars=["geography code", "geography"], - var_name="age_disability_group", - value_name="count" + id_vars=["geography code", "geography"], var_name="age_disability_group", value_name="count" ) # Extract lower age band from the group name - ni_long_df["lower_age_band"] = ni_long_df["age_disability_group"].str.extract(r'(\d*)').replace('',None).astype(float) + ni_long_df["lower_age_band"] = ( + ni_long_df["age_disability_group"].str.extract(r"(\d*)").replace("", None).astype(float) + ) # Define age band conditions age_band_names_and_bools = { - "<15 and >=65": (ni_long_df["lower_age_band"]<15)|(ni_long_df["lower_age_band"]>=65), - "15-64": (ni_long_df["lower_age_band"]>=15) & (ni_long_df["lower_age_band"]<65), - } - - disability_condition = ni_long_df["age_disability_group"].str.contains(r"limited a l.*", case=False, regex=True) - non_disability_condition = ni_long_df["age_disability_group"].str.contains(r"not limited", case=False, regex=True) - + "<15 and >=65": (ni_long_df["lower_age_band"] < 15) | (ni_long_df["lower_age_band"] >= 65), + "15-64": (ni_long_df["lower_age_band"] >= 15) & (ni_long_df["lower_age_band"] < 65), + } + + disability_condition = ni_long_df["age_disability_group"].str.contains( + r"limited a l.*", case=False, regex=True + ) + non_disability_condition = ni_long_df["age_disability_group"].str.contains( + r"not limited", case=False, regex=True + ) + # Aggregate results for each area and age band result_df_list = [] for (geo_code, geo_name), group_df in ni_long_df.groupby(["geography code", "geography"]): @@ -248,12 +283,14 @@ def convert_disability_age_group_northern_ireland(filepath:str, config:dict) -> "local_authority": geo_name, "age_group": age_band_name, "total_disabled": group_df.loc[condition & disability_condition, "count"].sum(), - "total_population": group_df.loc[condition & (disability_condition | non_disability_condition), "count"].sum() + "total_population": group_df.loc[ + condition & (disability_condition | non_disability_condition), "count" + ].sum(), } result_df_list.append(new_row) - # Create the result DataFrame and write to CSV + # Create the result DataFrame and write to CSV result_df = pd.DataFrame(result_df_list) output_path = Path(config["input_directory"]) / "ni_disability_age_group.csv" result_df.to_csv(output_path, index=False) - return result_df \ No newline at end of file + return result_df diff --git a/area_classification/utilities/load_config.py b/area_classification/utilities/load_config.py index 3bfcfc9..8b3668e 100644 --- a/area_classification/utilities/load_config.py +++ b/area_classification/utilities/load_config.py @@ -1,6 +1,8 @@ import getpass + import yaml + def replace_username_in_dict(d, username: str, placeholder: str = "{USERNAME}"): """ Recursively replaces all instances of a placeholder in a dictionary or list with @@ -15,7 +17,7 @@ def replace_username_in_dict(d, 
username: str, placeholder: str = "{USERNAME}"): The username to replace the placeholder with. placeholder : str, optional The string to be replaced by the username. Default is "{USERNAME}". - + Returns ------- dict, list, or str @@ -32,9 +34,8 @@ def replace_username_in_dict(d, username: str, placeholder: str = "{USERNAME}"): def load_config( - config_path: str = "./area_classification/config.yaml", - placeholder: str = "{USERNAME}" - ) -> dict: + config_path: str = "./area_classification/config.yaml", placeholder: str = "{USERNAME}" +) -> dict: """ Loads a YAML configuration file and replaces placeholders with the username. diff --git a/area_classification/utilities/loading_data.py b/area_classification/utilities/loading_data.py index 34ad2c7..5e083d4 100644 --- a/area_classification/utilities/loading_data.py +++ b/area_classification/utilities/loading_data.py @@ -1,11 +1,13 @@ -import pandas as pd -import os import glob -from functools import reduce import logging +import os +from functools import reduce + +import pandas as pd logger = logging.getLogger(__name__) + def load_data(filepath): """ Function to load data from a CSV file and handle missing values. @@ -14,23 +16,29 @@ ---------- filepath : str Path to the CSV file to be loaded. - + Returns ------- pd.DataFrame A pandas DataFrame containing the data from the CSV file. """ input_df = pd.read_csv(filepath, index_col=0) - + # Check for missing values missing_values = input_df.isnull().sum().sum() if missing_values > 0: - logger.warning(f"Warning: {missing_values} missing values found in input data. Missing values will be replaced with 0.") + logger.warning( + f"Warning: {missing_values} missing values found in input data. " + + "Missing values will be replaced with 0." + ) input_df.fillna(0, inplace=True) - + return input_df -def load_format_data(filepath:str, file_pattern:str, join_column_name:str, config: str) -> pd.DataFrame: + +def load_format_data( + filepath: str, file_pattern: str, join_column_name: str, config: str +) -> pd.DataFrame: """ Function to load and format data downloaded from API calls @@ -55,25 +63,27 @@ FileNotFoundError raises error if no files matching the pattern are found in the given filepath ValueError - raises error if the number of columns in the merged dataframe does not match the expected number - expected number is the sum of columns in all files minus the join column which is only present in the first file + raises error if the number of columns in the merged dataframe does not + match the expected number. The expected number is the sum of columns in + all files minus the join column which is only present in the first file (i.e.
len(file_list) - 1) - """ + """ - # Load all of the data tables into a single DataFrame - # First column will be geo code, others be questions and rows indicate responses - # Find all files matching the pattern "ts" followed by any three digits and ".csv" in the given filepath + # Load all of the data tables into a single DataFrame + # First column will be geo code, others be questions and rows indicate responses + # Find all files matching the pattern "ts" followed by any three digits and ".csv" in + # the given filepath pattern = os.path.join(filepath, file_pattern) file_list = glob.glob(pattern) - + # Raise an error if no files match the pattern if not file_list: raise FileNotFoundError(f"No files matching {file_pattern} found in {filepath}") - + # Initialize an empty list to store DataFrames dfs = [] num_columns = 0 - + # Read all files and store them in dfs for file in file_list: df = pd.read_csv(file) @@ -81,17 +91,21 @@ def load_format_data(filepath:str, file_pattern:str, join_column_name:str, confi dfs.append(df) # Remove the join column from count, only added in first df - num_columns -= (len(file_list) - 1) - + num_columns -= len(file_list) - 1 + # Merge all dataframes on join_column_name column - merged_df = reduce(lambda left, right: pd.merge(left, right, on=join_column_name, how='outer'), dfs) + merged_df = reduce( + lambda left, right: pd.merge(left, right, on=join_column_name, how="outer"), dfs + ) if num_columns != merged_df.shape[1]: - raise ValueError(f"Expected {num_columns} columns, but got {merged_df.shape[1]} columns after merging.") - + raise ValueError( + f"Expected {num_columns} columns, but got {merged_df.shape[1]} columns after merging." + ) + # Write the DataFrame to a CSV file country_lad = join_column_name output_csv_path = os.path.join(config["input_directory"], f"{country_lad}_all_variables.csv") os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) merged_df.to_csv(output_csv_path, index=False) - - return merged_df \ No newline at end of file + + return merged_df diff --git a/area_classification/utilities/qa_functions.py b/area_classification/utilities/qa_functions.py index 0e94d40..1cf49e7 100644 --- a/area_classification/utilities/qa_functions.py +++ b/area_classification/utilities/qa_functions.py @@ -1,15 +1,15 @@ +import pandas as pd -import pandas as pd -import os def run_qa_checks(df): """ - Main function to run QA checks on the provided dataframe. - - The user selects which type of QA checks to perform: an automatic summary or user-input checks - - If the user selects automatic summary checks, the function 'quality_checks_all_dfs' is called. + Main function to run QA checks on the provided dataframe. + + The user selects which type of QA checks to perform: an automatic summary or user-input checks + - If the user selects automatic summary checks, the function 'quality_checks_all_dfs' + is called. 
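The column-count invariant that `load_format_data` enforces can be verified on two toy tables (names invented):

```python
from functools import reduce

import pandas as pd

df_a = pd.DataFrame({"geo": ["E1", "E2"], "q1": [1, 2]})
df_b = pd.DataFrame({"geo": ["E1", "E2"], "q2": [3, 4]})
dfs = [df_a, df_b]

# Total columns across files, minus the join column re-counted in each
# file after the first: 4 - (2 - 1) = 3.
num_columns = sum(df.shape[1] for df in dfs) - (len(dfs) - 1)

merged = reduce(lambda left, right: pd.merge(left, right, on="geo", how="outer"), dfs)
assert merged.shape[1] == num_columns  # geo, q1, q2
```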
- If the user selects user-input checks, the function 'user_input_qa' is called - + Parameters ---------- df : pd.DataFrame @@ -21,10 +21,14 @@ """ # Ask the user which QA function(s) to run - qa_choice = input( - "Which QA function do you want to run?\n" - "Type 'auto' for automatic summary checks or'user' for user input checks:" - ).strip().lower() + qa_choice = ( + input( + "Which QA function do you want to run?\n" + "Type 'auto' for automatic summary checks or 'user' for user input checks:" + ) + .strip() + .lower() + ) if qa_choice == "auto": quality_checks_all_dfs(df) @@ -33,46 +37,52 @@ else: print("Invalid choice. No QA checks run.") + # Ask user to put in standard rows (for this area classification this would be number of LADs) -def quality_checks_all_dfs(df): +def quality_checks_all_dfs(df): """ Function to perform automatic quality checks on the dataframe. - The user can select between a basic or a full auto check. Each option will print a summary report of the dataframe but will provide - a different level of detail. - + The user can select between a basic or a full auto check. Each option will print a + summary report of the dataframe but will provide a different level of detail. + Basic Auto Check: - Prints the total number of rows and columns. - Checks if any column contains a mix of data types and prints a warning if so. - Checks for missing values and zero values, printing the total counts if any are found. - - After completing the basic auto check, the user is prompted to decide if they want to proceed with the full auto check. - + - After completing the basic auto check, the user is prompted to decide if they want + to proceed with the full auto check. + Full Auto Check: - Prints the total number of rows and columns. - Prints the column names and data types of each column. - - Checks if any column contains a mix of data types. Prints a warning and lists the columns containing mixed data types. - - Checks for missing values and zero values - lists the columns containing missing or zero values and the total count found in each. - - After completing the full auto check, the user is prompted to decide if they want to proceed with the user input checks. + - Checks if any column contains a mix of data types. Prints a warning and lists the + columns containing mixed data types. + - Checks for missing values and zero values - lists the columns containing missing or + zero values and the total count found in each. + - After completing the full auto check, the user is prompted to decide if they want to + proceed with the user input checks. Parameters ---------- df : pd.DataFrame The DataFrame to be checked. - Returns + Returns ---------- - Print statement at the specified level of detail. + Print statement at the specified level of detail. """ - + # Ask the user if they want a basic or detailed auto check - auto_check_type = input("Do you want to run a basic or a full auto check? (basic/full): ").strip().lower() + auto_check_type = ( + input("Do you want to run a basic or a full auto check? 
(basic/full): ").strip().lower() + ) - if auto_check_type == 'basic': + if auto_check_type == "basic": # Complete basic auto check while True: - print("Summary report for DataFrame:") # Print the total number of rows and columns @@ -86,8 +96,7 @@ def quality_checks_all_dfs(df): print(f"Warning: Column '{col}' contains different data types: {types_in_col}") mixed_type_columns.append(col) if not mixed_type_columns: - print('All columns contain a single data type') - + print("All columns contain a single data type") # Check for missing values in the DataFrame missing_values = df.isnull().sum().sum() @@ -95,26 +104,27 @@ def quality_checks_all_dfs(df): print("No missing values") else: print(f"Total missing values in DataFrame: {missing_values}") - + # Check for zero values in the DataFrame zero_values = (df == 0).sum().sum() if zero_values == 0: print("No zero values") else: print(f"Total zero values in DataFrame: {zero_values}") - + # Ask the user if they want to complete the full auto check - run_full = input('Do you want to complete the full auto check? (yes/no) ').strip().lower() - if run_full == 'yes': - auto_check_type = 'full' + run_full = ( + input("Do you want to complete the full auto check? (yes/no) ").strip().lower() + ) + if run_full == "yes": + auto_check_type = "full" break - + else: - print('QA checks complete') + print("QA checks complete") break - - if auto_check_type == 'full': + if auto_check_type == "full": # Complete full auto check print("Summary report for DataFrame:") @@ -126,7 +136,7 @@ def quality_checks_all_dfs(df): print(f"Column Headings: {df.columns.tolist()}") # Print data types of each column - print(f"Data Types of each column:") + print("Data Types of each column:") print(df.dtypes) # Check if any column contains a mix of data types @@ -137,7 +147,7 @@ def quality_checks_all_dfs(df): print(f"Warning: Column '{col}' contains different data types: {types_in_col}") mixed_type_columns.append(col) if not mixed_type_columns: - print('All columns contain a single data type') + print("All columns contain a single data type") # Check for missing values in the DataFrame missing_values = df.isnull().sum().sum() @@ -162,189 +172,230 @@ def quality_checks_all_dfs(df): zero_per_column = (df == 0).sum() print("Zero values per column:") print(zero_per_column[zero_per_column > 0]) - + # Ask the user if they want to complete the user input checks - run_user_input = input('Do you want to complete the user input checks? (yes/no) ').strip().lower() - if run_user_input == 'yes': + run_user_input = ( + input("Do you want to complete the user input checks? (yes/no) ").strip().lower() + ) + if run_user_input == "yes": user_input_qa(df) else: - print('QA checks complete') + print("QA checks complete") return - def user_input_qa(df): - """ Function to perform tailored quality checks on the dataframe based on user inputs. - The function performs three main checks - the user is prompted at each stage to select if they want to perform or skip the check: + The function performs three main checks - the user is prompted at each stage to select if + they want to perform or skip the check: 1. Structure of the dataframe - - User inputs the expected number of rows and columns. - - The function checks if the dataframe matches these expectations and prints a warning if not. + - User inputs the expected number of rows and columns. + - The function checks if the dataframe matches these expectations and prints a warning + if not. 2. 
Ranges - - User selects whether they want to check the value ranges for the entire dataframe or specific columns. - - For the entire dataframe - the user inputs the expected min and max values. - The function checks if any values fall outside this range and prints a warning if so. The duplicate count per column is listed. - The user is then prompted to decide if they want to check specific columns. - - For specific columns - the user specifies which columns to check and inputs the expected min and max for each. - The function checks if any values in those columns fall outside the specified ranges and prints a warning if so. - The user is then prompted to decide if they want to check any further columns. + - User selects whether they want to check the value ranges for the entire dataframe or + specific columns. + - For the entire dataframe - the user inputs the expected min and max values. + The function checks if any values fall outside this range and prints a warning if so. + The duplicate count per column is listed. The user is then prompted to decide if they + want to check specific columns. + - For specific columns - the user specifies which columns to check and inputs the expected + min and max for each. + The function checks if any values in those columns fall outside the specified ranges and + prints a warning if so. The user is then prompted to decide if they want to check any + further columns. 3. Unique Values/ Duplicates - - User selects whether they want to check for duplicate values in the entire dataframe or specific columns. - - For the entire dataframe - the function checks each column for duplicate values and prints the total number found in each column. - - For specific columns - the user specifies which columns to check. The function checks each specified column for duplicate values and prints the total number found. + - User selects whether they want to check for duplicate values in the entire dataframe + or specific columns. + - For the entire dataframe - the function checks each column for duplicate values and prints + the total number found in each column. + - For specific columns - the user specifies which columns to check. The function checks each + specified column for duplicate values and prints the total number found. Parameters ---------- df : pd.DataFrame The DataFrame to be checked. - Returns + Returns ---------- Print statements based on the checks performed. - + """ # Check One - Structure of the dataframe - # Asks the user to input the expected number of rows and columns. Then checks if the dataframe matches these expectations. + # Asks the user to input the expected number of rows and columns. Then checks if the + # dataframe matches these expectations. # Ask the user if they want to check the number of rows and columns - print('Check 1/3 - Structure of the DataFrame') - check_shape = input("Do you want to check the number of rows and columns? (yes/no): ").strip().lower() + print("Check 1/3 - Structure of the DataFrame") + check_shape = ( + input("Do you want to check the number of rows and columns? 
(yes/no): ").strip().lower() + ) if check_shape == "yes": expected_rows = int(input("Enter the expected number of rows: ")) expected_cols = int(input("Enter the expected number of columns: ")) - + # Check if actual shape matches expected shape actual_rows, actual_cols = df.shape if actual_rows != expected_rows: - print(f"Warning: DataFrame contains {actual_rows} rows, expected {expected_rows}.") + print(f"Warning: DataFrame contains {actual_rows} rows, expected {expected_rows}.") else: print("Row count matches expected value.") - + if actual_cols != expected_cols: print(f"Warning: DataFrame contains {actual_cols} columns, expected {expected_cols}.") - else: - print("Column count matches expected value.") - + else: + print("Column count matches expected value.") + else: print("Skipping row and column count check.") - # Check Two - Ranges - # Asks the user to input the expected range for data values (min and max). Then checks if any values fall outside this range. + # Asks the user to input the expected range for data values (min and max). Then checks if + # any values fall outside this range. - # Ask the user if they want to check value ranges - print('Check 2/3 - Ranges') - check_ranges = input("Do you want to check for values outside a specific range? (yes/no): ").strip().lower() - if check_ranges == 'yes': - + print("Check 2/3 - Ranges") + check_ranges = ( + input("Do you want to check for values outside a specific range? (yes/no): ").strip().lower() + ) + if check_ranges == "yes": # Ask the user if they want to check ranges for the entire dataframe or only specific columns - range_scope = input("Do you want to check the ranges for the entire dataframe? (yes/no): ").strip().lower() - if range_scope == 'yes': + range_scope = ( + input("Do you want to check the ranges for the entire dataframe? (yes/no): ") + .strip() + .lower() + ) + if range_scope == "yes": while True: # Ask the user to input the expected min and max values for the dataframe min_expected = float(input("Enter the minimum expected value: ")) max_expected = float(input("Enter the maximum expected value: ")) - + # Convert all columns to numeric where possible (required to check ranges) - df_numeric = df.apply(pd.to_numeric, errors='coerce') - + df_numeric = df.apply(pd.to_numeric, errors="coerce") + # Check for values outside the expected range outside_range = df_numeric[(df_numeric < min_expected) | (df_numeric > max_expected)] if outside_range.any().any(): print("Warning: There are values outside the expected range!") - + # Print columns and counts of out-of-range values - out_of_range_counts = ((df_numeric < min_expected) | (df_numeric > max_expected)).sum() + out_of_range_counts = ( + (df_numeric < min_expected) | (df_numeric > max_expected) + ).sum() print("Out-of-range values per column:") print(out_of_range_counts[out_of_range_counts > 0]) - + else: print("All values are within the expected range.") - - # Ask the user if they want to check the range of any specific columns - specific_check = input("Do you want to check the range of any specific columns? (yes/no): ").strip().lower() - if specific_check == 'yes': - range_scope = 'no' + + # Ask the user if they want to check the range of any specific columns + specific_check = ( + input("Do you want to check the range of any specific columns? 
(yes/no): ") + .strip() + .lower() + ) + if specific_check == "yes": + range_scope = "no" break else: break - - if range_scope == 'no': + + if range_scope == "no": while True: # Ask the user to specify which columns to check - columns_to_check = input("Enter the column heading(s) you want to check, separated by commas: ").split(',') + columns_to_check = input( + "Enter the column heading(s) you want to check, separated by commas: " + ).split(",") columns_to_check = [col.strip() for col in columns_to_check] for col in columns_to_check: if col in df.columns: - min_expected = float(input(f"Enter the minimum expected value for column '{col}': ")) - max_expected = float(input(f"Enter the maximum expected value for column '{col}': ")) + min_expected = float( + input(f"Enter the minimum expected value for column '{col}': ") + ) + max_expected = float( + input(f"Enter the maximum expected value for column '{col}': ") + ) # Convert column to numeric for comparison - col_numeric = pd.to_numeric(df[col], errors='coerce') - out_of_range_mask = (col_numeric < min_expected) | (col_numeric > max_expected) + col_numeric = pd.to_numeric(df[col], errors="coerce") + out_of_range_mask = (col_numeric < min_expected) | ( + col_numeric > max_expected + ) out_of_range_count = out_of_range_mask.sum() if out_of_range_count > 0: - print(f"Warning: Column '{col}' has {out_of_range_count} values outside the expected range [{min_expected}, {max_expected}].") + print( + f"Warning: Column '{col}' has {out_of_range_count} values outside" + + "the expected range [{min_expected}, {max_expected}]." + ) else: print(f"All values in column '{col}' are within the expected range.") else: print(f"Column '{col}' not found in DataFrame.") # Ask the user if they want to check the range of any further columns - more_ranges = input("Do you want to check the range of any more columns? (yes/no): ").strip().lower() - if more_ranges == 'yes': - continue + more_ranges = ( + input("Do you want to check the range of any more columns? (yes/no): ") + .strip() + .lower() + ) + if more_ranges == "yes": + continue else: break - - - else: - print("Skipping range check.") - - + else: + print("Skipping range check.") # Check Three - Unique Values/ Duplicates - # Asks the user to specify if a column contains only unique values. Then checks if there are any duplicates in that column. + # Asks the user to specify if a column contains only unique values. Then checks if there are + # any duplicates in that column. - # Ask the user if they want to check for duplicate values. - print('Check 3/3 - Duplicate Values') + # Ask the user if they want to check for duplicate values. + print("Check 3/3 - Duplicate Values") check_duplicates = input("Do you want to check for duplicate values? (yes/no): ").strip().lower() - if check_duplicates == 'yes': - + if check_duplicates == "yes": # Ask the user if they want to check for the entire data frame or only specific columns - duplicate_scope = input("Do you want to check every column for duplicate values? (yes/no): ").strip().lower() - if duplicate_scope == 'yes': - #Check every column for duplicate values + duplicate_scope = ( + input("Do you want to check every column for duplicate values? 
(yes/no): ") + .strip() + .lower() + ) + if duplicate_scope == "yes": + # Check every column for duplicate values for col in df.columns: duplicate_count = df[col].duplicated().sum() if duplicate_count > 0: print(f"Warning: Column '{col}' contains {duplicate_count} duplicated values.") else: print(f"Column '{col}' contains no duplicate values.") - - elif duplicate_scope == 'no': - + + elif duplicate_scope == "no": while True: # Ask the user to specify which columns to check - columns_to_check = input("Enter the column heading(s) you want to check, separated by commas: ").split(',') + columns_to_check = input( + "Enter the column heading(s) you want to check, separated by commas: " + ).split(",") columns_to_check = [col.strip() for col in columns_to_check] for col in columns_to_check: if col in df.columns: duplicate_count = df[col].duplicated().sum() if duplicate_count > 0: - print(f"Warning: Column '{col}' contains {duplicate_count} duplicate values:") + print( + f"Warning: Column '{col}' contains {duplicate_count}" + + "duplicate values:" + ) # Print each duplicate value and how many times it appears in the column duplicated_values = df[col][df[col].duplicated(keep=False)] value_counts = duplicated_values.value_counts() @@ -354,17 +405,23 @@ def user_input_qa(df): print(f"Column '{col}' contains no duplicate values.") else: print(f"Column '{col}' not found in DataFrame.") - + # Ask the user if they want to check for duplicates in any further columns - more_duplicates = input("Do you want to check for duplicate values in any more columns? (yes/no): ").strip().lower() - if more_duplicates == 'yes': + more_duplicates = ( + input( + "Do you want to check for duplicate values in any more columns? (yes/no): " + ) + .strip() + .lower() + ) + if more_duplicates == "yes": continue else: break - + else: print("Skipping duplicate value check.") - print('QA checks complete') + print("QA checks complete") return diff --git a/data/lookups/UK_selected_codes_lookup.csv b/data/lookups/UK_selected_codes_lookup.csv index c829e3e..46eb867 100644 --- a/data/lookups/UK_selected_codes_lookup.csv +++ b/data/lookups/UK_selected_codes_lookup.csv @@ -1,5 +1,5 @@ variable_name,variable_code,table_ID,table_name,country,new_code,domain,radial_plot_label -Lives in a communal establishment,ts0010003,TS001,Residency type,ew,v01,Demography and Migration,Communual establishment living +Lives in a communal establishment,ts0010003,TS001,Residency type,ew,v01,Demography and Migration,Communal establishment living Never married and never registered a civil partnership,ts0020002,TS002,Legal partnership status,ew,v02,Demography and Migration,Never married and never in a civil partnership Married or in a registered civil partnership,ts0020003,TS002,Legal partnership status,ew,v03,Demography and Migration,Married or in a civil partnership Separated or divorced,separated_divorced,TS002,Legal partnership status,ew,v04,Demography and Migration,Separated or divorced @@ -59,7 +59,7 @@ Economically active: Unemployed,ts0660013,TS066,Economic activity status,ew,v57, Level 1 Level 2 or Apprenticeship,level_1_2_and_appr,TS067,Highest level of qualification,ew,v58,Education,Level 1 Level 2 or Apprenticeship Level 3 qualifications,ts0670006,TS067,Highest level of qualification,ew,v59,Education,Level 3 qualifications Level 4 qualifications or above,ts0670007,TS067,Highest level of qualification,ew,v60,Education,Level 4 qualifications or above -Lives in a communal establishment,ni1920003,ni192,Residence Type,ni,v01,Demography and 
Migration,Communual establishment living +Lives in a communal establishment,ni1920003,ni192,Residence Type,ni,v01,Demography and Migration,Communal establishment living Never married and never registered a civil partnership,ni1370002,ni137,Marital and Civil Partnership Status,ni,v02,Demography and Migration,Never married and never in a civil partnership Married or in a registered civil partnership,ni1380002,ni138,Marital and Civil Partnership Status - 3 Categories,ni,v03,Demography and Migration,Married or in a civil partnership Separated or divorced,separated_divorced,ni137,Marital and Civil Partnership Status,ni,v04,Demography and Migration,Separated or divorced @@ -118,7 +118,7 @@ Economically active: Unemployed,ni0500005,ni050,Economic Activity - 12 Categorie Level 1 Level 2 or Apprenticeship,level_1_2_and_appr,ni110,Qualifications (Highest Level),ni,v58,Education,Level 1 Level 2 or Apprenticeship Level 3 qualifications,ni1100006,ni110,Qualifications (Highest Level),ni,v59,Education,Level 3 qualifications Level 4 qualifications or above,ni1100007,ni110,Qualifications (Highest Level),ni,v60,Education,Level 4 qualifications or above -Lives in a communal establishment,UV101b0003,UV101b,Usual resident population by sex by age (6),scot,v01,Demography and Migration,Communual establishment living +Lives in a communal establishment,UV101b0003,UV101b,Usual resident population by sex by age (6),scot,v01,Demography and Migration,Communal establishment living Never married and never registered a civil partnership,UV1040002,UV104,Marital and civil partnership status,scot,v02,Demography and Migration,Never married and never in a civil partnership Married or in a registered civil partnership,UV1040003,UV104,Marital and civil partnership status,scot,v03,Demography and Migration,Married or in a civil partnership Separated or divorced,separated_divorced,UV104,Marital and civil partnership status,scot,v04,Demography and Migration,Separated or divorced diff --git a/docs/specifications/4.0_Post_Processing.md b/docs/specifications/4.0_Post_Processing.md index 60e3847..70a660c 100644 --- a/docs/specifications/4.0_Post_Processing.md +++ b/docs/specifications/4.0_Post_Processing.md @@ -76,6 +76,9 @@ Example group output using dummy data. The subgroup output has the same structur #### 4.2.3 Two groups of radial plots: One is based on the comparison of a cluster to the UK mean. The other group compare variables within a cluster to the mean of parent cluster. For example, a radial plot for cluster 5a (group level) represents a comparison to the mean of all clusters that make up supergroup 5. +#### 4.2.4 Bar charts +Similarly to the radial plots, two sets of bar charts are created for each cluster: one is based on the comparison of the cluster to the UK mean, and the other compares variables within a cluster to the mean of the parent cluster. For example, a bar chart for cluster 5a (group level) represents a comparison to the mean of all clusters that make up supergroup 5. These are displayed as horizontal bar charts, broken down by domain, e.g. labour market, education, housing. + ### 4.3 Process 1. Take the table containing all chosen variables that is outputted at the pre-processing stage. For each value and each variable, the standardised mean is calculated. This ensures that all variables contribute equally to further calculations of the mean. 2. @@ -83,7 +86,7 @@ One is based on the comparison of a cluster to the UK mean. The other group comp 3. 
Using the tables from steps 1 and 2, means are calculated for each variable at each cluster/hierarchy level, standardised to the UK mean. 4. Using the tables from steps 1 and 2, it sorts by supergroup/group/subgroup and calculates means of each variable in each cluster, standardised to the parent mean. 5. Radial plots are created from the outputs of steps 3 and 4. -6. Horizontal bar charts (long charts and small multiples), are created from the outputs of steps 3 and 4. +6. Horizontal bar charts are created from the outputs of steps 3 and 4. 7. Short summaries are printed out in the terminal when running the post-processing component. These provide an overview of the characteristics of each cluster. ### 4.4 Strengths @@ -94,5 +97,5 @@ Interpretability: Producing radial plots and bar charts highlight which variables drive cluster differences. ### 4.5 Limitations -Standardisation assumes that variables are normally distributed and standardised means can be affected by outliers in the data. It is therefore not always appropriate depending on the dataset. - +Standardisation assumes that variables are normally distributed and standardised means can be affected by outliers in the data. +It is therefore not always appropriate depending on the dataset. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..2f612e8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[tool.ruff.lint] +select = [ + "E", + "F", + "B", + "SIM", + "I", +] +ignore = ["D203", "E203"] + +[tool.ruff] +line-length = 101 +exclude = ["tests"] + +[tool.ruff.format] +line-ending = "auto" + +[tool.bandit] +exclude_dirs = ["tests", "docs"] +skips = [] diff --git a/setup.cfg b/setup.cfg index 9dbacba..56ebff5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,12 +13,12 @@ classifiers = [options] packages = area_classification + area_classification.analysis area_classification.clustering area_classification.downloading_data area_classification.post_processing area_classification.pre_processing area_classification.utilities - area_classification.data_visualisation include_package_data = True python_requires = >=3.9 zip_safe = no diff --git a/tests/downloading_data/test_ew_lad_bulk_download.py b/tests/downloading_data/test_ew_lad_bulk_download.py index de78b28..59c6cbb 100644 --- a/tests/downloading_data/test_ew_lad_bulk_download.py +++ b/tests/downloading_data/test_ew_lad_bulk_download.py @@ -1,11 +1,18 @@ # Unit test not running yet! 
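Stepping back from the patch for a moment: the post-processing specification amended above describes cluster means "standardised to the UK mean" and "standardised to the parent mean". Here is a minimal sketch of that calculation, assuming plain z-score standardisation with the population standard deviation (consistent with the expected values in `tests/pre_processing/test_prepare_clustering_data.py` further down this diff); the areas, labels, and values are invented:

```python
import pandas as pd

# Invented data: two variables for four areas, each assigned a supergroup.
df = pd.DataFrame(
    {
        "LAD_code": ["A", "B", "C", "D"],
        "supergroup": ["1", "1", "2", "2"],
        "v01": [10.0, 20.0, 30.0, 40.0],
        "v02": [5.0, 15.0, 25.0, 35.0],
    }
)
variables = ["v01", "v02"]

# Step 1: z-score each variable (population std, ddof=0) so that every
# variable contributes equally to the cluster means.
std = df.copy()
std[variables] = (df[variables] - df[variables].mean()) / df[variables].std(ddof=0)

# Step 3: per-cluster means of the z-scores. Because the z-scores average to
# zero over the whole of the UK, each cluster mean is already the difference
# from the UK mean in standard-deviation units.
uk_std_cluster_means = std.groupby("supergroup")[variables].mean()
print(uk_std_cluster_means)  # supergroup 1 sits below the UK mean, 2 above

# Step 4 (sketch): re-centre a child cluster's mean on its parent cluster's
# mean, rather than the UK mean, to get the parent-standardised comparison.
```

The real pipeline interposes further transformations (the arcsinh and min-max steps visible in the pre-processing tests below), so this only illustrates the standardise-then-average idea, not the exact numbers the package produces.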
import unittest -from unittest.mock import patch, Mock, MagicMock, mock_open +from unittest.mock import Mock, mock_open, patch + import pandas as pd -from area_classification.downloading_data.ew_lad_bulk_download import get_census_table_urls, download_and_unzip_data, format_and_export_metadata_table + +from area_classification.downloading_data.ew_lad_bulk_download import ( + download_and_unzip_data, + format_and_export_metadata_table, + get_census_table_urls, +) MODULE = "area_classification.downloading_data.ew_lad_bulk_download" + class TestGetCensusTableUrls(unittest.TestCase): @patch(f"{MODULE}.requests.get") def test_get_census_table_urls(self, mock_get): @@ -24,14 +31,12 @@ def test_get_census_table_urls(self, mock_get): mock_response.content = html.encode("utf-8") mock_get.return_value = mock_response - config = { - "england_and_wales_table_codes_to_remove": ["def456"] - } + config = {"england_and_wales_table_codes_to_remove": ["def456"]} urls = get_census_table_urls(config) expected_urls = [ "https://www.nomisweb.co.uk/output/census/2021/census2021-abc123.zip", - "https://www.nomisweb.co.uk/output/census/2021/census2021-ghi789.zip" + "https://www.nomisweb.co.uk/output/census/2021/census2021-ghi789.zip", ] self.assertCountEqual(urls, expected_urls) @@ -47,8 +52,16 @@ class TestDownloadAndUnzipData(unittest.TestCase): @patch(f"{MODULE}.requests.get") @patch(f"{MODULE}.tempfile.mkdtemp") def test_download_and_unzip_data( - self, mock_mkdtemp, mock_requests_get, mock_open_func, mock_zipfile, - mock_glob, mock_read_csv, mock_to_csv, mock_makedirs, mock_rmtree + self, + mock_mkdtemp, + mock_requests_get, + mock_open_func, + mock_zipfile, + mock_glob, + mock_read_csv, + mock_to_csv, + mock_makedirs, + mock_rmtree, ): # --- Test case 1: ts001, unit from metadata --- mock_mkdtemp.return_value = "/tmp/mockdir" @@ -57,7 +70,7 @@ def test_download_and_unzip_data( # CSV and metadata txt file mock_glob.side_effect = [ ["/tmp/mockdir/census2021-ts001-ltla.csv"], # CSV - ["/tmp/mockdir/metadata/meta.txt"] # Metadata txt + ["/tmp/mockdir/metadata/meta.txt"], # Metadata txt ] # Mock open for metadata file metadata_content = "Some header\nUnit of measure: Household\nOther info\n" @@ -65,13 +78,15 @@ def test_download_and_unzip_data( m_open.return_value.__iter__ = lambda self: iter(metadata_content.splitlines(True)) mock_open_func.side_effect = m_open - df = pd.DataFrame({ - "date": [20210101], - "geography": ["Area1"], - "geography code": ["E123"], - "col1": [1], - "col2": [2] - }) + df = pd.DataFrame( + { + "date": [20210101], + "geography": ["Area1"], + "geography code": ["E123"], + "col1": [1], + "col2": [2], + } + ) mock_read_csv.return_value = df zip_urls = ["https://www.nomisweb.co.uk/output/census/2021/census2021-ts001.zip"] config = {"input_directory": "/mock/input"} @@ -82,7 +97,7 @@ def test_download_and_unzip_data( # --- Test case 2: ts007a, special case unit --- mock_glob.side_effect = [ ["/tmp/mockdir/census2021-ts007a-ltla.csv"], # CSV - [] # No metadata txt + [], # No metadata txt ] zip_urls = ["https://www.nomisweb.co.uk/output/census/2021/census2021-ts007a.zip"] meta = download_and_unzip_data(zip_urls, config) @@ -96,23 +111,21 @@ def test_download_and_unzip_data( self.assertTrue(mock_to_csv.called) - - - class TestFormatAndExportMetadataTable(unittest.TestCase): @patch(f"{MODULE}.os.makedirs") @patch(f"{MODULE}.pd.DataFrame.to_csv") def test_format_and_export_metadata_table(self, mock_to_csv, mock_makedirs): - # Sample input DataFrame - meta_data_table = pd.DataFrame({ - "Full_Name": 
[ - "Tenure of household: Total: All households", - "Year of arrival in the UK: Total: All usual residents; measures: Value" - ], - "Variable_ID": ["ts0010001", "ts0020001"], - "Table_ID": ["TS001", "TS002"] - }) + meta_data_table = pd.DataFrame( + { + "Full_Name": [ + "Tenure of household: Total: All households", + "Year of arrival in the UK: Total: All usual residents; measures: Value", + ], + "Variable_ID": ["ts0010001", "ts0020001"], + "Table_ID": ["TS001", "TS002"], + } + ) config = {"input_directory": "mock_dir/input"} # Call the function @@ -137,6 +150,5 @@ def test_format_and_export_metadata_table(self, mock_to_csv, mock_makedirs): self.assertEqual(result.loc[1, "Variable_Name"], "All usual residents") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file +if __name__ == "__main__": + unittest.main() diff --git a/tests/downloading_data/test_scot_tables_reformatting.py b/tests/downloading_data/test_scot_tables_reformatting.py index af592ff..541d6c8 100644 --- a/tests/downloading_data/test_scot_tables_reformatting.py +++ b/tests/downloading_data/test_scot_tables_reformatting.py @@ -1,15 +1,19 @@ import os import unittest -from unittest.mock import patch, mock_open, call, MagicMock +from unittest.mock import MagicMock, call, mock_open, patch + import pandas as pd import pandas.testing as pdt -import numpy as np from area_classification.downloading_data.scot_tables_reformatting import ( - rename_csv_files_by_table_id, extract_pop_density_table, - extract_metadata_from_files, replace_ca19_names_with_codes, - remove_rows, reformat_migrant_indicator, reformat_pop_density, - replace_variable_names_with_codes + extract_metadata_from_files, + extract_pop_density_table, + reformat_migrant_indicator, + reformat_pop_density, + remove_rows, + rename_csv_files_by_table_id, + replace_ca19_names_with_codes, + replace_variable_names_with_codes, ) @@ -30,15 +34,9 @@ def test_rename_csv_files_by_table_id( # Check that os.rename was called for each CSV file with a Table ID expected_calls = [ - call( - os.path.join(test_folder, "file1.csv"), - os.path.join(test_folder, "UV101b.csv") - ), - call( - os.path.join(test_folder, "file2.csv"), - os.path.join(test_folder, "UV101b.csv") - ) - ] + call(os.path.join(test_folder, "file1.csv"), os.path.join(test_folder, "UV101b.csv")), + call(os.path.join(test_folder, "file2.csv"), os.path.join(test_folder, "UV101b.csv")), + ] self.assertEqual(mock_rename.call_count, 2) self.assertEqual(mock_rename.call_args_list, expected_calls) @@ -67,7 +65,7 @@ def test_extract_pop_density_table( mock_to_csv.assert_called_once() mock_remove.assert_called_once() mock_logger.info.assert_called() - + class TestExtractMetadataFromFiles(unittest.TestCase): @patch("area_classification.downloading_data.scot_tables_reformatting.logger") @@ -84,7 +82,7 @@ def test_extract_metadata_from_files( "migrant_indicator.csv", # Special case "population_density.csv", # Special case "UV607.csv", # Special case for table_name - "UV123.csv" # Normal file + "UV123.csv", # Normal file ] # Create a mapping of filenames to mock file objects @@ -100,14 +98,10 @@ def open_side_effect(file, *args, **kwargs): def csv_reader_side_effect(file_obj): if "UV607.csv" in file_obj.name: # 9 rows, row 4 has special format, row 9 has "Individuals" - return [ - [], [], [], ["Some-Text-TableName-All"], [], [], [], [], ["Individuals"] - ] + return [[], [], [], ["Some-Text-TableName-All"], [], [], [], [], ["Individuals"]] elif "UV123.csv" in file_obj.name: # 9 rows, row 4 has normal format, row 9 has 
"Households" - return [ - [], [], [], ["Some-Text-AnotherTable"], [], [], [], [], ["Households"] - ] + return [[], [], [], ["Some-Text-AnotherTable"], [], [], [], [], ["Households"]] else: return [[]] * 9 @@ -125,23 +119,36 @@ def csv_reader_side_effect(file_obj): # Check special cases self.assertIn( {"table_id": "migrant_indicator", "table_name": "Migrant Indicator", "unit": "Person"}, - metadata + metadata, ) self.assertIn( - {"table_id": "population_density", "table_name": "Population Density", "unit": "Persons per square kilometer"}, - metadata + { + "table_id": "population_density", + "table_name": "Population Density", + "unit": "Persons per square kilometer", + }, + metadata, ) # Check normal file - self.assertTrue(any(entry["table_id"] == "UV123" and entry["unit"] == "Household" for entry in metadata)) + self.assertTrue( + any(entry["table_id"] == "UV123" and entry["unit"] == "Household" for entry in metadata) + ) # Check UV607 special parsing - self.assertTrue(any(entry["table_id"] == "UV607" and "TableName" in entry["table_name"] for entry in metadata)) + self.assertTrue( + any( + entry["table_id"] == "UV607" and "TableName" in entry["table_name"] + for entry in metadata + ) + ) class TestReplaceCA19NamesWithCodes(unittest.TestCase): def test_replace_names_with_codes(self): - config = {"reformat_scot_input_folder": "./tests/data/test_scot_tables_reformatting/ReplaceCA19NamesWithCodes"} + config = { + "reformat_scot_input_folder": "./tests/data/test_scot_tables_reformatting/ReplaceCA19NamesWithCodes" + } scot_input_folder = "./tests/data/test_scot_tables_reformatting/ReplaceCA19NamesWithCodes" LAD_lookup_file_path = "./tests/data/test_scot_tables_reformatting/lookup.csv" @@ -150,29 +157,62 @@ def test_replace_names_with_codes(self): os.makedirs(scot_input_folder) # Create the lookup df needed - lookup_df = pd.DataFrame({ - "LAD22NM": ["City of Edinburgh", "Glasgow City"], - "LAD22CD": ["S12000036", "S12000049"] - }) + lookup_df = pd.DataFrame( + {"LAD22NM": ["City of Edinburgh", "Glasgow City"], "LAD22CD": ["S12000036", "S12000049"]} + ) lookup_df.to_csv(LAD_lookup_file_path, index=False, header=True) - + # Create the uv100 test input file with mock data - uv100 = pd.DataFrame({ - "Title row1": ["empty","empty","empty","empty","empty","empty","empty","empty","Council Area 2019","City of EDINburgh", "GlasGow City"], - "Title row2": ["empty","empty","empty","empty","empty","empty","empty","empty","LAD22CD","S12000036", "S12000049"], - "Title row3": ["empty","empty","empty","empty","empty","empty","empty","empty","variable1","empty", "empty"], - }) + uv100 = pd.DataFrame( + { + "Title row1": [ + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "Council Area 2019", + "City of EDINburgh", + "GlasGow City", + ], + "Title row2": [ + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "LAD22CD", + "S12000036", + "S12000049", + ], + "Title row3": [ + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "empty", + "variable1", + "empty", + "empty", + ], + } + ) # Save the uv100 test input file output_path = os.path.join(scot_input_folder, "uv100.csv") uv100.to_csv(output_path, index=False, header=True) # Call the function to test - replace_ca19_names_with_codes( - scot_input_folder, - LAD_lookup_file_path, - config - ) + replace_ca19_names_with_codes(scot_input_folder, LAD_lookup_file_path, config) # Look in the folder to check the result of the output file for filename in 
os.listdir(scot_input_folder): @@ -183,12 +223,12 @@ def test_replace_names_with_codes(self): result_df = pd.read_csv(file_path) # Create the expected dataframe - expected_df = pd.DataFrame([ - ["S12000036", "S12000036", "empty"], - ["S12000049", "S12000049", "empty"] - ], columns=["Council Area 2019", "LAD22CD", "variable1"]) + expected_df = pd.DataFrame( + [["S12000036", "S12000036", "empty"], ["S12000049", "S12000049", "empty"]], + columns=["Council Area 2019", "LAD22CD", "variable1"], + ) - # Compare the result with the expected dataframe + # Compare the result with the expected dataframe pdt.assert_frame_equal(result_df, expected_df) # Clean up - remove created files @@ -206,27 +246,28 @@ class TestRemoveRows(unittest.TestCase): @patch("pandas.read_csv") def test_remove_rows(self, mock_read_csv, mock_listdir, mock_dirname, mock_makedirs): # Mock input DataFrame: 5 rows, "Council Area 2019" in first cell, last 3 rows are extra - input_df = pd.DataFrame({ - 0: ["Table name", "Council Area 2019", "A", "B", "extra1", "extra2", "extra3"], - 1: ["all people", '', 3, 4, "extra4", "extra5", "extra6"] - }) + input_df = pd.DataFrame( + { + 0: ["Table name", "Council Area 2019", "A", "B", "extra1", "extra2", "extra3"], + 1: ["all people", "", 3, 4, "extra4", "extra5", "extra6"], + } + ) mock_read_csv.return_value = input_df.copy() captured = {} + def to_csv_side_effect(self, file_path, index, header): - captured['df'] = self.copy() + captured["df"] = self.copy() + with patch.object(pd.DataFrame, "to_csv", new=to_csv_side_effect): config = {"reformat_scot_input_folder": "dummy_folder"} remove_rows(config, "dummy_folder") # Expected output after processing - expected_df = pd.DataFrame({ - 0: ["CA19", "A", "B"], - 1: ["all people", 3, 4] - }) + expected_df = pd.DataFrame({0: ["CA19", "A", "B"], 1: ["all people", 3, 4]}) # Compare output DataFrame to expected DataFrame - pd.testing.assert_frame_equal(captured['df'].reset_index(drop=True), expected_df) + pd.testing.assert_frame_equal(captured["df"].reset_index(drop=True), expected_df) class TestReformatMigrantIndicator(unittest.TestCase): @@ -236,39 +277,44 @@ class TestReformatMigrantIndicator(unittest.TestCase): @patch("pandas.read_csv") def test_reformat_migrant_indicator(self, mock_read_csv, mock_join, mock_exists, mock_makedirs): # Mock input DataFrame (after skiprows=9, header=None) - migrant_data = pd.DataFrame([ - ["Table name", "Header1", "Header2", "Total"], - ["Council Area 2019", "Value1", "Value2", "Sum"], - ["Glasgow City", 10, 20, 30], - ["Edinburgh, City of", 5, 15, 20], - ["Total", "", "", ""] - ]) + migrant_data = pd.DataFrame( + [ + ["Table name", "Header1", "Header2", "Total"], + ["Council Area 2019", "Value1", "Value2", "Sum"], + ["Glasgow City", 10, 20, 30], + ["Edinburgh, City of", 5, 15, 20], + ["Total", "", "", ""], + ] + ) # Mock lookup DataFrame - lookup_data = pd.DataFrame({ - "LAD22NM": ["Glasgow City", "Edinburgh, City of"], - "LAD22CD": ["S12000046", "S12000036"] - }) + lookup_data = pd.DataFrame( + { + "LAD22NM": ["Glasgow City", "Edinburgh, City of"], + "LAD22CD": ["S12000046", "S12000036"], + } + ) # pandas.read_csv returns migrant_data first, then lookup_data mock_read_csv.side_effect = [migrant_data, lookup_data] # Capture output DataFrame captured = {} + def to_csv_side_effect(self, file_path, index): - captured['df'] = self.copy() - captured['file_path'] = file_path + captured["df"] = self.copy() + captured["file_path"] = file_path with patch.object(pd.DataFrame, "to_csv", new=to_csv_side_effect): config = 
{"reformat_scot_input_folder": "output_folder"} reformat_migrant_indicator("input_folder", "lookup.csv", config) # Check output DataFrame - df = captured['df'] + df = captured["df"] expected_columns = ["CA19", "Total", "Header1", "Header2"] self.assertListEqual(list(df.columns), expected_columns) self.assertIn("S12000046", df["CA19"].values) self.assertIn("S12000036", df["CA19"].values) - self.assertEqual(captured['file_path'], "output_folder/reformat_migrant_indicator.csv") + self.assertEqual(captured["file_path"], "output_folder/reformat_migrant_indicator.csv") class TestReformatMigrantIndicator(unittest.TestCase): @@ -278,40 +324,41 @@ class TestReformatMigrantIndicator(unittest.TestCase): @patch("pandas.read_csv") def test_reformat_migrant_indicator(self, mock_read_csv, mock_join, mock_exists, mock_makedirs): # Mock input DataFrame (after skiprows=9, header=None) - migrant_data = pd.DataFrame([ - ["Table name", "Header1", "Header2", "Total"], - ["Council Area 2019", "", "", ""], - ["Glasgow City", 10, 20, 30], - ["Edinburgh", 5, 15, 20], - ["Total", "", "", ""] - ]) + migrant_data = pd.DataFrame( + [ + ["Table name", "Header1", "Header2", "Total"], + ["Council Area 2019", "", "", ""], + ["Glasgow City", 10, 20, 30], + ["Edinburgh", 5, 15, 20], + ["Total", "", "", ""], + ] + ) # Mock lookup DataFrame - lookup_data = pd.DataFrame({ - "LAD22NM": ["Glasgow City", "Edinburgh"], - "LAD22CD": ["S12000046", "S12000036"] - }) + lookup_data = pd.DataFrame( + {"LAD22NM": ["Glasgow City", "Edinburgh"], "LAD22CD": ["S12000046", "S12000036"]} + ) # pandas.read_csv returns migrant_data first, then lookup_data mock_read_csv.side_effect = [migrant_data, lookup_data] # Capture output DataFrame captured = {} + def to_csv_side_effect(self, file_path, index): - captured['df'] = self.copy() - captured['file_path'] = file_path + captured["df"] = self.copy() + captured["file_path"] = file_path with patch.object(pd.DataFrame, "to_csv", new=to_csv_side_effect): config = {"reformat_scot_input_folder": "output_folder"} reformat_migrant_indicator("input_folder", "lookup.csv", config) # Check output DataFrame - df = captured['df'] + df = captured["df"] expected_columns = ["CA19", "Total", "Header1", "Header2"] self.assertListEqual(list(df.columns), expected_columns) self.assertIn("S12000046", df["CA19"].values) self.assertIn("S12000036", df["CA19"].values) - self.assertEqual(captured['file_path'], "output_folder/reformat_migrant_indicator.csv") - + self.assertEqual(captured["file_path"], "output_folder/reformat_migrant_indicator.csv") class TestReformatPopDensity(unittest.TestCase): @@ -320,23 +367,26 @@ class TestReformatPopDensity(unittest.TestCase): @patch("os.path.exists", return_value=True) @patch("os.path.join", side_effect=lambda *args: "/".join(args)) @patch("pandas.read_csv") - def test_reformat_pop_density(self, mock_read_csv, mock_join, mock_exists, mock_dirname, mock_makedirs): - - + def test_reformat_pop_density( + self, mock_read_csv, mock_join, mock_exists, mock_dirname, mock_makedirs + ): # Mock input DataFrame - df_input = pd.DataFrame({ - "Area Name": ["Area name1", "Area name1"], - "Area code": ["S92000003", "S12000046"], # expect S92000003 row to be removed - "Area type": ["Country", "Council Area"], - "Population density": [100, 200] - }) + df_input = pd.DataFrame( + { + "Area Name": ["Area name1", "Area name1"], + "Area code": ["S92000003", "S12000046"], # expect S92000003 row to be removed + "Area type": ["Country", "Council Area"], + "Population density": [100, 200], + } + ) 
mock_read_csv.return_value = df_input.copy() # Capture output DataFrame captured = {} + def to_csv_side_effect(self, file_path, index): - captured['df'] = self.copy() - captured['file_path'] = file_path + captured["df"] = self.copy() + captured["file_path"] = file_path with patch.object(pd.DataFrame, "to_csv", new=to_csv_side_effect): config = {"reformat_scot_input_folder": "output_folder"} @@ -345,16 +395,13 @@ def to_csv_side_effect(self, file_path, index): # Check output DataFrame columns expected_columns = [ "CA19", - "Population density (number of usual residents per square kilometre)" + "Population density (number of usual residents per square kilometre)", ] - self.assertListEqual(list(captured['df'].columns), expected_columns) + self.assertListEqual(list(captured["df"].columns), expected_columns) # Check 'S92000003' row is removed - self.assertNotIn("S92000003", captured['df']["CA19"].values) + self.assertNotIn("S92000003", captured["df"]["CA19"].values) # Check output file path - self.assertEqual( - captured['file_path'], - "output_folder/reformat_population_density.csv" - ) + self.assertEqual(captured["file_path"], "output_folder/reformat_population_density.csv") class TestReplaceVariableNamesWithCodes(unittest.TestCase): @@ -362,29 +409,34 @@ class TestReplaceVariableNamesWithCodes(unittest.TestCase): @patch("os.path.dirname", return_value="dummy_dir") @patch("os.listdir", return_value=["reformat_uv123.csv"]) @patch("pandas.read_csv") - def test_replace_variable_names_with_codes_uv123(self, mock_read_csv, mock_listdir, mock_dirname, mock_makedirs): + def test_replace_variable_names_with_codes_uv123( + self, mock_read_csv, mock_listdir, mock_dirname, mock_makedirs + ): # Step 1: Mock input DataFrame - input_df = pd.DataFrame({ - "CA19": ["A", "B"], - "var1": [1, 2], - "var2": [3, 4], - "var3": [5, 6] # expecting this to be dropped from output - }) + input_df = pd.DataFrame( + { + "CA19": ["A", "B"], + "var1": [1, 2], + "var2": [3, 4], + "var3": [5, 6], # expecting this to be dropped from output + } + ) mock_read_csv.return_value = input_df.copy() # Step 2: Patch to_csv to capture output DataFrame captured = {} + def to_csv_side_effect(self, file_path, index, header): - - captured['df'] = self.copy() + captured["df"] = self.copy() + with patch.object(pd.DataFrame, "to_csv", new=to_csv_side_effect): config = {"reformat_scot_input_folder": "dummy_folder"} result = replace_variable_names_with_codes(config) - + # Step 3: Assert output DataFrame columns # last column has been dropped from output expected_columns = ["CA19", "uv1230001", "uv1230002"] - self.assertListEqual(list(captured['df'].columns), expected_columns) + self.assertListEqual(list(captured["df"].columns), expected_columns) # Step 4: Assert returned variable names and ids # varialble_names and variable_ids get created before the last column is dropped @@ -395,4 +447,4 @@ def to_csv_side_effect(self, file_path, index, header): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/integration_tests/test_integration_cluster_summaries.py b/tests/integration_tests/test_integration_cluster_summaries.py index 2b7f12a..0b9ba3a 100644 --- a/tests/integration_tests/test_integration_cluster_summaries.py +++ b/tests/integration_tests/test_integration_cluster_summaries.py @@ -1,93 +1,123 @@ -#pip install xlwt -import xlwt +# pip install xlwt +import io +import os import shutil import unittest -import pandas as pd -import os from unittest.mock import patch -import io + +import 
pandas as pd + from area_classification.post_processing.cluster_summaries import cluster_summaries_wrapper + class TestClusterSummariesWrapperIntegration(unittest.TestCase): def setUp(self): # Create a mock configuration self.config = { - 'input_directory': './tests/data/summaries/', - 'output_directory': './tests/data/summaries/', + "input_directory": "./tests/data/summaries/", + "output_directory": "./tests/data/summaries/", + "qa_directory": "./tests/data/summaries/", } # Create mock data for restructured_cluster_table_long - self.restructured_cluster_table_long = pd.DataFrame({ - 'LAD_name': ['Hartlepool', 'Middlesbrough', 'City of Edinburgh', 'Glasgow City'], - 'LAD_code': ['E06000001' , 'E06000002', 'S12000036', 'S12000049'], - 'supergroup': ['1', '1', '2', '2'], - 'group': ['1a', '1a', '2b', '2b'], - 'subgroup': ['1a1', '1a2', '2b1', '2b2'], - 'v01': [0.5, 0.6, 0.7, 0.8], - 'v02': [0.1, 0.7, 0.2, 0.3], - 'v12': [0.5, 0.3, 0.9, 0.8] - }) - self.restructured_cluster_table_long.to_csv('test_restructured_cluster_table_long.csv', index=False) + self.restructured_cluster_table_long = pd.DataFrame( + { + "LAD_name": ["Hartlepool", "Middlesbrough", "City of Edinburgh", "Glasgow City"], + "LAD_code": ["E06000001", "E06000002", "S12000036", "S12000049"], + "supergroup": ["1", "1", "2", "2"], + "group": ["1a", "1a", "2b", "2b"], + "subgroup": ["1a1", "1a2", "2b1", "2b2"], + "v01": [0.5, 0.6, 0.7, 0.8], + "v02": [0.1, 0.7, 0.2, 0.3], + "v12": [0.5, 0.3, 0.9, 0.8], + } + ) + self.restructured_cluster_table_long.to_csv( + "test_restructured_cluster_table_long.csv", index=False + ) # Create mock uk_std_cluster_means DataFrame # This is used for average variance as looking at vairance of the clustering - self.uk_std_cluster_means = pd.DataFrame({ - 'cluster': [1, '1a', '1a1', '1a1', '1a2', '2', '2b', '2b1', '2b2'], - 'hierarchy_level': ['supergroup', 'group', 'subgroup', 'subgroup', 'subgroup', 'supergroup', 'group', 'subgroup', 'subgroup'], - 'v01': [0.35, 0.2, 0.525, 0.45, 0.60, 0.55, 0.40, 0.625, 0.70], - 'v02': [0.75, 0.15, -0.825, 0.90, 0.10, 0.95, -0.20, 1.025, 1.10], - 'v12': [0.16, 0.08, 0.20, -0.24, 0.32, 0.40, -0.12, 0.48, 0.56], - }) - self.uk_std_cluster_means.to_csv('uk_std_cluster_means.csv', index=False) + self.uk_std_cluster_means = pd.DataFrame( + { + "cluster": [1, "1a", "1a1", "1a1", "1a2", "2", "2b", "2b1", "2b2"], + "hierarchy_level": [ + "supergroup", + "group", + "subgroup", + "subgroup", + "subgroup", + "supergroup", + "group", + "subgroup", + "subgroup", + ], + "v01": [0.35, 0.2, 0.525, 0.45, 0.60, 0.55, 0.40, 0.625, 0.70], + "v02": [0.75, 0.15, -0.825, 0.90, 0.10, 0.95, -0.20, 1.025, 1.10], + "v12": [0.16, 0.08, 0.20, -0.24, 0.32, 0.40, -0.12, 0.48, 0.56], + } + ) + self.uk_std_cluster_means.to_csv("uk_std_cluster_means.csv", index=False) # Create a mock lookup file - os.makedirs('./tests/data/summaries/', exist_ok=True) - self.lookup_file = './tests/data/summaries/lookup_file.csv' + os.makedirs("./tests/data/summaries/", exist_ok=True) + self.lookup_file = "./tests/data/summaries/lookup_file.csv" + + pd.DataFrame( + { + "variable_name": [ + "Lives in a communal establishment", + "Never married and never registered a civil partnership", + "Usual residents per square kilometre", + ], + "variable_code": ["ts0010003", "ts0020002", "ts0060001"], + "table_ID": ["TS001", "TS002", "TS006"], + "table_name": ["Residency type", "Legal partnership status", "Population density"], + "country": ["ew", "ew", "ew"], + "new_code": ["v01", "v02", "v12"], + "domain": [ + "Demography and 
Migration", + "Demography and Migration", + "Demography and Migration", + ], + } + ).to_csv(self.lookup_file, index=False) - pd.DataFrame({ - 'variable_name': ['Lives in a communal establishment', 'Never married and never registered a civil partnership', 'Usual residents per square kilometre'], - 'variable_code': ['ts0010003', 'ts0020002', 'ts0060001'], - 'table_ID': ['TS001', 'TS002', 'TS006'], - 'table_name': ['Residency type', 'Legal partnership status', 'Population density' ], - 'country': ['ew', 'ew', 'ew'], - 'new_code': ['v01', 'v02', 'v12'], - 'domain': ['Demography and Migration', 'Demography and Migration', 'Demography and Migration'] - }).to_csv(self.lookup_file, index=False) - def test_cluster_summaries_wrapper(self): # Expected output strings # When checking variance, remember sample var used and the value after higher / lower is related to the UK_means table expected_output = ( "Cluster 1\n" "Cluster 1 contains 2 local authorities which is 50.00% of UK local authorities. The average variance for cluster 1 is 0.068. Example areas: Middlesbrough, Hartlepool\n" - "Values in the brackets below are the difference between the mean of the variable for this cluster\n" - " compared with the mean of the other clusters combined. The population of cluster 1 has a:\n" + "Values in the brackets below are the difference between the mean of the \n" + " variable for this cluster compared with the mean of the other clusters combined. \n" + " The population of cluster 1 has a:\n" "• lower (-0.240) Usual residents per square kilometre. Variance:0.020 (Demography and Migration domain)\n" "• lower (-0.200) proportion of people who live in a communal establishment. Variance:0.005 (Demography and Migration domain)\n" - "• lower (-0.200) proportion of people who are Never married and never registered a civil partnership. Variance:0.180 (Demography and Migration domain)\n" + "• lower (-0.200) proportion of people who are Never married and never registered a civil partnership. Variance:0.180 (Demography and Migration domain)\n" "----------------------------------------\n" "Cluster 2\n" "Cluster 2 contains 2 local authorities which is 50.00% of UK local authorities. The average variance for cluster 2 is 0.005. Example areas: Glasgow City, City of Edinburgh\n" - "Values in the brackets below are the difference between the mean of the variable for this cluster\n" - " compared with the mean of the other clusters combined. The population of cluster 2 has a:\n" + "Values in the brackets below are the difference between the mean of the \n" + " variable for this cluster compared with the mean of the other clusters combined. \n" + " The population of cluster 2 has a:\n" "• higher (0.240) Usual residents per square kilometre. Variance:0.005 (Demography and Migration domain)\n" "• higher (0.200) proportion of people who live in a communal establishment. Variance:0.005 (Demography and Migration domain)\n" "• higher (0.200) proportion of people who are Never married and never registered a civil partnership. 
Variance:0.005 (Demography and Migration domain)\n" - "----------------------------------------\n" - ) + ) for col, dtype in self.restructured_cluster_table_long.dtypes.items(): - print(f"{col}: {dtype}") + print(f"{col}: {dtype}") - with patch('sys.stdout', new=io.StringIO()) as fake_out: - # print("Running cluster_summaries_wrapper...") + with patch("sys.stdout", new=io.StringIO()) as fake_out: + # print("Running cluster_summaries_wrapper...") cluster_summaries_wrapper( - config=self.config, + config=self.config, restructured_cluster_table_long=self.restructured_cluster_table_long, uk_std_cluster_means=self.uk_std_cluster_means, lookup_file=self.lookup_file, - cluster_column='supergroup' + cluster_column="supergroup", ) # print("Checking outputs...") print("expected output:", expected_output) @@ -96,15 +126,16 @@ def test_cluster_summaries_wrapper(self): print("Cleaning up test files...") # Clean up - remove created files and folders - for filename in os.listdir(self.config['input_directory']): - file_path = os.path.join(self.config['input_directory'], filename) + for filename in os.listdir(self.config["input_directory"]): + file_path = os.path.join(self.config["input_directory"], filename) if os.path.isfile(file_path): os.remove(file_path) elif os.path.isdir(file_path): shutil.rmtree(file_path) - shutil.rmtree(self.config['input_directory']) + shutil.rmtree(self.config["input_directory"]) print("Integration test completed.") -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() diff --git a/tests/integration_tests/test_integration_prepare_clustering_data.py b/tests/integration_tests/test_integration_prepare_clustering_data.py index 3c6ef3d..0d9a9f0 100644 --- a/tests/integration_tests/test_integration_prepare_clustering_data.py +++ b/tests/integration_tests/test_integration_prepare_clustering_data.py @@ -1,13 +1,15 @@ import unittest + import pandas as pd -import numpy as np + from area_classification.pre_processing.prepare_clustering_data import prepare_clustering_data + class TestPrepareClusteringDataIntegration(unittest.TestCase): def test_prepare_clustering_data_pipeline(self): # Arrange: Create a sample DataFrame data = { - 'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'], + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], "v01": [10, 20, 30, 15], "v02": [5, 15, 25, 12], } @@ -30,5 +32,6 @@ def test_prepare_clustering_data_pipeline(self): # 4. 
Ensure no NaN values exist in the result self.assertFalse(result.isnull().values.any()) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/post_processing/test_cluster_summaries.py b/tests/post_processing/test_cluster_summaries.py index 52dbbe8..334bd87 100644 --- a/tests/post_processing/test_cluster_summaries.py +++ b/tests/post_processing/test_cluster_summaries.py @@ -1,83 +1,125 @@ - import unittest + import pandas as pd -from area_classification.post_processing.cluster_summaries import calculate_cluster_variance, cluster_summary + +from area_classification.post_processing.cluster_summaries import ( + calculate_cluster_variance, + cluster_summary, +) + class TestCalculateClusterVariance(unittest.TestCase): def setUp(self): # Sample input DataFrame - self.input_df = pd.DataFrame({ - 'LAD_code': ['S12000001', 'S12000002', 'S12000003','S12000004', 'S12000005', 'S12000006' ], - 'supergroup': [1, 3, 2, 2, 3, 1], - 'group': ['1b', '3a', '2a', '2b', '3a', '1b'], - 'subgroup': ['1b1', '3a2', '2a1', '2b1', '3a1', '1b2'], - 'v01': [0.50, 0.30, 0.20, 0.20, 0.75, 0.7], - 'v02': [0.60, 0.90, 0.10, 0.20, 0.75, 0.9] - }) + self.input_df = pd.DataFrame( + { + "LAD_code": [ + "S12000001", + "S12000002", + "S12000003", + "S12000004", + "S12000005", + "S12000006", + ], + "supergroup": [1, 3, 2, 2, 3, 1], + "group": ["1b", "3a", "2a", "2b", "3a", "1b"], + "subgroup": ["1b1", "3a2", "2a1", "2b1", "3a1", "1b2"], + "v01": [0.50, 0.30, 0.20, 0.20, 0.75, 0.7], + "v02": [0.60, 0.90, 0.10, 0.20, 0.75, 0.9], + } + ) # Expected output DataFrame after aggregation - self.expected_df = pd.DataFrame({ - 'supergroup': [1, 2, 3], - 'v01': [0.02, 0, 0.10125], - 'v02': [0.045, 0.005, 0.01125], - 'cluster_average_variance': [0.0325, 0.0025, 0.05625 ] - }).set_index('supergroup') # Set 'supergroup' as the index + self.expected_df = pd.DataFrame( + { + "supergroup": [1, 2, 3], + "v01": [0.02, 0, 0.10125], + "v02": [0.045, 0.005, 0.01125], + "cluster_average_variance": [0.0325, 0.0025, 0.05625], + } + ).set_index("supergroup") # Set 'supergroup' as the index def test_calculate_cluster_variance(self): - - result_df = calculate_cluster_variance(self.input_df, cluster_column = 'supergroup') + result_df = calculate_cluster_variance(self.input_df, cluster_column="supergroup") # Assert that the result matches the expected output pd.testing.assert_frame_equal(result_df, self.expected_df) - class TestClusterSummary(unittest.TestCase): def setUp(self): + # Create a mock configuration + self.config = { + "qa_directory": "./tests/data/summaries/" + } + # Sample input DataFrame - self.input_df = pd.DataFrame({ - 'LAD_code': ['E06000001' , 'E06000002', 'E06000003', 'E06000004', 'E06000005', 'E06000047' ], - 'LAD_name': ['Hartlepool', 'Middlesbrough', 'Redcar and Cleveland','Stockton-on-Tees', 'Darlington', 'County Durham'], - 'supergroup': ['1','3', '2', '2', '3', '1'], - 'group': ['1b', '3a', '2a', '2b', '3a', '1b'], - 'subgroup': ['1b1', '3a2', '2a1', '2b1', '3a1', '1b2'], - 'v01': [0.50, 0.30, 0.20, 0.20, 0.75, 0.7], - 'v02': [0.60, 0.90, 0.10, 0.20, 0.75, 0.9], - 'v12': [0.12, 0.34, 0.06, 0.11, 0.06, 0.20] - }) + self.input_df = pd.DataFrame( + { + "LAD_code": [ + "E06000001", + "E06000002", + "E06000003", + "E06000004", + "E06000005", + "E06000047", + ], + "LAD_name": [ + "Hartlepool", + "Middlesbrough", + "Redcar and Cleveland", + "Stockton-on-Tees", + "Darlington", + "County Durham", + ], + "supergroup": ["1", "3", "2", "2", "3", "1"], + "group": ["1b", "3a", "2a", "2b", 
"3a", "1b"], + "subgroup": ["1b1", "3a2", "2a1", "2b1", "3a1", "1b2"], + "v01": [0.50, 0.30, 0.20, 0.20, 0.75, 0.7], + "v02": [0.60, 0.90, 0.10, 0.20, 0.75, 0.9], + "v12": [0.12, 0.34, 0.06, 0.11, 0.06, 0.20], + } + ) # Mock uk_std_cluster_means DataFrame - self.uk_std_cluster_means_df = pd.DataFrame({ - 'cluster': [1, 2, 3], - 'hierarchy_level': ['supergroup', 'supergroup', 'supergroup'], - 'v01': [0.35, 0.2, 0.525], - 'v02': [0.75, 0.15, 0.825], - 'v12': [0.16, 0.08, 0.20], - }) + self.uk_std_cluster_means_df = pd.DataFrame( + { + "cluster": [1, 2, 3], + "hierarchy_level": ["supergroup", "supergroup", "supergroup"], + "v01": [0.35, 0.2, 0.525], + "v02": [0.75, 0.15, 0.825], + "v12": [0.16, 0.08, 0.20], + } + ) # Mock variance DataFrame - self.variance_df = pd.DataFrame({ - 'supergroup': ['1', '2', '3'], - 'v01': [0.02, 0, 0.10125], - 'v02': [0.045, 0.005, 0.01125], - 'cluster_average_variance': [0.033, 0.003, 0.056 ] - }).set_index('supergroup') # Set 'supergroup' as the index - + self.variance_df = pd.DataFrame( + { + "supergroup": ["1", "2", "3"], + "v01": [0.02, 0, 0.10125], + "v02": [0.045, 0.005, 0.01125], + "cluster_average_variance": [0.033, 0.003, 0.056], + } + ).set_index("supergroup") # Set 'supergroup' as the index # Expected output strings self.expected_output = [ - 'Cluster 1 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 1 is 0.033. Example areas: County Durham, Hartlepool', - 'Cluster 2 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 2 is 0.003. Example areas: Stockton-on-Tees, Redcar and Cleveland', - 'Cluster 3 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 3 is 0.056. Example areas: Darlington, Middlesbrough' + "Cluster 1 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 1 is 0.033. Example areas: County Durham, Hartlepool", + "Cluster 2 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 2 is 0.003. Example areas: Stockton-on-Tees, Redcar and Cleveland", + "Cluster 3 contains 2 local authorities which is 33.33% of UK local authorities. The average variance for cluster 3 is 0.056. 
Example areas: Darlington, Middlesbrough", ] def test_cluster_summary(self): self.maxDiff = None # Show full diff for debugging - result_output = cluster_summary(self.input_df, self.uk_std_cluster_means_df, self.variance_df, cluster_column='supergroup') + result_output = cluster_summary( + self.config, + self.input_df, + self.uk_std_cluster_means_df, + self.variance_df, + cluster_column="supergroup", + ) # Compare the lists self.assertListEqual(result_output, self.expected_output) + if __name__ == "__main__": unittest.main() - - - \ No newline at end of file diff --git a/tests/post_processing/test_cluster_table_restructure.py b/tests/post_processing/test_cluster_table_restructure.py index 6da22af..8aef3d9 100644 --- a/tests/post_processing/test_cluster_table_restructure.py +++ b/tests/post_processing/test_cluster_table_restructure.py @@ -1,61 +1,88 @@ +import os import unittest + import pandas as pd -import os -from unittest.mock import patch + from area_classification.post_processing.cluster_table_restructure import cluster_table_restructure -config = { - "LAD_lookup_file_path": "./tests/data/LAD_lookup.csv", - "output_directory": "./tests/data/" - } +config = {"LAD_lookup_file_path": "./tests/data/LAD_lookup.csv", "output_directory": "./tests/data/"} + class TestClusterTableRestructure(unittest.TestCase): def setUp(self): # No actual directories or files will be created - if not os.path.exists(config["output_directory"]+"cluster_assignments/"): - os.makedirs(config["output_directory"]+ "cluster_assignments/") + if not os.path.exists(config["output_directory"] + "cluster_assignments/"): + os.makedirs(config["output_directory"] + "cluster_assignments/") - lookup_df = pd.DataFrame({ - 'LAD22NM': ['Hartlepool', 'Isle of Anglesey', 'Antrim and Newtownabbey','Clackmannanshire'], - 'LAD22CD': ['E06000001', 'W06000001', 'N09000001','S12000005'], - }) + lookup_df = pd.DataFrame( + { + "LAD22NM": [ + "Hartlepool", + "Isle of Anglesey", + "Antrim and Newtownabbey", + "Clackmannanshire", + ], + "LAD22CD": ["E06000001", "W06000001", "N09000001", "S12000005"], + } + ) lookup_df.to_csv(config["LAD_lookup_file_path"], index=True, header=True) - self.input_df = pd.DataFrame({ - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'subsubcluster': ['1ab', '2bc', '3cb', '6ab'], - }) + self.input_df = pd.DataFrame( + { + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "subsubcluster": ["1ab", "2bc", "3cb", "6ab"], + } + ) - self.standardised_data = pd.DataFrame({ - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'v01': [0.35, 0.2, 0.525, 0.45], - 'v02': [0.75, 0.15, 0.825, 0.60], - 'v12': [0.16, 0.08, 0.20, 0.12], - }) + self.standardised_data = pd.DataFrame( + { + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "v01": [0.35, 0.2, 0.525, 0.45], + "v02": [0.75, 0.15, 0.825, 0.60], + "v12": [0.16, 0.08, 0.20, 0.12], + } + ) - self.expected_df = pd.DataFrame({ - 'LAD_name': ['Hartlepool', 'Isle of Anglesey', 'Antrim and Newtownabbey','Clackmannanshire'], - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'supergroup': ['1', '2', '3', '6'], - 'group': ['1a', '2b', '3c', '6a'], - 'subgroup': ['1a2', '2b3', '3c2', '6a2'], - }) + self.expected_df = pd.DataFrame( + { + "LAD_name": [ + "Hartlepool", + "Isle of Anglesey", + "Antrim and Newtownabbey", + "Clackmannanshire", + ], + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "supergroup": ["1", "2", "3", "6"], + "group": ["1a", "2b", "3c", "6a"], + 
"subgroup": ["1a2", "2b3", "3c2", "6a2"], + } + ) - self.expected_df_long = pd.DataFrame({ - 'LAD_name': ['Hartlepool', 'Isle of Anglesey', 'Antrim and Newtownabbey','Clackmannanshire'], - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'supergroup': ['1', '2', '3', '6'], - 'group': ['1a', '2b', '3c', '6a'], - 'subgroup': ['1a2', '2b3', '3c2', '6a2'], - 'v01': [0.35, 0.2, 0.525, 0.45], - 'v02': [0.75, 0.15, 0.825, 0.60], - 'v12': [0.16, 0.08, 0.20, 0.12], - }) + self.expected_df_long = pd.DataFrame( + { + "LAD_name": [ + "Hartlepool", + "Isle of Anglesey", + "Antrim and Newtownabbey", + "Clackmannanshire", + ], + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "supergroup": ["1", "2", "3", "6"], + "group": ["1a", "2b", "3c", "6a"], + "subgroup": ["1a2", "2b3", "3c2", "6a2"], + "v01": [0.35, 0.2, 0.525, 0.45], + "v02": [0.75, 0.15, 0.825, 0.60], + "v12": [0.16, 0.08, 0.20, 0.12], + } + ) def test_restructure_table(self): result_df, result_df_long = cluster_table_restructure( - config, self.input_df, split_column='subsubcluster', - keep_column='LAD_code', standardised_data=self.standardised_data + config, + self.input_df, + split_column="subsubcluster", + keep_column="LAD_code", + standardised_data=self.standardised_data, ) pd.testing.assert_frame_equal(result_df, self.expected_df) pd.testing.assert_frame_equal(result_df_long, self.expected_df_long) @@ -66,5 +93,6 @@ def tearDown(self): os.remove(file_path) print(f"Test ran then deleted: {file_path}") -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() diff --git a/tests/post_processing/test_cluster_variables_mean.py b/tests/post_processing/test_cluster_variables_mean.py index 7d5f6f6..5dffc74 100644 --- a/tests/post_processing/test_cluster_variables_mean.py +++ b/tests/post_processing/test_cluster_variables_mean.py @@ -1,8 +1,9 @@ - -import unittest -import pandas as pd import os import shutil +import unittest + +import pandas as pd + from area_classification.post_processing.cluster_variables_mean import cluster_variable_means @@ -16,30 +17,35 @@ def setUp(self): self.config = {"output_directory": self.test_output_dir} # Mock restructured_cluster_table DataFrame - self.restructured_cluster_table = pd.DataFrame({ - "LAD_name": ["Hartlepool", "Middlesbrough","Redcar and Cleveland"], - "LAD_code": ["E06000001", "E06000002","E06000003"], - "supergroup": [1, 2, 1], - "group": ["1a", "2b", "1b"], - "subgroup": ["1a1", "2b1", "1b1"] - }) + self.restructured_cluster_table = pd.DataFrame( + { + "LAD_name": ["Hartlepool", "Middlesbrough", "Redcar and Cleveland"], + "LAD_code": ["E06000001", "E06000002", "E06000003"], + "supergroup": [1, 2, 1], + "group": ["1a", "2b", "1b"], + "subgroup": ["1a1", "2b1", "1b1"], + } + ) # Mock pre_clustering_data_std_mean DataFrame - self.pre_clustering_data_std_mean = pd.DataFrame({ - "LAD_code": ["E06000001", "E06000002","E06000003"], - "V01": [-1, 0.5, 0.1], - "V02": [3, 0.7, -0.2], - "V03": [0.1, -0.2, 0.3] - }) + self.pre_clustering_data_std_mean = pd.DataFrame( + { + "LAD_code": ["E06000001", "E06000002", "E06000003"], + "V01": [-1, 0.5, 0.1], + "V02": [3, 0.7, -0.2], + "V03": [0.1, -0.2, 0.3], + } + ) def tearDown(self): # Remove the temporary output directory after the test shutil.rmtree(self.test_output_dir) - def test_cluster_variable_means(self): # Run the function - result = cluster_variable_means(self.config, self.restructured_cluster_table, self.pre_clustering_data_std_mean) + result = 
cluster_variable_means( + self.config, self.restructured_cluster_table, self.pre_clustering_data_std_mean + ) # Check the output DataFrame structure self.assertIsInstance(result, pd.DataFrame) @@ -50,11 +56,7 @@ def test_cluster_variable_means(self): self.assertIn("V03", result.columns) # Define expected means for the '1 supergroup' row - expected_means = { - "V01": -0.45, - "V02": 1.4, - "V03": 0.2 - } + expected_means = {"V01": -0.45, "V02": 1.4, "V03": 0.2} # Filter the DataFrame for the '1 supergroup' row supergroup_row = result.query("cluster == 1 and hierarchy_level == 'supergroup'") @@ -64,16 +66,23 @@ def test_cluster_variable_means(self): # Compare the actual values with the expected values for column, expected_value in expected_means.items(): - self.assertAlmostEqual(supergroup_row.iloc[0][column], expected_value, places=6, msg=f"Mismatch in {column} for '1 supergroup'") - + self.assertAlmostEqual( + supergroup_row.iloc[0][column], + expected_value, + places=6, + msg=f"Mismatch in {column} for '1 supergroup'", + ) # Check if the output file is created - output_file_path = os.path.join(self.test_output_dir, "std_means", "uk_std_means", "uk_std_cluster_means_output.csv") + output_file_path = os.path.join( + self.test_output_dir, "std_means", "uk_std_means", "uk_std_cluster_means_output.csv" + ) self.assertTrue(os.path.exists(output_file_path)) # Check if the output file contains the expected data output_data = pd.read_csv(output_file_path) self.assertFalse(output_data.empty) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/pre_processing/test_aggregating_vairables.py b/tests/pre_processing/test_aggregating_vairables.py index 99ecaf1..9b47e8d 100644 --- a/tests/pre_processing/test_aggregating_vairables.py +++ b/tests/pre_processing/test_aggregating_vairables.py @@ -1,36 +1,44 @@ import unittest + import pandas as pd -from area_classification.pre_processing.aggregating_variables import aggregating_variables + +from area_classification.pre_processing.aggregating_variables import aggregating_variables from area_classification.utilities.load_config import load_config + class TestAggregatingVariables(unittest.TestCase): def setUp(self): # Sample input DataFrame - self.input_df = pd.DataFrame({ - 'LAD_code': ['S12000001', 'S12000002', 'S12000003','S12000004'], - 'UV1040003': [50, 30, 20, 100], - 'UV1040004': [100, 60, 40, 100], - 'UV1040005': [25, 15, 10, 50], - 'UV1040006': [50, 30, 20, 200] - }) + self.input_df = pd.DataFrame( + { + "LAD_code": ["S12000001", "S12000002", "S12000003", "S12000004"], + "UV1040003": [50, 30, 20, 100], + "UV1040004": [100, 60, 40, 100], + "UV1040005": [25, 15, 10, 50], + "UV1040006": [50, 30, 20, 200], + } + ) # Expected output DataFrame after aggregation - self.expected_df = pd.DataFrame({ - 'LAD_code': ['S12000001', 'S12000002', 'S12000003', 'S12000004'], - 'UV1040003': [50, 30, 20, 100], - 'UV1040004': [100, 60, 40, 100], - 'UV1040005': [25, 15, 10, 50], - 'UV1040006': [50, 30, 20, 200], - 'separated_divorced': [125, 75, 50, 150] # Added UV1040004 + UV1040005 - }) + self.expected_df = pd.DataFrame( + { + "LAD_code": ["S12000001", "S12000002", "S12000003", "S12000004"], + "UV1040003": [50, 30, 20, 100], + "UV1040004": [100, 60, 40, 100], + "UV1040005": [25, 15, 10, 50], + "UV1040006": [50, 30, 20, 200], + "separated_divorced": [125, 75, 50, 150], # Added UV1040004 + UV1040005 + } + ) def test_aggregating_variables(self): - config = load_config('area_classification/config.yaml') - 
aggregation_config = load_config('area_classification/aggregation_setup.yaml') - aggregation_configs = aggregation_config['scot_file_configs'] - result_df = aggregating_variables(self.input_df, aggregation_configs, config ) + config = load_config("area_classification/config.yaml") + aggregation_config = load_config("area_classification/aggregation_setup.yaml") + aggregation_configs = aggregation_config["scot_file_configs"] + result_df = aggregating_variables(self.input_df, aggregation_configs, config) # Assert that the result matches the expected output pd.testing.assert_frame_equal(result_df, self.expected_df) -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() diff --git a/tests/pre_processing/test_convert_to_percentages.py b/tests/pre_processing/test_convert_to_percentages.py index bbd3943..94d8bf6 100644 --- a/tests/pre_processing/test_convert_to_percentages.py +++ b/tests/pre_processing/test_convert_to_percentages.py @@ -1,24 +1,31 @@ import unittest + import pandas as pd -from area_classification.pre_processing.convert_to_percentages import convert_to_percentages + +from area_classification.pre_processing.convert_to_percentages import convert_to_percentages + class TestConvertToPercentages(unittest.TestCase): def setUp(self): # Sample input DataFrame - self.input_df = pd.DataFrame({ - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'v01': [50, 30, 20, 100], - 'v01_total': [100, 60, 40, 1000], - 'v02': [25, 15, 10, 50], - 'v02_total': [50, 30, 20, 200] - }) + self.input_df = pd.DataFrame( + { + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "v01": [50, 30, 20, 100], + "v01_total": [100, 60, 40, 1000], + "v02": [25, 15, 10, 50], + "v02_total": [50, 30, 20, 200], + } + ) # Expected output DataFrame after conversion - self.expected_df = pd.DataFrame({ - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'v01': [50.0, 50.0, 50.0, 10.0], # Percentages of v01 / v01_total - 'v02': [50.0, 50.0, 50.0, 25.0] # Percentages of v02 / v02_total - }) + self.expected_df = pd.DataFrame( + { + "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"], + "v01": [50.0, 50.0, 50.0, 10.0], # Percentages of v01 / v01_total + "v02": [50.0, 50.0, 50.0, 25.0], # Percentages of v02 / v02_total + } + ) def test_convert_to_percentages(self): # Call the function to test @@ -26,5 +33,6 @@ def test_convert_to_percentages(self): # Assert that the result matches the expected output pd.testing.assert_frame_equal(result_df, self.expected_df) -if __name__ == '__main__': - unittest.main() \ No newline at end of file + +if __name__ == "__main__": + unittest.main() diff --git a/tests/pre_processing/test_drop_variables.py b/tests/pre_processing/test_drop_variables.py index 24c1e31..a68ea86 100644 --- a/tests/pre_processing/test_drop_variables.py +++ b/tests/pre_processing/test_drop_variables.py @@ -1,37 +1,41 @@ import unittest + import pandas as pd -from area_classification.pre_processing.drop_variables import drop_variables_pre_clustering + +from area_classification.pre_processing.drop_variables import drop_variables_pre_clustering from area_classification.utilities.load_config import load_config class TestDropVariables(unittest.TestCase): def setUp(self): # Sample input DataFrame - self.input_df = pd.DataFrame({ - 'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'], - 'v01': [50, 30, 20, 100], - 'v02': [100, 60, 40, 100], - 'v03': [25, 15, 10, 50], - 'v04': [50, 30, 20, 200] - }) 
+        self.input_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [50, 30, 20, 100],
+                "v02": [100, 60, 40, 100],
+                "v03": [25, 15, 10, 50],
+                "v04": [50, 30, 20, 200],
+            }
+        )
         print(self.input_df)
-        self.variables_to_drop = ({
-            'v02',
-            'v04'
-        })
+        self.variables_to_drop = {"v02", "v04"}

         # Expected output DataFrame after conversion
-        self.expected_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001','S12000005'],
-            'v01': [50, 30, 20, 100],
-            'v03': [25, 15, 10, 50]
-        })
+        self.expected_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [50, 30, 20, 100],
+                "v03": [25, 15, 10, 50],
+            }
+        )

     def test_drop_variables(self):
-        config = load_config('area_classification/config.yaml')
+        config = load_config("area_classification/config.yaml")
         result_df = drop_variables_pre_clustering(config, self.input_df, self.variables_to_drop)

         # Assert that the result matches the expected output
         pd.testing.assert_frame_equal(result_df, self.expected_df)

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/pre_processing/test_prepare_clustering_data.py b/tests/pre_processing/test_prepare_clustering_data.py
index ba48ce3..158c48d 100644
--- a/tests/pre_processing/test_prepare_clustering_data.py
+++ b/tests/pre_processing/test_prepare_clustering_data.py
@@ -1,24 +1,33 @@
 import unittest
+
 import pandas as pd
-from area_classification.pre_processing.prepare_clustering_data import standardise_data
-from area_classification.pre_processing.prepare_clustering_data import apply_arcsinh_transformation
-from area_classification.pre_processing.prepare_clustering_data import apply_min_max_scaling
+
+from area_classification.pre_processing.prepare_clustering_data import (
+    apply_arcsinh_transformation,
+    apply_min_max_scaling,
+    standardise_data,
+)
+

 class TestPrepareClusteringData(unittest.TestCase):
     def test_standardise_dataframe(self):
         # Define input DataFrame
-        input_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [52.0, 25.0, 50.0, 10.0],  # Percentages of v01 / v01_total
-            'v02': [30.0, 50.0, 60.0, 25.0]  # Percentages of v02 / v02_total
-        })
+        input_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [52.0, 25.0, 50.0, 10.0],  # Percentages of v01 / v01_total
+                "v02": [30.0, 50.0, 60.0, 25.0],  # Percentages of v02 / v02_total
+            }
+        )

         # Define expected output DataFrame
-        expected_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [1.009456, -0.526055, 0.895714, -1.379116],
-            'v02': [-0.786334, 0.611593, 1.310556, -1.135815]
-        })
+        expected_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [1.009456, -0.526055, 0.895714, -1.379116],
+                "v02": [-0.786334, 0.611593, 1.310556, -1.135815],
+            }
+        )

         # Run the function and assert the result
         std_result_df = standardise_data(input_df)
@@ -26,18 +35,22 @@ def test_standardise_dataframe(self):
     def test_apply_arcsinh_transformation(self):
         # Define input DataFrame
-        input_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [1.009456, -0.526055, 0.895714, -1.379116],
-            'v02': [-0.786334, 0.611593, 1.310556, -1.135815]
-        })
+        input_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [1.009456, -0.526055, 0.895714, -1.379116],
+                "v02": [-0.786334, 0.611593, 1.310556, -1.135815],
+            }
+        )

         # Define expected output DataFrame after arcsinh transformation
-        expected_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [0.888044207, -0.504393876, 0.80567778, -1.125783223],
-            'v02': [-0.7219613, 0.57874036, 1.084870788, -0.974225475]
-        })
+        expected_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [0.888044207, -0.504393876, 0.80567778, -1.125783223],
+                "v02": [-0.7219613, 0.57874036, 1.084870788, -0.974225475],
+            }
+        )

         # Run the function and assert the result
         arcsinh_result_df = apply_arcsinh_transformation(input_df)
@@ -45,25 +58,27 @@ def test_apply_arcsinh_transformation(self):
     def test_apply_min_max_scaling(self):
         # Define input DataFrame
-        input_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [0.888044207, -0.504393876, 0.80567778, -1.125783223],
-            'v02': [-0.7219613, 0.57874036, 1.084870788, -0.974225475]
-        })
+        input_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [0.888044207, -0.504393876, 0.80567778, -1.125783223],
+                "v02": [-0.7219613, 0.57874036, 1.084870788, -0.974225475],
+            }
+        )

         # Define expected output DataFrame after min-max scaling
-        expected_df = pd.DataFrame({
-            'LAD_code': ['E06000001', 'W06000001', 'N09000001', 'S12000005'],
-            'v01': [1.0, 0.308561561,0.959099737, 0.0],
-            'v02': [0.122512292, 0.754197718, 1.0, 0.0]
-        })
+        expected_df = pd.DataFrame(
+            {
+                "LAD_code": ["E06000001", "W06000001", "N09000001", "S12000005"],
+                "v01": [1.0, 0.308561561, 0.959099737, 0.0],
+                "v02": [0.122512292, 0.754197718, 1.0, 0.0],
+            }
+        )

         # Run the function and assert the result
         mm_result_df = apply_min_max_scaling(input_df)
         pd.testing.assert_frame_equal(mm_result_df, expected_df)

-if __name__ == '__main__':
-    unittest.main()
-
-
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/pre_processing/test_select_totals_columns.py b/tests/pre_processing/test_select_totals_columns.py
index 4989b92..6fc5f67 100644
--- a/tests/pre_processing/test_select_totals_columns.py
+++ b/tests/pre_processing/test_select_totals_columns.py
@@ -1,83 +1,103 @@
 import unittest
 from unittest.mock import patch
+
 import pandas as pd
+
 from area_classification.pre_processing.select_totals_columns import select_totals_columns
+
 MODULE = "area_classification.pre_processing.select_totals_columns"
+

 class TestTotalColumnsSelectUk(unittest.TestCase):
     def setUp(self):
-        # Scotland data
-        self.aggregate_input_s = pd.DataFrame({
-            'area_code': ['S12000001', 'S12000002', 'S12000003', 'S12000004'],
-            'UV101b0001': [50, 100, 110, 145],
-            'UV101b0002': [20, 10, 77, 56],
-            'UV101b0003': [30, 90, 33, 143],
-            'hours_part': [30, 25, 33, 10],
-            'hours_full': [30, 25, 32, 40],
-            'UV6040001': [60, 50, 65, 50]
-        })
-        self.select_input_s = pd.DataFrame({
-            'area_code': ['S12000001', 'S12000002', 'S12000003', 'S12000004'],
-            'v01': [30, 90, 33, 143],
-            'v45': [30, 25, 33, 10],
-            'v46': [30, 25, 32, 40]
-        })
+        self.aggregate_input_s = pd.DataFrame(
+            {
+                "area_code": ["S12000001", "S12000002", "S12000003", "S12000004"],
+                "UV101b0001": [50, 100, 110, 145],
+                "UV101b0002": [20, 10, 77, 56],
+                "UV101b0003": [30, 90, 33, 143],
+                "hours_part": [30, 25, 33, 10],
+                "hours_full": [30, 25, 32, 40],
+                "UV6040001": [60, 50, 65, 50],
+            }
+        )
+        self.select_input_s = pd.DataFrame(
+            {
+                "area_code": ["S12000001", "S12000002", "S12000003", "S12000004"],
+                "v01": [30, 90, 33, 143],
+                "v45": [30, 25, 33, 10],
+                "v46": [30, 25, 32, 40],
+            }
+        )

         # England/Wales data
-        self.aggregate_input_e = pd.DataFrame({
-            'area_code': ['E12000001', 'E12000002', 'E12000003', 'E12000004'],
-            'ts0010001': [50, 100, 110, 145],
-            'ts0010002': [20, 10, 77, 56],
-            'ts0010003': [30, 90, 33, 143],
-            'ts0590002': [30, 25, 33, 10],
-            'ts0590005': [30, 25, 32, 40],
-            'ts0590001': [60, 50, 65, 50]
-        })
-        self.select_input_e = pd.DataFrame({
-            'area_code': ['E12000001', 'E12000002', 'E12000003', 'E12000004'],
-            'v01': [30, 90, 33, 143],
-            'v45': [30, 25, 33, 10],
-            'v46': [30, 25, 32, 40]
-        })
+        self.aggregate_input_e = pd.DataFrame(
+            {
+                "area_code": ["E12000001", "E12000002", "E12000003", "E12000004"],
+                "ts0010001": [50, 100, 110, 145],
+                "ts0010002": [20, 10, 77, 56],
+                "ts0010003": [30, 90, 33, 143],
+                "ts0590002": [30, 25, 33, 10],
+                "ts0590005": [30, 25, 32, 40],
+                "ts0590001": [60, 50, 65, 50],
+            }
+        )
+        self.select_input_e = pd.DataFrame(
+            {
+                "area_code": ["E12000001", "E12000002", "E12000003", "E12000004"],
+                "v01": [30, 90, 33, 143],
+                "v45": [30, 25, 33, 10],
+                "v46": [30, 25, 32, 40],
+            }
+        )

         # Expected concatenated output
-        self.expected_df = pd.DataFrame({
-            'area_code': [
-                'E12000001', 'E12000002', 'E12000003', 'E12000004',
-                'S12000001', 'S12000002', 'S12000003', 'S12000004'
-            ],
-            'v01': [30, 90, 33, 143, 30, 90, 33, 143],
-            'v01_total': [50, 100, 110, 145, 50, 100, 110, 145],
-            'v45': [30, 25, 33, 10, 30, 25, 33, 10],
-            'v45_total': [60, 50, 65, 50, 60, 50, 65, 50],
-            'v46': [30, 25, 32, 40, 30, 25, 32, 40],
-            'v46_total': [60, 50, 65, 50, 60, 50, 65, 50]
-        })
+        self.expected_df = pd.DataFrame(
+            {
+                "area_code": [
+                    "E12000001",
+                    "E12000002",
+                    "E12000003",
+                    "E12000004",
+                    "S12000001",
+                    "S12000002",
+                    "S12000003",
+                    "S12000004",
+                ],
+                "v01": [30, 90, 33, 143, 30, 90, 33, 143],
+                "v01_total": [50, 100, 110, 145, 50, 100, 110, 145],
+                "v45": [30, 25, 33, 10, 30, 25, 33, 10],
+                "v45_total": [60, 50, 65, 50, 60, 50, 65, 50],
+                "v46": [30, 25, 32, 40, 30, 25, 32, 40],
+                "v46_total": [60, 50, 65, 50, 60, 50, 65, 50],
+            }
+        )

     @patch(f"{MODULE}.os.listdir")
     @patch(f"{MODULE}.pd.read_csv")
-    @patch("pandas.DataFrame.to_csv") 
+    @patch("pandas.DataFrame.to_csv")
     def test_total_columns_select_uk_concat(self, mock_to_csv, mock_read_csv, mock_listdir):
         mock_to_csv.return_value = None

         # Patch os.listdir to return the expected select files
         mock_listdir.return_value = [
             "preprocessing_ew_selected_variables.csv",
-            "preprocessing_scot_selected_variables.csv"
+            "preprocessing_scot_selected_variables.csv",
         ]
-        
+
         # In setUp, add:
-        self.lookup_df = pd.DataFrame({
-            "new_code": ["v01", "v45", "v46", "v01", "v45", "v46"],
-            "table_ID": ["ts001", "ts059", "ts059", "UV101b", "UV604", "UV604"],
-            "country": ["ew", "ew", "ew", "scot", "scot", "scot"]
-        })
-        
+        self.lookup_df = pd.DataFrame(
+            {
+                "new_code": ["v01", "v45", "v46", "v01", "v45", "v46"],
+                "table_ID": ["ts001", "ts059", "ts059", "UV101b", "UV604", "UV604"],
+                "country": ["ew", "ew", "ew", "scot", "scot", "scot"],
+            }
+        )
+
         # set up a mock config:
         config = {
             "qa_directory": "./data/QA/",
-            "select_variables_lookup": "tests/data/total_columns_select_uk_test_data/lookup.csv"
+            "select_variables_lookup": "tests/data/total_columns_select_uk_test_data/lookup.csv",
         }
-
         # Map file paths to DataFrames
         def side_effect(path, *args, **kwargs):
             if "preprocessing_aggregated_all_variables_CA19.csv" in path:
@@ -91,6 +111,7 @@ def side_effect(path, *args, **kwargs):
             if "lookup.csv" in path:
                 return self.lookup_df
             raise ValueError(f"Unexpected file path: {path}")
+
         mock_read_csv.side_effect = side_effect

         result_df = select_totals_columns(config, "tests/data/total_columns_select_uk_test_data")
@@ -99,7 +120,10 @@ def side_effect(path, *args, **kwargs):
             if col.endswith("_total"):
                 self.expected_df[col] = self.expected_df[col].astype(int)

-        pd.testing.assert_frame_equal(result_df.reset_index(drop=True), self.expected_df.reset_index(drop=True))
+        pd.testing.assert_frame_equal(
+            result_df.reset_index(drop=True), self.expected_df.reset_index(drop=True)
+        )
+

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/pre_processing/test_select_variables.py b/tests/pre_processing/test_select_variables.py
index a5740e5..d95cd1e 100644
--- a/tests/pre_processing/test_select_variables.py
+++ b/tests/pre_processing/test_select_variables.py
@@ -1,40 +1,49 @@
-import pandas as pd
 import unittest
+
+import pandas as pd
+
 from area_classification.pre_processing.select_variables import select_variables


 class TestSelectVariables(unittest.TestCase):
     def setUp(self):
         # Sample input DataFrame
-        self.input_df = pd.DataFrame({
-            'LAD_code': ['S12000001', 'S12000002', 'S12000003','S12000004'],
-            'UV1040001': [50, 30, 20, 100],
-            'UV1040002': [100, 60, 40, 100],
-            'UV1040003': [25, 15, 10, 50],
-            'UV1040004': [50, 30, 20, 200]
-        })
+        self.input_df = pd.DataFrame(
+            {
+                "LAD_code": ["S12000001", "S12000002", "S12000003", "S12000004"],
+                "UV1040001": [50, 30, 20, 100],
+                "UV1040002": [100, 60, 40, 100],
+                "UV1040003": [25, 15, 10, 50],
+                "UV1040004": [50, 30, 20, 200],
+            }
+        )

         # Expected output DataFrame after conversion
-        self.expected_df = pd.DataFrame({
-            'LAD_code': ['S12000001', 'S12000002', 'S12000003','S12000004'],
-            'v02': [100, 60, 40, 100],
-            'v03': [25, 15, 10, 50]
-        })
-
-        self.lookup_df = pd.DataFrame({
-            'variable_name': ['Never married', 'Married'],
-            'variable_code': ['UV1040002', 'UV1040003'],
-            'table_id': ['UV104', 'UV104'],
-            'table_name': ['Marital status', 'Marital status'],
-            'country': ['scot', 'scot'],
-            'new_code': ['v02', 'v03'],
-            'domain': ['Demography and Migration', 'Demography and Migration']
-        })
+        self.expected_df = pd.DataFrame(
+            {
+                "LAD_code": ["S12000001", "S12000002", "S12000003", "S12000004"],
+                "v02": [100, 60, 40, 100],
+                "v03": [25, 15, 10, 50],
+            }
+        )
+
+        self.lookup_df = pd.DataFrame(
+            {
+                "variable_name": ["Never married", "Married"],
+                "variable_code": ["UV1040002", "UV1040003"],
+                "table_id": ["UV104", "UV104"],
+                "table_name": ["Marital status", "Marital status"],
+                "country": ["scot", "scot"],
+                "new_code": ["v02", "v03"],
+                "domain": ["Demography and Migration", "Demography and Migration"],
+            }
+        )

     def test_select_variables(self):
         result_df = select_variables(self.input_df, self.lookup_df)

         # Assert that the result matches the expected output
         pd.testing.assert_frame_equal(result_df, self.expected_df)

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/pre_processing/test_standardised_illness_ratio.py b/tests/pre_processing/test_standardised_illness_ratio.py
index 11c2bd7..28967b8 100644
--- a/tests/pre_processing/test_standardised_illness_ratio.py
+++ b/tests/pre_processing/test_standardised_illness_ratio.py
@@ -1,9 +1,12 @@
 import unittest
-import pandas as pd
-from area_classification.pre_processing.standardised_illness_ratio import SIR_calculation
 from pathlib import Path
 from unittest.mock import patch

+import pandas as pd
+
+from area_classification.pre_processing.standardised_illness_ratio import SIR_calculation
+
+
 class TestSIRCalculation(unittest.TestCase):
     def setUp(self):
         patcher_to_csv = patch("pandas.DataFrame.to_csv")
@@ -14,18 +17,30 @@ def setUp(self):
         self.addCleanup(patcher_makedirs.stop)

     def test_SIR_calculation(self):
-        mock_data = pd.DataFrame({
-            "area_code": ['S1', 'S1', 'S2', 'S2', 'S3', 'S3'],
-            'Local_Authority': ['LA1', 'LA1', 'LA2', 'LA2', 'LA3', 'LA3'],
-            'age_group': ['0_14_65_over', '15_64', '0_14_65_over', '15_64', '0_14_65_over', '15_64'],
-            'total_population': [100, 200, 150, 250, 120, 180],
-            'total_disabled': [10, 20, 12, 24, 12, 22]
-        })
-        config = {"qa_directory": ''}
+        mock_data = pd.DataFrame(
+            {
+                "area_code": ["S1", "S1", "S2", "S2", "S3", "S3"],
+                "Local_Authority": ["LA1", "LA1", "LA2", "LA2", "LA3", "LA3"],
+                "age_group": [
+                    "0_14_65_over",
+                    "15_64",
+                    "0_14_65_over",
+                    "15_64",
+                    "0_14_65_over",
+                    "15_64",
+                ],
+                "total_population": [100, 200, 150, 250, 120, 180],
+                "total_disabled": [10, 20, 12, 24, 12, 22],
+            }
+        )
+        config = {"qa_directory": ""}
         df_output = SIR_calculation(mock_data, config)
         output = df_output[["area_code", "SIR"]]
-        expected_output = pd.read_csv(Path("./tests/data/sir_test_expected_output.csv")).rename(columns={"SIR_expected": "SIR"})
+        expected_output = pd.read_csv(Path("./tests/data/sir_test_expected_output.csv")).rename(
+            columns={"SIR_expected": "SIR"}
+        )
         pd.testing.assert_frame_equal(output, expected_output, check_dtype=False)

+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/utilities/test_disability_age_group_conversion.py b/tests/utilities/test_disability_age_group_conversion.py
index fdf9932..1fae2dc 100644
--- a/tests/utilities/test_disability_age_group_conversion.py
+++ b/tests/utilities/test_disability_age_group_conversion.py
@@ -1,63 +1,126 @@
-import pandas as pd
-import unittest
-import unittest
 import os
+import unittest
+
+import pandas as pd
+
+from area_classification.utilities.disability_age_group_conversion import (
+    convert_disability_age_group_england_wales,
+    convert_disability_age_group_northern_ireland,
+    convert_disability_age_group_scotland,
+)

-from area_classification.utilities.disability_age_group_conversion import convert_disability_age_group_scotland
-from area_classification.utilities.disability_age_group_conversion import convert_disability_age_group_england_wales
-from area_classification.utilities.disability_age_group_conversion import convert_disability_age_group_northern_ireland

 class TestConvertDisabilityAgeGroupScotland(unittest.TestCase):
     def setUp(self):
         self.scot_test_data_filepath = "./tests/data/scot_test_disability_data.csv"
-        self.config = {'input_directory': './tests/data/', "LAD_lookup_file_path": "./tests/data/scotland_lad_lookup.csv"}
+        self.config = {
+            "input_directory": "./tests/data/",
+            "LAD_lookup_file_path": "./tests/data/scotland_lad_lookup.csv",
+        }

         columns = [
-            " ", "Disability", "All people", "Day-to-day activities limited a lot",
-            "Day-to-day activities limited a little", "Day-to-day activities not limited"
+            " ",
+            "Disability",
+            "All people",
+            "Day-to-day activities limited a lot",
+            "Day-to-day activities limited a little",
+            "Day-to-day activities not limited",
         ]
-        placeholders = pd.DataFrame({
-            col: (['']*10 + ['Clackmannanshire'] if col == columns[0] else ['']*11)
-            for col in columns
-        })
+        placeholders = pd.DataFrame(
+            {
+                col: ([""] * 10 + ["Clackmannanshire"] if col == columns[0] else [""] * 11)
+                for col in columns
+            }
+        )
         header = pd.DataFrame([columns], columns=columns)

-        age_bands = ( ['0 to 4', '5 to 9', '10 to 14', '15', '16 to 17', '18 to 19', '20 to 24'] +
-                      [f'{i} to {i+4}' for i in range(25, 85, 5)] +
-                      ['85 and over'] )
+        age_bands = (
+            ["0 to 4", "5 to 9", "10 to 14", "15", "16 to 17", "18 to 19", "20 to 24"]
+            + [f"{i} to {i + 4}" for i in range(25, 85, 5)]
+            + ["85 and over"]
+        )

-        population = [1000 + i*10 for i in range(len(age_bands))]
-        little_limited_count = [i*20 for i in range(len(age_bands))]
-        lot_limited_count = [i*15 for i in range(len(age_bands))]
-        non_disabled_count = [population[i] - (little_limited_count[i] + lot_limited_count[i]) for i in range(len(age_bands))]
+        population = [1000 + i * 10 for i in range(len(age_bands))]
+        little_limited_count = [i * 20 for i in range(len(age_bands))]
+        lot_limited_count = [i * 15 for i in range(len(age_bands))]
+        non_disabled_count = [
+            population[i] - (little_limited_count[i] + lot_limited_count[i])
+            for i in range(len(age_bands))
+        ]

         data = {
-            " ": ['Sex'] + ['All people'] + ['']*(len(age_bands)) + ['Dumfries and Galloway'] + [''] + ['Sex'] + ['All people'] + ['']*(len(age_bands)) ,
-            "Disability": ['Age'] + ['Total'] + age_bands + [''] + ['Disability'] + ['Age'] + ['Total'] + age_bands ,
-            "All people": [''] + ['TOTAL'] + population + [''] + ['All people'] + [''] + ['TOTAL'] + population ,
-            "Day-to-day activities limited a lot": [''] + ['TOTAL'] + lot_limited_count + [''] + ['Day-to-day activities limited a lot'] + [''] + ['TOTAL'] + lot_limited_count ,
-            "Day-to-day activities limited a little": [''] + ['TOTAL'] + little_limited_count + [''] + ['Day-to-day activities limited a little'] + [''] + ['TOTAL'] + little_limited_count,
-            "Day-to-day activities not limited": [''] + ['TOTAL'] + non_disabled_count + [''] + ['Day-to-day activities not limited'] + [''] + ['TOTAL'] + non_disabled_count ,
+            " ": ["Sex"]
+            + ["All people"]
+            + [""] * (len(age_bands))
+            + ["Dumfries and Galloway"]
+            + [""]
+            + ["Sex"]
+            + ["All people"]
+            + [""] * (len(age_bands)),
+            "Disability": ["Age"]
+            + ["Total"]
+            + age_bands
+            + [""]
+            + ["Disability"]
+            + ["Age"]
+            + ["Total"]
+            + age_bands,
+            "All people": [""]
+            + ["TOTAL"]
+            + population
+            + [""]
+            + ["All people"]
+            + [""]
+            + ["TOTAL"]
+            + population,
+            "Day-to-day activities limited a lot": [""]
+            + ["TOTAL"]
+            + lot_limited_count
+            + [""]
+            + ["Day-to-day activities limited a lot"]
+            + [""]
+            + ["TOTAL"]
+            + lot_limited_count,
+            "Day-to-day activities limited a little": [""]
+            + ["TOTAL"]
+            + little_limited_count
+            + [""]
+            + ["Day-to-day activities limited a little"]
+            + [""]
+            + ["TOTAL"]
+            + little_limited_count,
+            "Day-to-day activities not limited": [""]
+            + ["TOTAL"]
+            + non_disabled_count
+            + [""]
+            + ["Day-to-day activities not limited"]
+            + [""]
+            + ["TOTAL"]
+            + non_disabled_count,
         }
         data_rows = pd.DataFrame(data)
         df = pd.concat([placeholders, header, data_rows], ignore_index=True)
         os.makedirs(os.path.dirname(self.scot_test_data_filepath), exist_ok=True)
         df.to_csv(self.scot_test_data_filepath, index=False, header=False)

-        LAD_lookup = pd.DataFrame({
-            'LAD22CD': ['S12000005', 'S12000006'],
-            'LAD22NM': ['Clackmannanshire', 'Dumfries and Galloway'],
-            'LAD22NMW': ['', ''],
-            'ObjectId': [1, 2],
-        })
-        os.makedirs(os.path.dirname('./tests/data/scotland_lad_lookup.csv'), exist_ok=True)
-        LAD_lookup.to_csv('./tests/data/scotland_lad_lookup.csv')
-
-        self.expected_df = pd.DataFrame({
-            'area_code': ['S12000005', 'S12000005', 'S12000006', 'S12000006'],
-            'age_group': ['<15 and >=65', '15-64', '<15 and >=65', '15-64'],
-            'total_population': ['8880.0', '13020.0', '8880.0', '13020.0'],
-            'total_disabled': ['3080.0', '3570.0', '3080.0', '3570.0']
-        })
+        LAD_lookup = pd.DataFrame(
+            {
+                "LAD22CD": ["S12000005", "S12000006"],
+                "LAD22NM": ["Clackmannanshire", "Dumfries and Galloway"],
+                "LAD22NMW": ["", ""],
+                "ObjectId": [1, 2],
+            }
+        )
+        os.makedirs(os.path.dirname("./tests/data/scotland_lad_lookup.csv"), exist_ok=True)
+        LAD_lookup.to_csv("./tests/data/scotland_lad_lookup.csv")
+
+        self.expected_df = pd.DataFrame(
+            {
+                "area_code": ["S12000005", "S12000005", "S12000006", "S12000006"],
+                "age_group": ["<15 and >=65", "15-64", "<15 and >=65", "15-64"],
+                "total_population": ["8880.0", "13020.0", "8880.0", "13020.0"],
+                "total_disabled": ["3080.0", "3570.0", "3080.0", "3570.0"],
+            }
+        )
         self.expected_df["total_population"] = self.expected_df["total_population"].astype(float)
         self.expected_df["total_disabled"] = self.expected_df["total_disabled"].astype(float)
@@ -68,51 +131,61 @@ def test_convert_disability_age_group_scotland(self):
         pd.testing.assert_frame_equal(result_df, self.expected_df)

     def tearDown(self):
-        for filename in os.listdir('./tests/data/'):
-            if 'disability' in filename:
-                file_path = os.path.join('./tests/data/', filename)
+        for filename in os.listdir("./tests/data/"):
+            if "disability" in filename:
+                file_path = os.path.join("./tests/data/", filename)
                 if os.path.isfile(file_path):
                     os.remove(file_path)
                     print(f"Test ran then deleted: {file_path}")

+
 class TestConvertDisabilityAgeGroupEnglandWales(unittest.TestCase):
     def setUp(self):
         self.test_data_filepath = "./tests/data/ew_test_disability_data.xlsx"
-        self.config = {'input_directory': './tests/data/'}
+        self.config = {"input_directory": "./tests/data/"}

         columns = [
-            "Year", "Local Authority", "Area Code", "Category",
-            "Disability status", "Age", "Count", "Population", "Sex"
+            "Year",
+            "Local Authority",
+            "Area Code",
+            "Category",
+            "Disability status",
+            "Age",
+            "Count",
+            "Population",
+            "Sex",
         ]
-        placeholders = pd.DataFrame({col: ['']*4 for col in columns})
+        placeholders = pd.DataFrame({col: [""] * 4 for col in columns})
         header = pd.DataFrame([columns], columns=columns)

-        age_bands = ['under 1', '1 to 4'] + [f'{i} to {i+4}' for i in range(5, 90, 5)] + ['90+']
-        population = [1000 + i*10 for i in range(len(age_bands))]
-        disabled_count = [i*20 for i in range(20)]
+        age_bands = ["under 1", "1 to 4"] + [f"{i} to {i + 4}" for i in range(5, 90, 5)] + ["90+"]
+        population = [1000 + i * 10 for i in range(len(age_bands))]
+        disabled_count = [i * 20 for i in range(20)]
         non_disabled_count = [population[i] - disabled_count[i] for i in range(20)]

         data = {
-            "Year": [2021]*40,
-            "Local Authority": ['Adur']*40,
-            "Area Code": ['E07000223']*40,
-            "Category": ['Two category']*40,
-            "Disability status": ['Disabled']*20 + ['Non-disabled']*20,
-            "Age": age_bands*2,
+            "Year": [2021] * 40,
+            "Local Authority": ["Adur"] * 40,
+            "Area Code": ["E07000223"] * 40,
+            "Category": ["Two category"] * 40,
+            "Disability status": ["Disabled"] * 20 + ["Non-disabled"] * 20,
+            "Age": age_bands * 2,
             "Count": disabled_count + non_disabled_count,
-            "Population": population*2,
-            "Sex": ['Persons']*40
+            "Population": population * 2,
+            "Sex": ["Persons"] * 40,
         }
         data_rows = pd.DataFrame(data)
         df = pd.concat([placeholders, header, data_rows], ignore_index=True)
         os.makedirs(os.path.dirname(self.test_data_filepath), exist_ok=True)
-        df.to_excel(self.test_data_filepath, sheet_name='Table 6', index=False, header=False)
+        df.to_excel(self.test_data_filepath, sheet_name="Table 6", index=False, header=False)

-        self.expected_df = pd.DataFrame({
-            'area_code': ['E07000223', 'E07000223'],
-            'local_authority': ['Adur', 'Adur'],
-            'age_group': ['<15 and >=65', '15-64'],
-            'total_disabled': [2100, 1700],
-            'total_population': [11050, 10850],
-        })
+        self.expected_df = pd.DataFrame(
+            {
+                "area_code": ["E07000223", "E07000223"],
+                "local_authority": ["Adur", "Adur"],
+                "age_group": ["<15 and >=65", "15-64"],
+                "total_disabled": [2100, 1700],
+                "total_population": [11050, 10850],
+            }
+        )

     def test_convert_disability_age_group_england_wales(self):
         result_df = convert_disability_age_group_england_wales(self.test_data_filepath, self.config)
@@ -121,26 +194,29 @@ def test_convert_disability_age_group_england_wales(self):
         pd.testing.assert_frame_equal(result_df, self.expected_df)

     def tearDown(self):
-        for filename in os.listdir('./tests/data/'):
-            if 'disability' in filename:
-                file_path = os.path.join('./tests/data/', filename)
+        for filename in os.listdir("./tests/data/"):
+            if "disability" in filename:
+                file_path = os.path.join("./tests/data/", filename)
                 if os.path.isfile(file_path):
                     os.remove(file_path)
                     print(f"Test ran then deleted: {file_path}")

+
 class TestConvertDisabilityAgeGroupNorthernIreland(unittest.TestCase):
     def setUp(self):
         self.test_data_filepath = "./tests/data/ni_test_disability_data.xlsx"
-        self.config = {'input_directory': './tests/data/'}
+        self.config = {"input_directory": "./tests/data/"}

         columns = [
-            "Geography", "Geography code", "All usual residents",
-            "All usual residents:Day-to-day activities limited a lot",
-            "All usual residents:Day-to-day activities limited a little",
-            "All usual residents:Day-to-day activities not limited",
-            "Usual residents aged 0-14 years",
-            "Usual residents aged 0-14 years:Day-to-day activities limited a lot",
-            "Usual residents aged 0-14 years: Day-to-day activities limited a little",
-            "Usual residents aged 0-14 years: Day-to-day activities not limited",
+            "Geography",
+            "Geography code",
+            "All usual residents",
+            "All usual residents:Day-to-day activities limited a lot",
+            "All usual residents:Day-to-day activities limited a little",
+            "All usual residents:Day-to-day activities not limited",
+            "Usual residents aged 0-14 years",
+            "Usual residents aged 0-14 years:Day-to-day activities limited a lot",
+            "Usual residents aged 0-14 years: Day-to-day activities limited a little",
+            "Usual residents aged 0-14 years: Day-to-day activities not limited",
             "Usual residents aged 15-39 years",
             "Usual residents aged 15-39 years: Day-to-day activities limited a lot",
             "Usual residents aged 15-39 years: Day-to-day activities limited a little",
@@ -152,62 +228,74 @@ def setUp(self):
             "Usual residents aged 65+ years",
             "Usual residents aged 65+ years: Day-to-day activities limited a lot",
             "Usual residents aged 65+ years: Day-to-day activities limited a little",
-            "Usual residents aged 65+ years: Day-to-day activities not limited"
+            "Usual residents aged 65+ years: Day-to-day activities not limited",
         ]
-        placeholders_before = pd.DataFrame({col: ['']*8 for col in columns})
+        placeholders_before = pd.DataFrame({col: [""] * 8 for col in columns})
         header = pd.DataFrame([columns], columns=columns)

         data = {
-             "Geography": ["Antrim and Newtownabbey", "Belfast"],
-             "Geography code": ["N09000001", "N09000003"],
-             "All usual residents": [1000, 1200],
-             "All usual residents:Day-to-day activities limited a lot": [100, 120],
-             "All usual residents:Day-to-day activities limited a little": [150, 180],
-             "All usual residents:Day-to-day activities not limited": [750, 900],
-             "Usual residents aged 0-14 years": [200, 250],
-             "Usual residents aged 0-14 years:Day-to-day activities limited a lot": [10, 12],
-             "Usual residents aged 0-14 years: Day-to-day activities limited a little": [20, 25],
-             "Usual residents aged 0-14 years: Day-to-day activities not limited": [170, 213],
-             "Usual residents aged 15-39 years": [300, 350],
-             "Usual residents aged 15-39 years: Day-to-day activities limited a lot": [20, 25],
-             "Usual residents aged 15-39 years: Day-to-day activities limited a little": [30, 35],
-             "Usual residents aged 15-39 years: Day-to-day activities not limited": [250, 290],
-             "Usual residents aged 40-64 years": [300, 350],
-             "Usual residents aged 40-64 years: Day-to-day activities limited a lot": [40, 45],
-             "Usual residents aged 40-64 years: Day-to-day activities limited a little": [50, 60],
-             "Usual residents aged 40-64 years: Day-to-day activities not limited": [210, 245],
-             "Usual residents aged 65+ years": [200, 250],
-             "Usual residents aged 65+ years: Day-to-day activities limited a lot": [30, 38],
-             "Usual residents aged 65+ years: Day-to-day activities limited a little": [50, 60],
-             "Usual residents aged 65+ years: Day-to-day activities not limited": [120, 152]
-         }
+            "Geography": ["Antrim and Newtownabbey", "Belfast"],
+            "Geography code": ["N09000001", "N09000003"],
+            "All usual residents": [1000, 1200],
+            "All usual residents:Day-to-day activities limited a lot": [100, 120],
+            "All usual residents:Day-to-day activities limited a little": [150, 180],
+            "All usual residents:Day-to-day activities not limited": [750, 900],
+            "Usual residents aged 0-14 years": [200, 250],
+            "Usual residents aged 0-14 years:Day-to-day activities limited a lot": [10, 12],
+            "Usual residents aged 0-14 years: Day-to-day activities limited a little": [20, 25],
+            "Usual residents aged 0-14 years: Day-to-day activities not limited": [170, 213],
+            "Usual residents aged 15-39 years": [300, 350],
+            "Usual residents aged 15-39 years: Day-to-day activities limited a lot": [20, 25],
+            "Usual residents aged 15-39 years: Day-to-day activities limited a little": [30, 35],
+            "Usual residents aged 15-39 years: Day-to-day activities not limited": [250, 290],
+            "Usual residents aged 40-64 years": [300, 350],
+            "Usual residents aged 40-64 years: Day-to-day activities limited a lot": [40, 45],
+            "Usual residents aged 40-64 years: Day-to-day activities limited a little": [50, 60],
+            "Usual residents aged 40-64 years: Day-to-day activities not limited": [210, 245],
+            "Usual residents aged 65+ years": [200, 250],
+            "Usual residents aged 65+ years: Day-to-day activities limited a lot": [30, 38],
+            "Usual residents aged 65+ years: Day-to-day activities limited a little": [50, 60],
+            "Usual residents aged 65+ years: Day-to-day activities not limited": [120, 152],
+        }
         data_rows = pd.DataFrame(data)
-        placeholders_after = pd.DataFrame({col: ['empty']*14 for col in columns})
-        df = pd.concat([placeholders_before, header, data_rows, placeholders_after], ignore_index=True)
+        placeholders_after = pd.DataFrame({col: ["empty"] * 14 for col in columns})
+        df = pd.concat(
+            [placeholders_before, header, data_rows, placeholders_after], ignore_index=True
+        )
         os.makedirs(os.path.dirname(self.test_data_filepath), exist_ok=True)
-        df.to_excel(self.test_data_filepath, sheet_name='LGD', index=False, header=False)
+        df.to_excel(self.test_data_filepath, sheet_name="LGD", index=False, header=False)

-        self.expected_df = pd.DataFrame({
-            'area_code': ['N09000001', 'N09000001', 'N09000003', 'N09000003'],
-            'local_authority': ['Antrim and Newtownabbey', 'Antrim and Newtownabbey', 'Belfast', 'Belfast'],
-            'age_group': ['<15 and >=65', '15-64','<15 and >=65', '15-64' ],
-            'total_disabled': [110, 140, 135, 165],
-            'total_population': [400, 600, 500, 700],
-        })
+        self.expected_df = pd.DataFrame(
+            {
+                "area_code": ["N09000001", "N09000001", "N09000003", "N09000003"],
+                "local_authority": [
+                    "Antrim and Newtownabbey",
+                    "Antrim and Newtownabbey",
+                    "Belfast",
+                    "Belfast",
+                ],
+                "age_group": ["<15 and >=65", "15-64", "<15 and >=65", "15-64"],
+                "total_disabled": [110, 140, 135, 165],
+                "total_population": [400, 600, 500, 700],
+            }
+        )

     def test_convert_disability_age_group_northern_ireland(self):
-        result_df = convert_disability_age_group_northern_ireland(self.test_data_filepath, self.config)
+        result_df = convert_disability_age_group_northern_ireland(
+            self.test_data_filepath, self.config
+        )
         self.assertIsInstance(result_df, pd.DataFrame)
         self.assertFalse(result_df.empty)
         pd.testing.assert_frame_equal(result_df, self.expected_df)

     def tearDown(self):
-        for filename in os.listdir('./tests/data/'):
-            if 'disability' in filename:
-                file_path = os.path.join('./tests/data/', filename)
+        for filename in os.listdir("./tests/data/"):
+            if "disability" in filename:
+                file_path = os.path.join("./tests/data/", filename)
                 if os.path.isfile(file_path):
                     os.remove(file_path)
                     print(f"Test ran then deleted: {file_path}")

-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/utilities/test_load_config.py b/tests/utilities/test_load_config.py
index a13277b..38f60c8 100644
--- a/tests/utilities/test_load_config.py
+++ b/tests/utilities/test_load_config.py
@@ -1,8 +1,9 @@
-import unittest
 import getpass
-import os
+import unittest
+
 from area_classification.utilities.load_config import load_config

+
 class TestLoadConfig(unittest.TestCase):
     def test_load_config_with_placeholder(self):
         # Arrange
@@ -18,5 +19,6 @@ def test_load_config_with_placeholder(self):
         self.assertIn("filepath", result)
         self.assertEqual(result["filepath"], expected_filepath)

+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/utilities/test_loading_data.py b/tests/utilities/test_loading_data.py
index aa36cc8..67623cc 100644
--- a/tests/utilities/test_loading_data.py
+++ b/tests/utilities/test_loading_data.py
@@ -1,54 +1,56 @@
 import os
 import tempfile
-import pandas as pd
 import unittest
+
+import pandas as pd
+
 from area_classification.utilities.loading_data import load_data, load_format_data

+
 def create_dummy_csv(directory, filename, data):
     path = os.path.join(directory, filename)
     pd.DataFrame(data).to_csv(path, index=False)
     return path

-class TestLoadFormatData(unittest.TestCase):

+class TestLoadFormatData(unittest.TestCase):
     def test_load_data_handles_missing_values(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             # Create a CSV file with missing values
-            data = {'A': [1, None, 3], 'B': [4, 5, None]}
-            csv_path = os.path.join(tmpdir, 'test.csv')
+            data = {"A": [1, None, 3], "B": [4, 5, None]}
+            csv_path = os.path.join(tmpdir, "test.csv")
             pd.DataFrame(data).to_csv(csv_path)

             # Call load_data
             df = load_data(csv_path)

             # Check that missing values are replaced with 0
             self.assertTrue((df.isnull().sum().sum() == 0))
-            self.assertEqual(df.loc[1, 'A'], 0)
-            self.assertEqual(df.loc[2, 'B'], 0)
+            self.assertEqual(df.loc[1, "A"], 0)
+            self.assertEqual(df.loc[2, "B"], 0)

     def test_load_format_data_merges_multiple_files_correctly(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             # Create two dummy CSV files with a common join column
-            data1 = {'geo_code': [1, 2], 'A': [10, 20]}
-            data2 = {'geo_code': [1, 2], 'B': [100, 200]}
-            create_dummy_csv(tmpdir, 'ts001.csv', data1)
-            create_dummy_csv(tmpdir, 'ts002.csv', data2)
+            data1 = {"geo_code": [1, 2], "A": [10, 20]}
+            data2 = {"geo_code": [1, 2], "B": [100, 200]}
+            create_dummy_csv(tmpdir, "ts001.csv", data1)
+            create_dummy_csv(tmpdir, "ts002.csv", data2)

             # Prepare config with input_directory
-            config = {'input_directory': tmpdir}
+            config = {"input_directory": tmpdir}

             # Run function
-            merged = load_format_data(tmpdir, 'ts*.csv', 'geo_code', config)
+            merged = load_format_data(tmpdir, "ts*.csv", "geo_code", config)

             # Check shape and columns
             self.assertEqual(merged.shape, (2, 3))
-            self.assertEqual(set(merged.columns), {'geo_code', 'A', 'B'})
-            self.assertEqual(merged.loc[0, 'A'], 10)
-            self.assertEqual(merged.loc[0, 'B'], 100)
+            self.assertEqual(set(merged.columns), {"geo_code", "A", "B"})
+            self.assertEqual(merged.loc[0, "A"], 10)
+            self.assertEqual(merged.loc[0, "B"], 100)

     def test_load_format_data_raises_if_no_files_found(self):
         with tempfile.TemporaryDirectory() as tmpdir:
-            config = {'input_directory': tmpdir}
+            config = {"input_directory": tmpdir}
             with self.assertRaises(FileNotFoundError) as cm:
-                load_format_data(tmpdir, 'ts*.csv', 'geo_code', config)
+                load_format_data(tmpdir, "ts*.csv", "geo_code", config)
             self.assertIn("No files matching", str(cm.exception))

-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
-