Source code for get_pop.modules.parse.parse

import pandas as pd
from typing import Dict, Callable
from get_pop.definitions import PATH_USA_POP, parsed_data_type
import logging
import pathlib
from get_pop.definitions import selected_fields_type, selected_values_type


def parse_states(
    value_field: str,
    selected_values: selected_values_type,
    selected_fields: selected_fields_type,
    *,
    field_cleaners: Dict[str, Callable[[pd.DataFrame, str], pd.DataFrame]] = None,
) -> parsed_data_type:
    """
    Parse the U.S. county-level census CSV into one DataFrame per selected state.

    The CSV at ``PATH_USA_POP`` is read, rows with ``COUNTY == 0`` (statewide
    totals) are dropped, and the remaining rows are filtered to the selected
    values, optionally cleaned, grouped by ``value_field`` and reshaped. This
    function does not write any files itself; it returns the parsed frames.

    Args:
        value_field (str): Column used both to filter rows and to group them.
        selected_values (selected_values_type): A list of dictionaries, one per
            state selected for extraction. Each dict is read for ``"name"``
            (matched against ``value_field``) and ``"abbrv"`` (used as the
            name of the output entry), and may carry ``"special_processors"``,
            an iterable of callables applied in order to that state's frame.
            If several entries share a ``"name"``, the first one is used.
        selected_fields (selected_fields_type): A list of dictionaries with
            ``"input_name"`` (column to keep from the census data, or the
            generated ``"FIPS"`` column) and ``"output_name"`` (column name in
            the result).
        field_cleaners (Dict[str, Callable[[pd.DataFrame, str], pd.DataFrame]]):
            (Optional) mapping of field name to a function that receives the
            filtered DataFrame and that field name and returns the cleaned
            DataFrame. ``None`` or an empty mapping skips cleaning.

    Returns:
        parsed_data_type - A list of dictionaries, one per group, of the form
        ``{"name": <abbrv>, "data": <DataFrame>}``.

    Raises:
        IndexError: If a group's ``value_field`` value has no matching entry
            in ``selected_values`` (possible when a cleaner rewrites
            ``value_field`` after filtering).
    """
    # read
    df = pd.read_csv(PATH_USA_POP, encoding="ISO-8859-1")

    # filter - remove statewide population counts
    df = df[df["COUNTY"] != 0]

    # filter - include only selected values
    selected_values_names = [x["name"] for x in selected_values]
    df = df[df[value_field].isin(selected_values_names)]

    # option - clean value field
    if field_cleaners:
        for field, cleaner_func in field_cleaners.items():
            df = cleaner_func(df, field)

    # rename field lookup table and the list of columns to keep; both are
    # loop-invariant, so build them once
    rename_schema = {
        field["input_name"]: field["output_name"] for field in selected_fields
    }
    selected_fields_input = [field["input_name"] for field in selected_fields]

    # name -> state dict lookup; setdefault keeps the FIRST entry per name,
    # matching a first-match linear search over selected_values
    states_by_name = {}
    for state in selected_values:
        states_by_name.setdefault(state["name"], state)

    payload = []
    for name, group in df.groupby(value_field):
        logging.info("Processing: %s", name)

        # get selected state dict for processing instructions
        try:
            selected_state = states_by_name[name]
        except KeyError:
            # IndexError is kept as the exception type for backward
            # compatibility with the previous ``[...][0]`` lookup.
            raise IndexError(
                f"No entry in selected_values with name {name!r}"
            ) from None

        # Work on an independent copy so the column assignments below never
        # target a slice of ``df``; this avoids having to touch the global
        # "mode.chained_assignment" pandas option.
        group = group.copy()

        # generate FIPS code: 2-digit state code + 3-digit county code
        group["STATE"] = group["STATE"].astype(str).str.zfill(2)
        group["COUNTY"] = group["COUNTY"].astype(str).str.zfill(3)
        group["FIPS"] = group["STATE"] + group["COUNTY"]

        # truncate cols in df
        group = group[selected_fields_input]

        # rename
        group = group.rename(columns=rename_schema)

        # option - special processor (special funcs for doing extra stuff to df)
        special_processors = selected_state.get("special_processors")
        if special_processors:
            for processor in special_processors:
                group = processor(group)

        # collect the result under the state's abbreviation
        abbrv = selected_state["abbrv"]
        payload.append({"name": abbrv, "data": group})

    return payload