| """ |
| Process demographics data |
| -------- |
| Process DOB, sex, marital status and SIMD data |
| """ |
| import json |
| from utils.common import read_data, correct_column_names |
|
|
|
|
def initialize_demo_data(demo_file):
    """
    Read the demographics extract and remove duplicate rows
    --------
    :param demo_file: demographics data file name
    :return: dataframe returned by read_data for the expected demographics
        columns/types, with fully duplicated rows dropped
    """
    print('Loading demographic data')

    # Identifier and personal attribute columns come first
    base_cols = ['SafeHavenID', 'OBF_DOB', 'SEX', 'MARITAL_STATUS']
    base_types = ['int', 'object', 'str', 'str']

    # One quintile/decile/vigintile triple per SIMD release, in release order
    simd_cols = [
        'SIMD_{}_{}'.format(release, level)
        for release in (2009, 2012, 2016)
        for level in ('QUINTILE', 'DECILE', 'VIGINTILE')
    ]
    simd_types = ['float'] * len(simd_cols)

    # read_data is a project helper; presumably it applies these names and
    # dtypes while loading -- confirm in utils.common
    loaded = read_data(demo_file, base_cols + simd_cols,
                       base_types + simd_types)

    # Keep a single copy of any row that appears more than once
    return loaded.drop_duplicates()
|
|
|
|
def process_sex(df):
    """
    Add a binary indicator column derived from SEX
    --------
    :param df: dataframe to update (modified in place); must have a SEX column
    :return: the same dataframe with an integer 'sex_bin' column that is 1
        where SEX equals 'F' and 0 for every other value
    """
    print('One-hot encoding sex')

    # Boolean mask of rows recorded as female; anything else maps to False
    female_mask = df.SEX.eq('F')
    df['sex_bin'] = female_mask.astype(int)

    return df
|
|
|
|
def main():
    """
    Run the demographics processing pipeline
    --------
    Reads paths from the JSON config, loads the demographics extract,
    encodes sex as a binary column, renames the non-ID columns via
    correct_column_names and pickles the result to the model data path
    """
    # Paths to the raw extracts and processed outputs live in the config
    with open('../../../config.json') as cfg_handle:
        settings = json.load(cfg_handle)

    # Load the raw demographics extract and add the binary sex column
    source_path = settings['extract_data_path'] + 'Demographics_Cohort3R.csv'
    demo = process_sex(initialize_demo_data(source_path))

    # The raw SEX column is no longer needed once sex_bin exists
    demo = demo.drop('SEX', axis='columns')

    # Rename every column except the leading SafeHavenID identifier
    renamed = correct_column_names(demo.columns[1:], 'demo')
    demo.columns = ['SafeHavenID'] + renamed

    # Persist the processed dataframe for downstream steps
    output_path = settings['model_data_path'] + 'demo_proc.pkl'
    demo.to_pickle(output_path)
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not read the config or write the pickle
if __name__ == '__main__':
    main()
|
|