| """ |
| Process SMR01 admission data |
| -------- |
| Clean and process admission data while adding tracking for COPD and respiratory |
| admissions per year for each SafeHavenID |
| """ |
| import json |
| import pandas as pd |
| from datetime import date |
| from dateutil.relativedelta import relativedelta |
| from utils.common import add_hist_adm_presc, first_patient_appearance |
| from utils.adm_common import (initialize_adm_data, correct_stays, |
| track_copd_resp) |
| from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity, |
| search_diag) |
| from utils.adm_reduction import fill_missing_years, calc_adm_per_year |
|
|
|
|
def process_ethnicity(df):
    """
    Find relevant ethnic group for each patient, accounting for null data
    --------
    :param df: admission dataframe with 'SafeHavenID' and 'ETHGRP' columns
    :return: admission dataframe with ethnicity cleaned and updated
        ('ETHGRP' is renamed to 'eth_grp')
    """
    print('Processing ethnicity')

    # Standardise the column name and strip surrounding whitespace.
    # rename() returns a new frame, so the caller's dataframe is untouched.
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df.eth_grp.str.strip()

    # Fill gaps within each patient from their other records (forward, then
    # backward); patients with no recorded value at all become 'Unknown'.
    # transform() keeps the result aligned to df's own index, whereas
    # groupby.apply() on pandas >= 2.0 prepends the group key to the index
    # and the assignment back into df then fails.
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda grp: grp.ffill().bfill().fillna('Unknown'))

    # Convert every description through the project helper
    df['eth_grp'] = df['eth_grp'].map(convert_ethgrp_desc)

    # Resolve one ethnicity per patient; presumably the most frequent value
    # (see utils.adm_processing.mode_ethnicity) - TODO confirm
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df
|
|
|
|
| def add_eoy_column(df, dt_col, eoy_date): |
| """ |
| Add EOY relative to user-specified end date |
| -------- |
| :param df: dataframe |
| :param dt_col: date column in dataframe |
| :param eoy_date: EOY date from config |
| :return: updated df with EOY column added |
| """ |
| |
| df = df.reset_index(drop=True) |
|
|
| |
| end_date = pd.to_datetime(eoy_date) |
| end_month = end_date.month |
| end_day = end_date.day |
|
|
| |
| df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year] |
|
|
| |
| eoy_index = df.columns[df.columns == 'eoy'] |
| adm_vs_eoy = df[dt_col] > df.eoy |
| row_index = df.index[adm_vs_eoy] |
| df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1) |
| df['eoy'] = pd.to_datetime(df.eoy) |
|
|
| return df |
|
|
|
|
def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be reduced
    :return: dataframe with feature values per year
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Mark every row as a single admission. assign() returns a new frame,
    # so the dataframe passed in by the caller is not modified.
    df = df.assign(adm=1)

    # Per-patient helper from utils.adm_reduction; the index is reset
    # afterwards to discard whatever index the groupby-apply produced
    df = df.groupby('SafeHavenID').apply(fill_missing_years)
    df = df.reset_index(drop=True)

    # Per-patient history helper, driven by the 'adm' flag and 'ADMDATE'
    df = df.groupby('SafeHavenID').apply(add_hist_adm_presc, 'adm', 'ADMDATE')
    df = df.reset_index(drop=True)

    # Yearly reduction helper from utils.adm_reduction
    df = calc_adm_per_year(df)

    # Keep only the columns used downstream
    final_cols = ['eth_grp', 'adm_per_year', 'total_hosp_days',
                  'mean_los', 'copd_per_year', 'resp_per_year',
                  'anxiety_depression_per_year', 'days_since_copd',
                  'days_since_resp', 'days_since_adm', 'adm_to_date',
                  'copd_to_date', 'resp_to_date', 'anxiety_depression_to_date',
                  'copd_date', 'resp_date', 'adm_date']

    return df[final_cols]
|
|
|
|
def main():
    """
    Run the SMR01 admission processing pipeline
    --------
    Reads paths and the EOY date from config.json, cleans the admission
    extract, and writes two pickles to the model data path:
    'validation_adm_proc.pkl' (reduced admission rows) and 'adm_proc.pkl'
    (yearly features)
    """
    # Load the pipeline configuration (path is relative to the working dir)
    with open('../../../config.json') as config_handle:
        settings = json.load(config_handle)

    # Read the raw admission extract
    source_csv = settings['extract_data_path'] + 'SMR01_Cohort3R.csv'
    admissions = initialize_adm_data(source_csv)

    # Stay correction helper from utils.adm_common
    admissions = correct_stays(admissions)

    # Helper call keyed on 'ADMDATE' with the 'adm' label; presumably stores
    # each patient's first appearance under the output path - TODO confirm
    out_dir = settings['model_data_path']
    first_patient_appearance(admissions, 'ADMDATE', 'adm', out_dir)

    # Clean ethnicity, then flag COPD/respiratory and anxiety/depression
    admissions = process_ethnicity(admissions)
    admissions = track_copd_resp(admissions)
    admissions = search_diag(admissions, 'anxiety_depression')

    # Keep only the columns needed for the yearly reduction
    keep_cols = ['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
                 'resp_event', 'anxiety_depression_event']
    slim = admissions[keep_cols]

    # Save the row-level data before it is reduced
    slim.to_pickle(out_dir + 'validation_adm_proc.pkl')

    # Attach the EOY date to every admission, then reduce to yearly rows
    slim = add_eoy_column(slim, 'ADMDATE', settings['date'])
    yearly = extract_yearly_data(slim)

    # Save the yearly features
    yearly.to_pickle(out_dir + 'adm_proc.pkl')
|
|
|
|
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse its functions) does not trigger file I/O
if __name__ == '__main__':
    main()
|
|