| """ |
| Process SMR01 comorbidities data |
| -------- |
| Clean and process comorbidities, tracking specific comorbidities and returning |
| the total number of comorbidities per patient per year |
| """ |
| import json |
| import pandas as pd |
| from datetime import date |
| from dateutil.relativedelta import relativedelta |
| from utils.common import track_event |
| from utils.adm_common import initialize_adm_data, correct_stays |
| from utils.comorb_processing import diagnosis_mapping_lists |
|
|
|
|
def track_comorbidity(df, excel_file, sheet_name, diag_names):
    """
    Flag each comorbidity found in the admission diagnosis descriptions.
    One 0/1 column per comorbidity is written onto df.
    --------
    :param df: pandas dataframe containing the DIAG1Desc..DIAG6Desc columns
    :param excel_file: str filename for diagnosis mapping
    :param sheet_name: str sheet name for diagnosis mapping
    :param diag_names: list of diagnoses
    :return: the same dataframe, updated with one column per mapped diagnosis
    """
    print('Tracking comorbidities')

    # Diagnosis name -> terms to look for, as built by the mapping helper
    mapping = diagnosis_mapping_lists(excel_file, sheet_name, diag_names)

    # Only the six diagnosis description columns are searched
    desc_frame = df[['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
                     'DIAG5Desc', 'DIAG6Desc']]

    for diag_name, terms in mapping.items():
        # track_event is applied to each description column in turn
        found = desc_frame.apply(lambda col: track_event(col, terms, False))
        # A row is flagged when any of its description columns matched
        df[diag_name] = found.any(axis=1).astype(int)

    return df
|
|
|
|
def fill_comorbidities(df, diag_names):
    """
    Fill comorbidities
    --------
    Every 0 in the diagnosis columns is replaced by the closest preceding
    non-zero value of the same column, following the row order of df. Zeros
    with no earlier non-zero value stay 0. The frame is updated in place and
    also returned.

    :param df: dataframe of groupby values
    :param diag_names: list of diagnoses
    :return: updated dataframe
    """
    flags = df[diag_names]
    is_zero = flags == 0

    # replace(..., method='ffill') is deprecated in recent pandas releases,
    # so blank out the zeros and forward fill them explicitly instead
    carried = flags.mask(is_zero).ffill()

    # Leading zeros have nothing to carry forward and must keep their 0
    fillable = is_zero & carried.notna()
    filled = flags.mask(fillable, carried)

    # Masking upcasts integer columns to float; restore the input dtypes
    df[diag_names] = filled.astype(flags.dtypes.to_dict())

    return df
|
|
|
|
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    Each row gets the end-of-year cut-off (month and day taken from eoy_date)
    in the calendar year of its dt_col value, or the following year's cut-off
    when the dt_col value is strictly later than that cut-off.

    :param df: dataframe
    :param dt_col: name of the datetime column in dataframe
    :param eoy_date: EOY date from config; only its month and day are used
    :return: new dataframe with a reset index and a datetime 'eoy' column
    """
    df = df.reset_index(drop=True)

    end_date = pd.to_datetime(eoy_date)
    dates = df[dt_col]
    years = dates.dt.year

    def cutoff_in(year_values):
        # Assemble real timestamps (not datetime.date objects) so the result
        # can be compared directly with the datetime column
        return pd.to_datetime({'year': year_values,
                               'month': end_date.month,
                               'day': end_date.day})

    # Rows dated after this year's cut-off belong to next year's cut-off
    past_cutoff = dates > cutoff_in(years)
    df['eoy'] = cutoff_in(years + past_cutoff.astype(int))

    return df
|
|
|
|
def add_yearly_stats(df):
    """
    Count comorbidities per patient per year
    --------
    Drops the admission detail columns, takes the last entry of every
    remaining column within each (SafeHavenID, eoy) group and adds those
    entries up across the columns.

    :param df: dataframe to summarise
    :return: dataframe indexed by SafeHavenID and eoy with a single
        'comorb_per_year' column
    """
    print('Adding comorbidity count per year')

    # Columns that describe the admission itself and must not be summed
    admission_cols = ['ADMDATE', 'DISDATE', 'STAY', 'ETHGRP']
    description_cols = ['DIAG1Desc', 'DIAG2Desc', 'DIAG3Desc', 'DIAG4Desc',
                        'DIAG5Desc', 'DIAG6Desc']
    trimmed = df.drop(columns=admission_cols + description_cols)

    # Last entry per patient and year, then the row-wise total
    last_per_year = trimmed.groupby(['SafeHavenID', 'eoy']).last()
    totals = last_per_year.sum(axis=1)

    return totals.to_frame(name='comorb_per_year')
|
|
|
|
def main():
    """
    Build the yearly comorbidity counts from the SMR01 admissions extract
    --------
    Reads config.json, loads the admissions, flags every comorbidity,
    carries the flags forward per patient, assigns each admission an
    end-of-year date and pickles the per-patient yearly totals.
    """
    def strip_text(column):
        # Only object (string) columns carry whitespace to trim
        if column.dtype == 'object':
            return column.str.strip()
        return column

    # Load user configuration
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load the admissions extract and correct the stays
    admissions = initialize_adm_data(
        config['extract_data_path'] + 'SMR01_Cohort3R.csv')
    admissions = correct_stays(admissions)

    # Remove surrounding whitespace from all text columns
    admissions = admissions.apply(strip_text)

    # Diagnosis mapping workbook and the comorbidities to track
    excel_file = ("mappings/Comorbidity feature review for models & clin "
                  "summary update v2 May 2021.xlsx")
    sheet_name = 'Diagnosis category mapping3'
    diag_names = ['Ischaemic_hd', 'Atrial_fib', 'pacemake', 'periph_vasc',
                  'cog_imp', 'HF1', 'LV_sys', 'valv_hd', 'HF_pres_ejec',
                  'hypertension', 'Cerebrovascula_dis', 'Diabetes_mel',
                  'Osteoporosis', 'frailty', 'liver_dis', 'metastat_canc',
                  'headneck_canc', 'breast_canc', 'gi_canc', 'other_canc',
                  'kidney_dis', 'Asthma_ov', 'Pulmonary_fib',
                  'Obstructive_apnoea', 'Pulmonary_hyp', 'Previous_pneum',
                  'DVT_PTE', 'Lung_cancer', 'Bronchiectasis', 'Resp_fail']
    flagged = track_comorbidity(admissions, excel_file, sheet_name,
                                diag_names)

    # Order by admission date so the forward fill follows time
    flagged = flagged.sort_values('ADMDATE').reset_index(drop=True)

    # Carry comorbidities forward within each patient
    print('Filling comorbidities')
    per_patient = flagged.groupby('SafeHavenID')
    filled = per_patient.apply(fill_comorbidities, diag_names)

    # Attach the end-of-year date and total the comorbidities per year
    with_eoy = add_eoy_column(filled, 'ADMDATE', config['date'])
    yearly = add_yearly_stats(with_eoy)

    # Save the result
    yearly.to_pickle(config['model_data_path'] + 'comorb_proc.pkl')
|
|
|
|
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, so that importing
    # this module does not read the config or write the pickle
    main()
|
|