| """ |
| Utilities required across all processing scripts |
| """ |
| import pandas as pd |
| import numpy as np |
|
|
|
|
def read_data(file, cols, types):
    """
    Load selected columns of a cp1252-encoded CSV with explicit dtypes
    --------
    :param file: string filename
    :param cols: string list of column names to load
    :param types: string list of column types, paired positionally with cols
    :return: dataframe restricted to cols
    """
    # Pair each requested column with its dtype, position by position
    column_types = {name: dtype for name, dtype in zip(cols, types)}
    return pd.read_csv(
        file,
        usecols=cols,
        dtype=column_types,
        encoding="cp1252",
    )
|
|
|
|
def first_patient_appearance(df, dt_col, typ, data_path):
    """
    Pickle the first dt_col value recorded for each SafeHavenID
    --------
    :param df: dataframe to check, must contain 'SafeHavenID' and dt_col
    :param dt_col: date column to sort by
    :param typ: type of dataset being used, used as the output file prefix
    :param data_path: path to data extracts, concatenated directly with the
        file name
    :return: None, dataframe with first dates saved to
        data_path + typ + '_first_dates.pkl'
    """
    out_file = data_path + typ + '_first_dates.pkl'

    # After sorting by date, the first value per patient is the earliest one
    ordered = df.sort_values(dt_col)
    earliest = ordered.groupby('SafeHavenID')[dt_col].first()
    earliest = earliest.reset_index(name='first_adm')

    earliest.to_pickle(out_file)
|
|
|
|
def add_days_since_event(df, typ, dt_col):
    """
    Historical features: add days since features e.g. adm/copd/resp/rescue
    --------
    :param df: dataframe to be updated in place; must contain dt_col, the
        event flag column for typ and an 'eoy' date column
    :param typ: 'rescue', 'adm', 'copd' or 'resp' feature to be created
    :param dt_col: str date column name
    :return: updated dataframe with '<typ>_date' and 'days_since_<typ>'
        columns added
    """
    # 'rescue' and 'adm' have dedicated flag columns; every other type
    # reads its flag from '<typ>_event'
    dedicated_event_cols = {'rescue': 'rescue_meds', 'adm': 'adm'}
    event_col = dedicated_event_cols.get(typ, typ + '_event')
    date_col = typ + '_date'
    days_col = 'days_since_' + typ

    # Keep the row date where the event flag is truthy and carry it forward.
    # Vectorised on purpose: a row-wise apply returns a DataFrame (not a
    # Series) on an empty frame and breaks the column assignment.
    is_event = df[event_col].astype(bool)
    df[date_col] = df[dt_col].where(is_event).ffill()

    if df[date_col].isna().all():
        # No event seen at all: nothing to measure from
        df[days_col] = np.nan
    else:
        df[days_col] = (df['eoy'] - df[date_col]).dt.days

    return df
|
|
|
|
def track_event(x, desc, single):
    """
    Replace nulls with empty strings and flag entries matching a description
    --------
    :param x: series of str features to track
    :param desc: str (single description) or collection of str to compare
    :param single: boolean, True to test whether desc is contained in each
        entry e.g. "COPD", False to test whether each entry is contained
        in desc
    :return: list of booleans, one per entry of x
    """
    entries = x.fillna('')

    if not single:
        # Entry must be a member of (or contained in) desc
        return [entry in desc for entry in entries]

    # Single description must be contained in the entry
    return [desc in entry for entry in entries]
|
|
|
|
def add_hist_adm_presc(df, typ, dt_col):
    """
    Historical features: add days-since and cumulative to-date columns
    --------
    :param df: dataframe to be updated
    :param typ: type of data - 'presc' selects the prescribing features,
        any other value selects the admission features; df must also
        contain a column named typ
    :param dt_col: string name of date column
    :return: updated dataframe with historical columns added
    """
    if typ != 'presc':
        # Admissions: days-since features, then running event totals
        for event in ('adm', 'copd', 'resp'):
            df = add_days_since_event(df, event, dt_col)
        for event in ('copd', 'resp', 'anxiety_depression'):
            df[event + '_to_date'] = df[event + '_event'].cumsum()
    else:
        # Prescribing: order by date first so the running totals and the
        # forward-filled rescue date follow time order
        ordered = df.sort_values(dt_col)
        df = ordered.reset_index(drop=True)
        df = add_days_since_event(df, 'rescue', dt_col)
        df['rescue_to_date'] = df['rescue_meds'].cumsum()
        df['anxiety_depression_presc_to_date'] = (
            df['anxiety_depression_presc'].cumsum())

    # Running total of the typ column itself.
    # NOTE(review): assumed to apply to both dataset types - confirm.
    df[typ + '_to_date'] = df[typ].cumsum()

    return df
|
|
|
|
def correct_column_names(cols, typ):
    """
    Convert column names to lower case and fill any spaces with underscores
    --------
    :param cols: pandas Index/Series of string column names (must expose
        the .str accessor)
    :param typ: type of dataset being updated; for 'presc' any '+' or '-'
        is treated as a separator and runs of whitespace collapse to a
        single underscore
    :return: list of cleaned column names
    """
    print('Correcting column headers')

    if typ == 'presc':
        # regex=True must be explicit: from pandas 2.0 str.replace defaults
        # to literal matching, which would leave '+' and '-' untouched
        spaced = cols.str.replace(r'[+-]', ' ', regex=True).str.lower()
        return ['_'.join(name.split()) for name in spaced]

    # Plain literal replacement of each space with an underscore
    return cols.str.lower().str.replace(' ', '_', regex=False).tolist()