| """ |
| Script for preprocessing labs data |
| -------- |
| Track median values for labs tests over the previous 2 years for patients |
| with resulting dataset containing 1 row of information per patient per year |
| """ |
| import json |
| import pandas as pd |
| import numpy as np |
| from datetime import date |
| from dateutil.relativedelta import relativedelta |
| from utils.common import (read_data, correct_column_names, |
| first_patient_appearance) |
| from utils.labs_processing import add_total_labs |
|
|
|
|
| def initialize_labs_data(labs_file): |
| """ |
| Load in labs dataset to correct format |
| -------- |
| :param labs_file: labs data file name |
| :return: labs dataframe with correct column names and types |
| """ |
| print('Loading labs data') |
|
|
| |
| old_cols = ['SafeHavenID', 'SAMPLEDATE', 'CLINICALCODEDESCRIPTION', |
| 'QUANTITYVALUE', 'RANGEHIGHVALUE', 'RANGELOWVALUE'] |
| labs_types = ['int', 'object', 'str', 'float', 'float', 'float'] |
| df = read_data(labs_file, old_cols, labs_types) |
|
|
| |
| new_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', |
| 'QuantityValue', 'RangeHighValue', 'RangeLowValue'] |
| mapping = dict(zip(old_cols, new_cols)) |
| df = df.rename(columns=mapping) |
|
|
| |
| df = df.dropna().drop_duplicates() |
|
|
| |
| num_cols = ['QuantityValue', 'RangeHighValue', 'RangeLowValue'] |
| df = df[(df[num_cols] > -1).all(axis=1)] |
|
|
| |
| final_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', |
| 'QuantityValue'] |
| df = df[final_cols] |
|
|
| |
| df['SampleDate'] = pd.to_datetime(df.SampleDate) |
|
|
| return df |
|
|
|
|
| def clean_labs(df): |
| """ |
| Clean descriptions and select relevant tests |
| -------- |
| :param df: pandas dataframe |
| :return: cleaned dataframe |
| """ |
| print('Cleaning labs data') |
|
|
| lab_tests = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', |
| 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', |
| 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', |
| 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', |
| 'PCO2 (temp corrected', 'Platelets', 'Potassium', |
| 'Red Blood Count', 'Serum vitamin B12', 'Sodium', |
| 'Total Bilirubin', 'Urea', 'White Blood Count'] |
|
|
| |
| str_col = 'ClinicalCodeDescription' |
| df[str_col] = df[str_col].str.strip() |
| |
| |
| with open('mappings/test_mapping.json') as json_file: |
| test_mapping = json.load(json_file) |
|
|
| |
| for k, v in test_mapping.items(): |
| df[str_col] = df[str_col].replace(v, k) |
|
|
| |
| df = df[[desc in lab_tests for desc in df[str_col]]] |
|
|
| return df |
|
|
|
|
| def add_neut_lypmh(df): |
| """ |
| Pivot dataframe and calculate neut_lypmh feature |
| -------- |
| :param df: pandas dataframe |
| :return: pivoted dataframe |
| """ |
| print('Calculating neut_lypmh data') |
|
|
| |
| df = pd.pivot_table( |
| df, index=['SafeHavenID', 'SampleDate'], |
| columns=['ClinicalCodeDescription'], values='QuantityValue', |
| dropna=True).reset_index() |
|
|
| |
| df['neut_lymph'] = df.Neutrophils / df.Lymphocytes |
|
|
| |
| df['neut_lymph'] = df.neut_lymph.replace([np.inf, -np.inf], np.nan) |
|
|
| return df |
|
|
|
|
| def add_eoy_column(df, dt_col, eoy_date): |
| """ |
| Add EOY relative to user-specified end date |
| -------- |
| :param df: dataframe |
| :param dt_col: date column in dataframe |
| :param eoy_date: EOY date from config |
| :return: updated df with EOY column added |
| """ |
| |
| df = df.reset_index(drop=True) |
|
|
| |
| end_date = pd.to_datetime(eoy_date) |
| end_month = end_date.month |
| end_day = end_date.day |
|
|
| |
| df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year] |
|
|
| |
| eoy_index = df.columns[df.columns == 'eoy'] |
| adm_vs_eoy = df[dt_col] > df.eoy |
| row_index = df.index[adm_vs_eoy] |
| df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1) |
| df['eoy'] = pd.to_datetime(df.eoy) |
|
|
| return df |
|
|
|
|
| def reduce_labs_data(df, dt_col): |
| """ |
| Reduce dataset to 1 row per ID per year looking back at the median values |
| over the previous 2 years |
| -------- |
| :param df: pandas dataframe |
| :param dt_col: date column |
| :return: reduced labs dataframe |
| """ |
| print('Reducing labs to 1 row per patient per year') |
|
|
| group_cols = ['SafeHavenID', 'eoy'] |
| med_cols = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', |
| 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', |
| 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', |
| 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', |
| 'Platelets', 'Potassium', 'Red Blood Count', 'Sodium', |
| 'Total Bilirubin', 'Urea', 'White Blood Count', 'neut_lymph'] |
|
|
| |
| df['labs'] = 1 |
|
|
| |
| df = df.sort_values(dt_col) |
|
|
| |
| shifted = df[['eoy']] + pd.DateOffset(years=1) |
| new_tab = df[['SafeHavenID', dt_col] + med_cols].join(shifted) |
| combined_cols = ['SafeHavenID', 'eoy', dt_col] + med_cols |
| combined = pd.concat([df[combined_cols], new_tab]) |
| combined = combined.sort_values(dt_col) |
|
|
| |
| df_med = combined.groupby(group_cols).median() |
|
|
| |
| new_med_cols = [col + '_med_2yr' for col in df_med.columns] |
| df_med.columns = new_med_cols |
|
|
| |
| test = [] |
| for k, v in df.groupby('SafeHavenID')['eoy'].unique().to_dict().items(): |
| test.append(df_med.loc[(k, v), ]) |
| df_med = pd.concat(test) |
|
|
| |
| df_last = df[group_cols + ['labs_to_date']] |
| df_last = df_last.groupby(group_cols).last() |
|
|
| |
| df_sum = df[group_cols + ['labs']] |
| df_sum = df.groupby(group_cols)['labs'].sum() |
|
|
| |
| df_sum = df_sum.to_frame() |
| df_sum.columns = ['labs_per_year'] |
|
|
| |
| df_annual = df_med.join(df_last).join(df_sum) |
| |
| return df_annual |
|
|
|
|
| def main(): |
|
|
| |
| with open('../../../config.json') as json_config_file: |
| config = json.load(json_config_file) |
|
|
| |
| labs_file = config['extract_data_path'] + 'SCI_Store_Cohort3R.csv' |
| labs = initialize_labs_data(labs_file) |
|
|
| |
| labs = clean_labs(labs) |
|
|
| |
| data_path = config['model_data_path'] |
| first_patient_appearance(labs, 'SampleDate', 'labs', data_path) |
|
|
| |
| labs = add_neut_lypmh(labs) |
|
|
| |
| labs = add_eoy_column(labs, 'SampleDate', config['date']) |
| labs = labs.sort_values('SampleDate') |
|
|
| |
| labs['labs_to_date'] = 1 |
| labs = labs.groupby('SafeHavenID').apply(add_total_labs) |
| labs = labs.reset_index(drop=True) |
|
|
| |
| labs_yearly = reduce_labs_data(labs, 'SampleDate') |
|
|
| |
| labs_yearly.columns = correct_column_names(labs_yearly.columns, 'labs') |
|
|
| |
| labs_yearly.to_pickle(data_path + 'labs_proc.pkl') |
|
|
|
|
| main() |
|
|