Model E: Unsupervised PCA + clustering risk stratification

53a6def 22 days ago

2.17 kB

	"""
	Admission reduction utilities
	"""
	import pandas as pd
	from datetime import date


	def _eoy_for_year(year, month, day):
	    """
	    Build the end-of-year date for one calendar year
	    --------
	    :param year: calendar year of the date to build
	    :param month: month taken from the anchor 'eoy' date
	    :param day: day taken from the anchor 'eoy' date
	    :return: date in the given year, with 29 Feb clamped to 28 Feb
	    """
	    try:
	        return date(year, month, day)
	    except ValueError:
	        # month/day are read from a real date, so the only combination that
	        # can be invalid is 29 Feb in a non-leap year
	        if (month, day) != (2, 29):
	            raise
	        return date(year, 2, 28)


	def fill_missing_years(df):
	    """
	    Add zero-admission rows for years with no 'eoy' entry between the first
	    and last year present
	    --------
	    :param df: dataframe to be updated. SafeHavenID and eth_grp are copied
	        from its first row, so it is assumed to hold a single patient
	    :return: dataframe sorted by ADMDATE with missing years added; an empty
	        dataframe is returned unchanged
	    """
	    # Nothing to fill, and no first row to read the anchor date from
	    if df.empty:
	        return df

	    df = df.sort_values('ADMDATE')
	    eoy = df.eoy.dt
	    year_col = eoy.year.tolist()
	    end_month = eoy.month.iloc[0]
	    end_day = eoy.day.iloc[0]

	    # We only want missing years strictly between the first and last year
	    seen_years = set(year_col)
	    years = [y for y in range(year_col[0] + 1, year_col[-1])
	             if y not in seen_years]

	    # If any years missing add rows
	    if years:
	        sh_id = df.SafeHavenID.iloc[0]
	        eth_grp = df.eth_grp.iloc[0]
	        adm_dates = pd.to_datetime(
	            [_eoy_for_year(y, end_month, end_day) for y in years])
	        # Filler rows carry zero counts; ADMDATE and eoy share the same date.
	        # Columns of df not listed here are left missing by the concat.
	        data = {'SafeHavenID': sh_id, 'eth_grp': eth_grp, 'ADMDATE': adm_dates,
	                'STAY': 0, 'copd_event': 0, 'resp_event': 0, 'eoy': adm_dates,
	                'adm': 0, 'anxiety_depression_event': 0}
	        missed_years = pd.DataFrame(data)
	        df = pd.concat([df, missed_years]).sort_values('ADMDATE')

	    return df


	def calc_adm_per_year(df):
	    """
	    Collapse admissions to a single row per patient and 'eoy' date
	    --------
	    :param df: dataframe to be reduced
	    :return: dataframe indexed by (SafeHavenID, eoy) holding the last
	        to-date values, the mean stay and the per-year totals
	    """
	    per_year = df.groupby(['SafeHavenID', 'eoy'])

	    # Running / to-date columns: keep the last value seen in each group
	    carried = ['eth_grp', 'days_since_copd', 'days_since_resp',
	               'days_since_adm', 'adm_to_date', 'copd_to_date',
	               'resp_to_date', 'anxiety_depression_to_date', 'copd_date',
	               'resp_date', 'adm_date']
	    latest = per_year[carried].last()

	    # Mean length of stay within the group
	    mean_stay = per_year[['STAY']].mean().rename(columns={'STAY': 'mean_los'})

	    # Event counts and stay days summed over the group (source -> output name)
	    summed = {'adm': 'adm_per_year',
	              'copd_event': 'copd_per_year',
	              'resp_event': 'resp_per_year',
	              'anxiety_depression_event': 'anxiety_depression_per_year',
	              'STAY': 'total_hosp_days'}
	    yearly_totals = per_year[list(summed)].sum().rename(columns=summed)

	    # All three pieces share the same (SafeHavenID, eoy) index
	    return latest.join(mean_stay).join(yearly_totals)