copd-model-e / validation /spirometry_scripts /spirometry_RC_SU_mapping.py

Model E: Unsupervised PCA + clustering risk stratification

53a6def 23 days ago

2.84 kB

	"""
	Map GOLD standard COPD groupings from REC/SUP IDs to SafeHavenIDs.
	--------
	NB: Data contained within 'RC_SU1_spirometry_data.csv' has been created from
	data within the teams space.
	"""
	import pandas as pd


	# Destination pickle for the SH-mapped spirometry extract
	output_file_path = '<YOUR_DATA_PATH>/Model_E_Extracts/rec_sup_spirometry_data.pkl'

	# Project root, and the folder beneath it that holds the input CSV
	file_path = '<YOUR_DATA_PATH>/copd.model-e/'
	input_file_path = file_path + 'training/src/data/'


	def read_data(file):
	    """
	    Load a CSV file into a dataframe
	    --------
	    :param file: string filename of the CSV to load
	    :return: dataframe of the file contents
	    """
	    return pd.read_csv(file)


	def calc_gold_grade(data):
	    """
	    Assign a GOLD grade for COPD classification from a FEV1% value
	    --------
	    :param data: row (or mapping) holding a scalar 'FEV1%' value
	    :return: 'GOLD 1' (>= 80), 'GOLD 2' (50 to < 80), 'GOLD 3' (30 to < 50),
	        'GOLD 4' (< 30), or '' when no comparison holds (e.g. NaN)
	    """
	    fev1_pct = data['FEV1%']

	    # Thresholds are tested from highest to lowest, so each check only
	    # needs the lower bound of its band.
	    if fev1_pct >= 80:
	        return 'GOLD 1'
	    if fev1_pct >= 50:
	        return 'GOLD 2'
	    if fev1_pct >= 30:
	        return 'GOLD 3'
	    if fev1_pct < 30:
	        return 'GOLD 4'

	    # Reached only when every comparison above is False (e.g. a NaN value)
	    return ''


	def add_SH_mappings_for_RC_and_SU1(RC_IDs, SU1_IDs, spirometry_data):
	    """
	    Attach SH IDs to the RC and SU1 spirometry records
	    --------
	    :param RC_IDs: dataframe of RECEIVER - SH ID mappings, keyed by 'RNo'
	    :param SU1_IDs: dataframe of SU1 - SH ID mappings, keyed by 'Study_Number'
	    :param spirometry_data: spirometry data for RC and SU1, keyed by 'StudyId'
	    :return: spirometry rows joined to their SH ID mapping columns, with
	        'FEV1%' and 'SafeHavenID' cast to int32
	    --------
	    NB: rows with a missing value in any column after the join are dropped,
	    which includes participants that have no SH ID mapping.
	    """
	    # Give both lookups the same key column so they can be stacked
	    id_lookups = [
	        RC_IDs.rename(columns={'RNo': 'StudyId'}),
	        SU1_IDs.rename(columns={'Study_Number': 'StudyId'}),
	    ]
	    combined_lookup = pd.concat(id_lookups, ignore_index=True)

	    # Left join keeps every spirometry row; unmatched rows get NaN IDs
	    joined = pd.merge(
	        spirometry_data, combined_lookup, on="StudyId", how="left")
	    complete_rows = joined.dropna()

	    return complete_rows.astype({'FEV1%': 'int32', 'SafeHavenID': 'int32'})


	def main():
	    """
	    Build the SH-mapped spirometry extract and save it as a pickle
	    """
	    # Load the spirometry records, keeping only rows with no missing values
	    spiro_path = input_file_path + "RC_SU1_spirometry_data.csv"
	    spiro_df = read_data(spiro_path).dropna()

	    # Grade each study participant row by row
	    gold_grades = spiro_df.apply(calc_gold_grade, axis=1)
	    spiro_df['GOLD grade'] = gold_grades

	    # Load the RC and SU1 SafeHaven ID mapping files
	    rc_lookup = read_data("<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/Cohort3Rand.csv")
	    su1_lookup = read_data("<YOUR_DATA_PATH>/SU_IDs/Scale_Up_lookup.csv")

	    # Attach the SH IDs and write the result out
	    add_SH_mappings_for_RC_and_SU1(
	        RC_IDs=rc_lookup, SU1_IDs=su1_lookup, spirometry_data=spiro_df,
	    ).to_pickle(output_file_path)


	# Run the pipeline only when this file is executed as a script, so that
	# importing the module does not read the input files or write the pickle.
	if __name__ == "__main__":
	    main()