| import sys |
| import json |
| import pandas as pd |
| import numpy as np |
| import pickle |
|
|
|
|
def extract_year(df, eoy_date):
    """
    Select a single year's worth of rows
    --------
    :param df: dataframe with an 'eoy' column to filter on
    :param eoy_date: user-specified end of year date to match
    :return: subset of df whose 'eoy' value equals eoy_date
    """
    year_mask = df['eoy'] == eoy_date
    return df.loc[year_mask]
|
|
|
|
def read_yearly_data(data_path, data_type, eoy_date):
    """
    Read in data for year required
    --------
    :param data_path: path to generated data
    :param data_type: type of data to read in
    :param eoy_date: user-specified end of year date
    :return: tuple of (year's data minus the id/eoy columns, list of SafeHavenIDs)
    """
    df = pd.read_pickle(data_path + 'min_max_' + data_type + '.pkl')
    # Copy the year slice before mutating it: extract_year returns a view of
    # df, so pop()/drop() on it would raise SettingWithCopyWarning and could
    # write through to the parent dataframe.
    df_year = extract_year(df, eoy_date).copy()
    ids = df_year.pop('SafeHavenID').to_list()
    df_year = df_year.drop('eoy', axis=1)

    return df_year, ids
|
|
|
|
def main():
    """
    Predict cluster labels for one year of data and persist the result.
    --------
    Reads paths/dates from ../../../config.json, the data type from
    sys.argv[1] and the run name from sys.argv[2]; loads the trained
    decision-tree model, predicts a cluster per row and writes the
    labelled (unscaled) dataframe back to pickle.
    """
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    eoy_date = config['date']
    data_path = config['model_data_path']

    data_type = sys.argv[1]
    run_name = sys.argv[2]

    print('Loading data')
    # Column subset selected for this run (saved alongside the model).
    columns = np.load(data_path + run_name + '_cols.npy', allow_pickle=True)
    df_scaled, ids = read_yearly_data(data_path, data_type, eoy_date)
    df_scaled_reduced = df_scaled[columns]
    df_unscaled_full = pd.read_pickle(data_path + 'filled_' + data_type + '.pkl')
    # Copy the year slice so the 'cluster' assignment below does not write
    # into a view of df_unscaled_full (SettingWithCopyWarning).
    df_unscaled = extract_year(df_unscaled_full, eoy_date).copy()

    print('Loading model')
    clf_model_file = data_path + run_name + '_dtc_model.pkl'
    # Context manager closes the model file promptly; the original
    # pickle.load(open(...)) leaked the handle until garbage collection.
    with open(clf_model_file, 'rb') as model_file:
        clf = pickle.load(model_file)

    print('Predicting clusters')
    labels = clf.predict(df_scaled_reduced.to_numpy())
    df_unscaled['cluster'] = labels
    df_unscaled.to_pickle(data_path + '_'.join((run_name, data_type, 'clusters.pkl')))


# Guard the entry point so importing this module does not trigger the run.
if __name__ == '__main__':
    main()
|
|