| import argparse |
| import collections |
| import gzip |
| import html |
| import json |
| import os |
| import random |
| import re |
| import torch |
| from tqdm import tqdm |
| import numpy as np |
| from utils import check_path, clean_text, amazon18_dataset2fullname, write_json_file, write_remap_index |
|
|
def load_ratings(file):
    """Read a comma-separated ratings file into user, item and interaction sets.

    Every line is expected to hold exactly four comma-separated fields in the
    order ``item,user,rating,time``. A line that does not split into four
    fields, or whose rating/time cannot be converted to ``float``/``int``,
    is printed and skipped.

    Args:
        file: Path of the ratings file to read.

    Returns:
        A ``(users, items, inters)`` tuple of sets, where ``inters`` holds
        ``(user, item, float(rating), int(time))`` tuples.
    """
    users, items, inters = set(), set(), set()
    with open(file, 'r') as fp:
        for line in tqdm(fp, desc='Load ratings'):
            try:
                item, user, rating, timestamp = line.strip().split(',')
                record = (user, item, float(rating), int(timestamp))
            except ValueError:
                # Wrong number of fields or a non-numeric rating/time.
                print(line)
                continue
            # Update the result sets only after the whole row has parsed, so
            # a malformed row (e.g. a header line) cannot leave its IDs
            # behind in ``users``/``items`` without a matching interaction.
            users.add(user)
            items.add(item)
            inters.add(record)
    return users, items, inters
|
|
|
|
def load_meta_items(file):
    """Load item metadata from a gzip file holding one JSON object per line.

    Args:
        file: Path of the gzip-compressed metadata file.

    Returns:
        Dict mapping each record's ``asin`` to a dict with the keys
        ``title``, ``description``, ``brand`` and ``categories``. A record
        missing any of the source fields raises ``KeyError``.
    """
    meta = {}
    with gzip.open(file, "r") as fp:
        for raw_line in tqdm(fp, desc="Load metas"):
            record = json.loads(raw_line)
            asin = record["asin"]

            features = {
                "title": clean_text(record["title"]),
                "description": clean_text(record["description"]),
                # Drop the "by\n" marker embedded in the brand field.
                "brand": record["brand"].replace("by\n", "").strip(),
            }

            # Keep category entries only up to (not including) the first one
            # that contains a "</span>" fragment.
            kept_categories = []
            for entry in record["category"]:
                if "</span>" in entry:
                    break
                kept_categories.append(entry.strip())
            features["categories"] = ",".join(kept_categories).strip()

            meta[asin] = features

    return meta
|
|
|
|
def load_review_data(args, user2id, item2id):
    """Load review text for the (user, item) pairs present in both ID maps.

    Reads ``<args.input_path>/Review/<full dataset name>.json.gz``, one JSON
    object per line. Records whose ``reviewerID`` or ``asin`` is not in
    ``user2id`` / ``item2id`` are ignored.

    Args:
        args: Object providing ``dataset`` and ``input_path`` attributes.
        user2id: Mapping from raw reviewer ID to remapped user ID.
        item2id: Mapping from raw ``asin`` to remapped item ID.

    Returns:
        Dict keyed by ``str((uid, iid))`` whose values are
        ``{"review": ..., "summary": ...}``; a missing ``reviewText`` or
        ``summary`` field becomes an empty string. A later record for the
        same pair overwrites an earlier one.
    """
    dataset_full_name = amazon18_dataset2fullname[args.dataset]
    review_file_path = os.path.join(args.input_path, 'Review', dataset_full_name + '.json.gz')

    reviews = {}

    with gzip.open(review_file_path, "r") as fp:
        for line in tqdm(fp, desc='Load reviews'):
            try:
                # Decode inside the try block: json.JSONDecodeError is a
                # ValueError subclass, so a malformed line is reported by the
                # handler below instead of aborting the whole load.
                inter = json.loads(line)
                user = inter['reviewerID']
                item = inter['asin']
                if user not in user2id or item not in item2id:
                    continue
                uid = user2id[user]
                iid = item2id[item]
                review = clean_text(inter['reviewText']) if 'reviewText' in inter else ''
                summary = clean_text(inter['summary']) if 'summary' in inter else ''
                reviews[str((uid, iid))] = {"review": review, "summary": summary}
            except ValueError:
                print(line)

    return reviews
|
|
|
|
def get_user2count(inters):
    """Count the number of interaction records per user.

    Args:
        inters: Iterable of records whose element 0 is the user.

    Returns:
        ``collections.defaultdict(int)`` mapping user -> record count.
    """
    counts = collections.defaultdict(int)
    for user in (record[0] for record in inters):
        counts[user] = counts[user] + 1
    return counts
|
|
|
|
def get_item2count(inters):
    """Count the number of interaction records per item.

    Args:
        inters: Iterable of records whose element 1 is the item.

    Returns:
        ``collections.defaultdict(int)`` mapping item -> record count.
    """
    counts = collections.defaultdict(int)
    for item in (record[1] for record in inters):
        counts[item] = counts[item] + 1
    return counts
|
|
|
|
def generate_candidates(unit2count, threshold):
    """Select the units whose count reaches ``threshold``.

    Args:
        unit2count: Mapping from unit (user or item) to its count.
        threshold: Minimum count (inclusive) for a unit to be kept.

    Returns:
        ``(kept, n_dropped)``: the set of kept units and the number of units
        in ``unit2count`` that fell below the threshold.
    """
    kept = {unit for unit, count in unit2count.items() if count >= threshold}
    n_dropped = len(unit2count) - len(kept)
    return kept, n_dropped
|
|
|
|
def filter_inters(inters, can_items=None,
                  user_k_core_threshold=0, item_k_core_threshold=0):
    """Filter interactions by candidate items and then by iterative k-core.

    Progress is reported with ``print``.

    Args:
        inters: Iterable of records with the user at index 0 and the item at
            index 1.
        can_items: Optional container of allowed items (a dict keyed by item,
            a set, ...). ``None`` disables this step; an *empty* container
            removes every interaction.
        user_k_core_threshold: Minimum number of interactions a user must
            keep. ``0`` imposes no constraint on users.
        item_k_core_threshold: Minimum number of interactions an item must
            keep. ``0`` imposes no constraint on items.

    Returns:
        The surviving interactions. When no step removes anything and
        ``can_items`` is ``None`` the input object itself is returned;
        otherwise a new list in the original relative order.
    """
    # Compare against None explicitly: an empty candidate collection means
    # "nothing is allowed", not "skip the filter".
    if can_items is not None:
        print('\nFiltering by meta items: ')
        inters = [unit for unit in inters if unit[1] in can_items]
        print(' The number of inters: ', len(inters))

    if user_k_core_threshold or item_k_core_threshold:
        print('\nFiltering by k-core:')
        idx = 0
        user2count = collections.Counter(unit[0] for unit in inters)
        item2count = collections.Counter(unit[1] for unit in inters)

        # Removing a user can push an item below its threshold (and vice
        # versa), so repeat until a round removes nobody.
        while True:
            users = {user for user, count in user2count.items()
                     if count >= user_k_core_threshold}
            items = {item for item, count in item2count.items()
                     if count >= item_k_core_threshold}
            if len(users) == len(user2count) and len(items) == len(item2count):
                break

            kept = []
            user2count, item2count = collections.Counter(), collections.Counter()
            for unit in inters:
                if unit[0] in users and unit[1] in items:
                    kept.append(unit)
                    user2count[unit[0]] += 1
                    item2count[unit[1]] += 1
            inters = kept
            idx += 1
            print(f' Epoch {idx} The number of inters: {len(inters)}, '
                  f'users: {len(user2count)}, items: {len(item2count)}')
    return inters
|
|
|
|
def make_inters_in_order(inters):
    """Order each user's interactions by timestamp and drop repeated items.

    Records are grouped by user (groups appear in order of each user's first
    occurrence in ``inters``), each group is sorted by timestamp with a stable
    sort, and for every item only the earliest record of that user is kept.

    Args:
        inters: Iterable of ``(user, item, rating, timestamp)`` records.

    Returns:
        List of ``(user, item, rating, timestamp)`` tuples.
    """
    per_user = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        per_user[user].append((user, item, rating, timestamp))

    ordered = []
    for history in per_user.values():
        history.sort(key=lambda record: record[3])
        seen_items = set()
        for record in history:
            if record[1] not in seen_items:
                seen_items.add(record[1])
                ordered.append(record)
    return ordered
|
|
|
|
def preprocess_rating(args):
    """Load ratings and metadata, then filter and order the interactions.

    Steps: read the ratings CSV and the metadata gzip for ``args.dataset``
    from ``args.input_path``, order/deduplicate the interactions per user,
    keep only items that have metadata, apply k-core filtering with
    ``args.user_k`` / ``args.item_k``, and order the result once more.

    Args:
        args: Object providing ``dataset``, ``input_path``, ``user_k`` and
            ``item_k`` attributes.

    Returns:
        ``(rating_inters, meta_items)``: the filtered, ordered interaction
        list and the metadata dict returned by ``load_meta_items``.
    """
    full_name = amazon18_dataset2fullname[args.dataset]

    print('Process rating data: ')
    print(' Dataset: ', args.dataset)

    # Only the interaction set is needed; the user/item sets are discarded.
    ratings_csv = os.path.join(args.input_path, 'Ratings', full_name + '.csv')
    _, _, raw_inters = load_ratings(ratings_csv)

    meta_gz = os.path.join(args.input_path, 'Metadata', f'meta_{full_name}.json.gz')
    meta_items = load_meta_items(meta_gz)

    print('The number of raw inters: ', len(raw_inters))

    ordered_inters = make_inters_in_order(raw_inters)
    filtered_inters = filter_inters(
        ordered_inters,
        can_items=meta_items,
        user_k_core_threshold=args.user_k,
        item_k_core_threshold=args.item_k,
    )

    # Re-order after filtering so the returned list is grouped per user and
    # sorted by timestamp.
    final_inters = make_inters_in_order(filtered_inters)
    print('\n')

    return final_inters, meta_items
|
|
def convert_inters2dict(inters):
    """Remap raw user/item IDs to consecutive integers and group items by user.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        inters: Iterable of ``(user, item, rating, timestamp)`` records.

    Returns:
        ``(user2items, user2index, item2index)`` where ``user2items`` is a
        ``defaultdict(list)`` mapping user index -> list of item indices in
        input order, and the other two map raw IDs to their indices.
    """
    user2index, item2index = {}, {}
    user2items = collections.defaultdict(list)
    for user, item, _rating, _timestamp in inters:
        # setdefault only stores the new index when the ID is unseen.
        user_idx = user2index.setdefault(user, len(user2index))
        item_idx = item2index.setdefault(item, len(item2index))
        user2items[user_idx].append(item_idx)
    return user2items, user2index, item2index
|
|
def generate_data(args, rating_inters):
    """Split every user's item sequence leave-one-out into train/valid/test.

    For each user the last item becomes the test target, the second-to-last
    the validation target, and everything before that the training sequence.
    Item indices are stored as strings in the three split dicts. A user with
    fewer than two interactions raises ``IndexError``.

    Args:
        args: Object providing a ``dataset`` attribute (used for logging).
        rating_inters: Iterable of ``(user, item, rating, timestamp)`` records.

    Returns:
        ``(user2items, train_inters, valid_inters, test_inters, user2index,
        item2index)``; the split dicts are keyed by integer user index.
    """
    print('Split dataset: ')
    print(' Dataset: ', args.dataset)

    user2items, user2index, item2index = convert_inters2dict(rating_inters)

    train_inters, valid_inters, test_inters = {}, {}, {}
    for u_index in range(len(user2index)):
        history = user2items[u_index]

        train_part = [str(item_idx) for item_idx in history[:-2]]
        valid_part = [str(history[-2])]
        test_part = [str(history[-1])]

        train_inters[u_index] = train_part
        valid_inters[u_index] = valid_part
        test_inters[u_index] = test_part

        # The three parts together must account for the full history.
        assert len(user2items[u_index]) == \
            len(train_part) + len(valid_part) + len(test_part)

    return user2items, train_inters, valid_inters, test_inters, user2index, item2index
|
|
_INTER_HEADER = 'user_id:token\titem_id_list:token_seq\titem_id:token\n'


def _write_inter_file(path, rows):
    """Write the header plus one ``uid<TAB>history<TAB>target`` line per row."""
    with open(path, 'w') as file:
        file.write(_INTER_HEADER)
        file.writelines(
            f'{uid}\t{" ".join(seq)}\t{target}\n' for uid, seq, target in rows
        )


def convert_to_atomic_files(args, train_data, valid_data, test_data, max_seq_len=50):
    """Write the train/valid/test ``.inter`` files for a dataset.

    Files are written to ``<args.output_path>/<args.dataset>/`` as
    ``<dataset>.train.inter``, ``<dataset>.valid.inter`` and
    ``<dataset>.test.inter``; the directory must already exist. Users are
    written in ascending numeric order of their ID.

    * train: for every user, each item of the training sequence except the
      first is emitted as a target together with the items preceding it.
    * valid: the training sequence as history, the validation item as target.
    * test: training + validation items as history, the test item as target.

    Every history is truncated to its most recent ``max_seq_len`` items.

    Args:
        args: Object providing ``output_path`` and ``dataset`` attributes.
        train_data: Mapping user ID -> list of item ID strings.
        valid_data: Mapping user ID -> single-element list with the
            validation item ID string.
        test_data: Mapping user ID -> single-element list with the test item
            ID string.
        max_seq_len: Maximum number of history items per line (default 50,
            the previously hard-coded value).

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 1.
    """
    # seq[-0:] would silently return the whole sequence, so reject it.
    if max_seq_len < 1:
        raise ValueError(f'max_seq_len must be >= 1, got {max_seq_len}')

    print('Convert dataset: ')
    print(' Dataset: ', args.dataset)
    uid_list = sorted(train_data.keys(), key=int)
    out_dir = os.path.join(args.output_path, args.dataset)

    def train_rows():
        for uid in uid_list:
            item_seq = train_data[uid]
            # Walk targets from the last item backwards; the first item is
            # never a target because it has no preceding history.
            for target_idx in range(1, len(item_seq)):
                history = item_seq[:-target_idx][-max_seq_len:]
                yield uid, history, item_seq[-target_idx]

    def valid_rows():
        for uid in uid_list:
            yield uid, train_data[uid][-max_seq_len:], valid_data[uid][0]

    def test_rows():
        for uid in uid_list:
            history = (train_data[uid] + valid_data[uid])[-max_seq_len:]
            yield uid, history, test_data[uid][0]

    _write_inter_file(os.path.join(out_dir, f'{args.dataset}.train.inter'), train_rows())
    _write_inter_file(os.path.join(out_dir, f'{args.dataset}.valid.inter'), valid_rows())
    _write_inter_file(os.path.join(out_dir, f'{args.dataset}.test.inter'), test_rows())
|
|
def parse_args():
    """Parse the command-line options of the preprocessing script.

    Returns:
        ``argparse.Namespace`` with ``dataset`` (default ``'Arts'``),
        ``user_k`` and ``item_k`` (default ``5``), ``input_path`` and
        ``output_path`` (default ``''``).
    """
    option_specs = (
        ('--dataset', {'type': str, 'default': 'Arts', 'help': 'Instruments / Arts / Games'}),
        ('--user_k', {'type': int, 'default': 5, 'help': 'user k-core filtering'}),
        ('--item_k', {'type': int, 'default': 5, 'help': 'item k-core filtering'}),
        ('--input_path', {'type': str, 'default': ''}),
        ('--output_path', {'type': str, 'default': ''}),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
|
|
|
if __name__ == '__main__':
    # Script entry point: preprocess the ratings, split them per user and
    # write every output file under <output_path>/<dataset>/.
    args = parse_args()

    rating_inters, meta_items = preprocess_rating(args)

    (all_inters, train_inters, valid_inters, test_inters,
     user2index, item2index) = generate_data(args, rating_inters)

    out_dir = os.path.join(args.output_path, args.dataset)
    check_path(out_dir)

    # Full per-user item sequences, then the train/valid/test .inter files.
    write_json_file(all_inters, os.path.join(out_dir, f'{args.dataset}.inter.json'))
    convert_to_atomic_files(args, train_inters, valid_inters, test_inters)

    # Re-key the metadata by remapped item index.
    item2feature = collections.defaultdict(dict)
    item2feature.update(
        (item_id, meta_items[item]) for item, item_id in item2index.items()
    )

    print("user:", len(user2index))
    print("item:", len(item2index))

    write_json_file(item2feature, os.path.join(out_dir, f'{args.dataset}.item.json'))

    # Raw-ID -> index maps for users and items.
    write_remap_index(user2index, os.path.join(out_dir, f'{args.dataset}.user2id'))
    write_remap_index(item2index, os.path.join(out_dir, f'{args.dataset}.item2id'))