Spaces:
Sleeping
Sleeping
| # rag/db/initializer.py | |
| import faiss | |
| import numpy as np | |
| from huggingface_hub import hf_hub_download | |
| from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE | |
| from modules.retriever import set_index | |
| from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row | |
# Module-level cache for the vector-ID mapping: None until
# _load_index_in_memory() assigns the value read by np.load.
| _vector_ids = None | |
def _load_index_in_memory():
    """Download the index and ID mapping from the HF Hub and load them into memory.

    Both files are fetched from the ``HF_DS_REPO_ID`` dataset repository.
    The index is read with ``faiss.read_index`` and handed to ``set_index``;
    the ID mapping is read with ``np.load`` and stored in the module-level
    ``_vector_ids``.

    Both artifacts are fully loaded *before* any shared state is replaced,
    so a failure while downloading or reading leaves the previously
    installed index and ``_vector_ids`` untouched instead of pairing a new
    index with a stale ID mapping.
    """
    global _vector_ids

    index_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID,
        filename=HF_INDEX_FILE,
        repo_type="dataset",
    )
    ids_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID,
        filename=HF_IDS_FILE,
        repo_type="dataset",
    )

    index = faiss.read_index(index_path)
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects
    # from a downloaded file. Keep it only if the ID array really is an
    # object array and the dataset repo is trusted -- confirm.
    vector_ids = np.load(ids_path, allow_pickle=True)

    # Publish only after both loads succeeded so index and IDs stay in sync.
    set_index(index)
    _vector_ids = vector_ids
def get_vector_ids():
    """Return the module-level ``_vector_ids`` value (``None`` until it has been loaded)."""
    # Reading a module global needs no ``global`` declaration.
    return _vector_ids
def initialize_dbs():
    """Prepare the corpus, load the index/ID mapping, and register the page_id -> row lookup."""
    # 1) Prepare the corpus (per the original note: parquet download on first run).
    prepare_corpus()

    # 2) Load the index and ID mapping into memory.
    _load_index_in_memory()

    # 3) Load the datasets and build the page_id -> row mapping. When the same
    #    page_id occurs more than once, the last row seen wins.
    rows_by_page_id = {
        row["page_id"]: row
        for dataset in _get_datasets().values()
        for row in dataset
    }
    set_id_to_row(rows_by_page_id)
def force_update():
    """Reload the index and ID mapping by calling ``_load_index_in_memory`` again."""
    _load_index_in_memory()