Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import faiss | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from tqdm import tqdm | |
# 1. Data path configuration.
# Output locations: the FAISS index file and the saved question texts.
DOCS_PATH = "data/index/docs.npy"
INDEX_PATH = "data/index/index.faiss"

# Input JSONL files; init_faiss() reads the "question" field of each record.
source_paths = [
    "data/real_estate_agent/raw/past_papers/brokerage_law.jsonl",
    "data/real_estate_agent/raw/past_papers/civil_law.jsonl",
    "data/real_estate_agent/raw/past_papers/disclosure_taxation.jsonl",
    "data/real_estate_agent/raw/past_papers/introduction.jsonl",
    "data/real_estate_agent/raw/past_papers/public_law.jsonl",
]
# 2. Load the embedding model.
# NOTE: this runs at import time, so importing the module instantiates the model.
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
def _load_questions(paths):
    """Return the non-empty "question" values found in the JSONL files at *paths*."""
    questions = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank or whitespace-only lines (e.g. a trailing newline at
                # the end of the file) are not valid JSON; skip them instead
                # of letting json.loads raise.
                if not line.strip():
                    continue
                question_text = json.loads(line).get("question", "")
                if question_text:  # keep only non-empty questions
                    questions.append(question_text)
    return questions


def init_faiss() -> None:
    """Build a FAISS L2 index over the questions and persist it to disk.

    Reads every JSONL file listed in ``source_paths``, embeds each non-empty
    ``"question"`` value with ``embedding_model``, adds the vectors to a
    ``faiss.IndexFlatL2`` index, then writes the index to ``INDEX_PATH`` and
    the question texts (in index order) to ``DOCS_PATH``.

    Raises:
        ValueError: If none of the source files contains a non-empty question,
            since an index cannot be sized without at least one embedding.
        OSError: If a source file cannot be opened or an output cannot be
            written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # 3. Read the JSONL files.
    questions = _load_questions(source_paths)
    print(f"β μ΄ {len(questions)}κ° μ§λ¬Έ λ‘λ© μλ£")
    if not questions:
        # Without this guard the failure surfaces later as an opaque
        # IndexError on embeddings.shape[1].
        raise ValueError(
            "No questions found in source_paths; cannot build a FAISS index."
        )

    # 4. Create the embeddings.
    embeddings = embedding_model.encode(
        questions,
        batch_size=32,
        show_progress_bar=True,
    )
    # FAISS expects a C-contiguous float32 matrix (one row per vector).
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # 5. Create the FAISS index (exact search, L2 distance).
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # 6. Save. Create the parent directory of *both* outputs; skip an empty
    # dirname (bare filename), which os.makedirs would reject.
    for target in (INDEX_PATH, DOCS_PATH):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
    faiss.write_index(index, INDEX_PATH)
    np.save(DOCS_PATH, questions)
    print("β FAISS μΈλ±μ€μ λ¬Έμ μ μ₯ μλ£!")
# Build and save the index only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    init_faiss()