import pandas as pd
import pymongo
from pyvi.ViTokenizer import tokenize

from src.services.generate_embedding import generate_embedding

# Google Sheet holding the Q&A pairs to index
SHEET_ID = "1MKB6MHgL_lrPB1I69fj2VcVrgmSAMLVNZR1EwSyTSeA"
SHEET_NAME = "Q&A"

# Connect to MongoDB Atlas
client = pymongo.MongoClient(
    "mongodb+srv://rag:p9vojYc9fafYwxE9@rag.xswi7nq.mongodb.net/?retryWrites=true&w=majority&appName=RAG"
)
db = client.rag
collection = db.questionAndAnswers
def insertQuestionAndAnswers(questionAndAnswers):
    """Insert every Q&A document into the collection, logging progress."""
    for index, questionAndAnswer in enumerate(questionAndAnswers):
        print(f"inserting {index}: {questionAndAnswer['question']}")
        collection.insert_one(questionAndAnswer)


def deleteByUserId(user_id: str):
    """Delete all documents previously indexed for this user (sheet) id."""
    return collection.delete_many({"user_id": user_id})
def readDataFromGoogleSheet(sheet_id: str, sheet_name: str):
    """Download the sheet as CSV via the gviz export endpoint and return its rows."""
    url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    df = pd.read_csv(url)
    items = []
    for _, row in df.iterrows():
        items.append(
            {
                "question": row["Question"],
                "answer": row["Answer"],
            }
        )
    print(f"read {len(items)} items from google sheet")
    return items
def indexData(sheet_id: str, sheet_name: str):
    """Read the sheet, embed every question, and replace the sheet's documents in MongoDB."""
    items = readDataFromGoogleSheet(sheet_id, sheet_name)
    questionAndAnswers = []
    for item in items:
        # Word-segment the Vietnamese question before embedding it
        tokenized_question = tokenize(item["question"])
        questionAndAnswer = {
            "question": tokenized_question,
            "answer": item["answer"],
            "question_embedding": generate_embedding(tokenized_question),
            "user_id": sheet_id,
        }
        questionAndAnswers.append(questionAndAnswer)
    # Drop the old documents for this sheet before inserting the fresh ones
    deleteByUserId(sheet_id)
    print(f"deleted data of sheet {sheet_id}")
    insertQuestionAndAnswers(questionAndAnswers)
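
# Example entry point (a sketch only; this module may instead be imported and
# indexData() called from elsewhere). It reindexes the sheet configured at the
# top of this file.
if __name__ == "__main__":
    indexData(SHEET_ID, SHEET_NAME)
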
# Legacy article-indexing loop, kept commented out for reference:
# for index, article in enumerate(data):
#     if index < 6580:
#         continue
#     if len(str(article['title'])) == 0 or len(str(article['description'])) == 0 or len(str(article['link'])) == 0:
#         continue
#     tokenized_title = tokenize(article['title'])
#     tokenized_description = tokenize(article['description'])
#     article = {
#         'title': tokenized_title,
#         'description': tokenized_description,
#         'link': article['link'],
#         # 'title_embedding': generate_embedding(tokenized_title),
#         'title_embedding': [],
#         'description_embedding': generate_embedding(tokenized_title + ": " + tokenized_description),
#     }
#     print(f"processed {index}/{len(articles)}")
#     save_db(article)
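
# Sketch of how the stored question embeddings could be queried with MongoDB Atlas
# Vector Search. Assumptions not present in this file: an Atlas vector search index
# named "question_index" on the "question_embedding" field with "user_id" declared
# as a filter field, and generate_embedding returning a plain list of floats.
def searchSimilarQuestions(query: str, user_id: str, limit: int = 3):
    # Tokenize and embed the query the same way the questions were indexed
    query_embedding = generate_embedding(tokenize(query))
    pipeline = [
        {
            "$vectorSearch": {
                "index": "question_index",  # assumed index name
                "path": "question_embedding",
                "queryVector": query_embedding,
                "numCandidates": 100,
                "limit": limit,
                "filter": {"user_id": user_id},
            }
        },
        # Return only the fields the caller needs
        {"$project": {"_id": 0, "question": 1, "answer": 1}},
    ]
    return list(collection.aggregate(pipeline))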