Spaces:
Runtime error
Runtime error
| import os | |
| from pyvi.ViTokenizer import tokenize | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| import pymongo | |
| from generate_embedding import generate_embedding | |
# Credentials are read from the environment instead of being embedded in
# source. OPENAI_API_KEY is expected to be exported by the deployment already
# (for example as a secret of the hosting platform), so it is intentionally
# not assigned here.
# NOTE(review): the API key and MongoDB password that used to be committed in
# this file must be treated as leaked and rotated.

# Connect DB
_mongodb_uri = os.environ.get("MONGODB_URI")
if not _mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "before importing this module."
    )
client = pymongo.MongoClient(_mongodb_uri)
db = client.rag
collection = db.pdf
def insertData(chunk):
    """Insert the given documents into the module-level ``collection``.

    Args:
        chunk: The documents to store, passed straight to ``insert_many``.

    Returns:
        Whatever ``collection.insert_many`` returns.
    """
    insert_result = collection.insert_many(chunk)
    return insert_result
def deleteByUserId(user_id: str):
    """Delete every document in ``collection`` whose ``user_id`` matches.

    Args:
        user_id: Value compared against the ``user_id`` field.

    Returns:
        Whatever ``collection.delete_many`` returns.
    """
    query = {"user_id": user_id}
    delete_result = collection.delete_many(query)
    return delete_result
def readFromPDF(
    path: str = "data/cds.pdf",
    min_page: int = 10,
    chunk_size: int = 768,
    chunk_overlap: int = 200,
):
    """Load a PDF, drop its leading pages and split the rest into chunks.

    The defaults reproduce the previously hard-coded behaviour, so existing
    callers that pass no arguments are unaffected.

    Args:
        path: Location of the PDF file to load.
        min_page: Documents whose ``metadata["page"]`` is lower than this
            value are discarded before splitting.
        chunk_size: ``chunk_size`` handed to the text splitter.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        A list of dicts, one per chunk, each with the chunk text under
        ``"page_content"`` and its position in the chunk sequence under
        ``"index"``.
    """
    # load PDF
    loader = PyPDFLoader(path)
    # NOTE(review): the page number in the metadata is presumably 0-based, so
    # the default cutoff skips the first ten pages -- confirm against the loader.
    pages = [
        page
        for page in loader.load_and_split()
        if page.metadata["page"] >= min_page
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(pages)

    # The per-chunk debug print of the running index was dropped; the index is
    # still recorded on every item.
    return [
        {"page_content": chunk.page_content, "index": index}
        for index, chunk in enumerate(chunks)
    ]
def indexData(user_id: str):
    """Rebuild the stored chunks and embeddings for ``user_id``.

    Reads the chunks via ``readFromPDF``, computes an embedding for the
    tokenized text of each one, removes the documents currently stored for
    ``user_id`` and inserts the freshly built ones.

    Args:
        user_id: Identifier written to every stored document and used to
            select the documents that get replaced.
    """
    # All embeddings are computed before anything is deleted, so a failure
    # while embedding leaves the previously stored documents in place.
    contents = [
        {
            "page_content": item["page_content"],
            "page_content_embedding": generate_embedding(
                tokenize(item["page_content"])
            ),
            "user_id": user_id,
            "index": item["index"],
        }
        for item in readFromPDF()
    ]

    deleteByUserId(user_id)
    # insert_many rejects an empty document list; when the PDF produced no
    # chunks there is simply nothing to store.
    if contents:
        insertData(contents)
# Executed whenever this module is run or imported: re-index the bundled PDF
# under the id "cds.pdf".
indexData(user_id="cds.pdf")
| # prompt = hub.pull("rlm/rag-prompt") | |
| # llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) | |
| # def format_docs(docs): | |
| # return "\n\n".join(doc.page_content for doc in docs) | |
| # rag_chain = ( | |
| # {"context": retriever | format_docs, "question": RunnablePassthrough()} | |
| # | prompt | |
| # | llm | |
| # | StrOutputParser() | |
| # ) | |