Spaces:
Sleeping
Sleeping
| # Utilities to build a RAG system for querying information about the CAMELS cosmological simulations using LangChain | |
| # Author: Pablo Villanueva Domingo | |
| from langchain import hub | |
| from langchain_chroma import Chroma | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import WebBaseLoader | |
| # Load documentation from urls | |
def load_docs():
    """Load the documentation pages listed in ``urls.txt``.

    Reads one URL per line from the file ``urls.txt`` (resolved relative to
    the current working directory) and fetches the pages with
    ``WebBaseLoader``.

    Returns:
        The documents returned by ``WebBaseLoader(urls).load()``.

    Raises:
        OSError: If ``urls.txt`` cannot be opened.
    """
    # Get urls. The context manager closes the file even if reading fails.
    with open("urls.txt") as urlsfile:
        stripped_lines = (line.strip() for line in urlsfile)
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # so that no empty URL is handed to the loader.
        urls = [url for url in stripped_lines if url]

    # Load the contents of the listed pages
    loader = WebBaseLoader(urls)
    return loader.load()
| # Join content pages for processing | |
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable gives an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)
| # Create a RAG chain | |
def RAG(llm, docs, embeddings, *, chunk_size=1000, chunk_overlap=200):
    """Create a retrieval-augmented generation (RAG) chain over ``docs``.

    The documents are split into chunks, the chunks are indexed in a Chroma
    vector store built with ``embeddings``, and a retriever over that store
    is wired into a chain together with the ``rlm/rag-prompt`` prompt pulled
    from the LangChain hub.

    Args:
        llm: Model placed after the prompt in the chain.
        docs: Documents to split and index.
        embeddings: Embedding object passed to ``Chroma.from_documents``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 1000, the value that
            was previously hard-coded).
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 200, the value that
            was previously hard-coded).

    Returns:
        The composed chain. Its input is passed through unchanged as the
        prompt's "question" and is also sent to the retriever, whose results
        are joined by ``format_docs`` to form the "context"; the prompt
        output goes to ``llm`` and then through ``StrOutputParser``.
    """
    # Split text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(docs)

    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()

    # Prompt basis example for RAG systems
    prompt = hub.pull("rlm/rag-prompt")

    # Create the chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain