Uncomment and run the cell below if the dependencies are not installed

In [None]:
# Install the notebook's dependencies (uncomment the lines you need).
# !pip install pypdf
# !pip install python-dotenv  # PyPI name of the package imported as `dotenv`
# !pip install -q streamlit
# %pip install -qU langchain-groq
# !pip install -U langchain-community
# NOTE(review): later cells also import `sentence_transformers`, `openai` and
# `requests`, and use the Chroma vector store — confirm those packages
# (e.g. sentence-transformers, chromadb) are installed as well.

In [None]:
import os
import sys
import openai
import requests
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [None]:
# Add the directory two levels up (relative to the working directory) to the
# module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv

# Read the local .env file so its variables become visible through os.environ.
_ = load_dotenv(find_dotenv())

# Prefer a key supplied through the environment / .env file. The placeholder is
# only a fallback, so a real key is never overridden by (or pasted into) this cell.
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR OPENAI_API_KEY")

In [None]:
# Keep a GROQ_API_KEY that is already present (for example one loaded from the
# .env file by load_dotenv); fall back to the placeholder only when none is set,
# instead of unconditionally overwriting the real key.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = "YOUR GROQ_API_KEY"

In [3]:
from langchain.document_loaders import PyPDFLoader

In [4]:
def download_arxiv(arxiv_id, save_path, timeout=30):
    """Download the PDF of an arXiv paper and write it to ``save_path``.

    Args:
        arxiv_id: arXiv identifier; it is interpolated into the URL as-is, so
            pass it as a string (e.g. ``"2211.10381"``) to keep it exact.
        save_path: Path of the file the PDF bytes are written to.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page under a .pdf name.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

In [5]:
# Pass the arXiv id as a string: a float literal is re-formatted when it is
# interpolated into the URL, which drops trailing zeros (2211.10380 -> "2211.1038").
download_arxiv("2211.10381", "2211.10381.pdf")

In [6]:
# Open the downloaded paper with the PDF loader and keep what load() returns.
pdf_path = "2211.10381.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [7]:
# Concatenate the text of every loaded page into one newline-separated string.
page_texts = (page.page_content for page in pages)
combinae_pages = "\n".join(page_texts)

In [9]:
# Splitter that works through the separators in the listed order; chunk sizes
# and overlap are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    separators=["\n\n", "\n", r"(?<=\.)", " ", ""],
    # The lookbehind above is a regex. Without this flag the separators are
    # matched as literal text, so the "split after a period" rule never fires.
    is_separator_regex=True,
)

In [10]:
# Split the loaded pages into chunks with the configured text splitter.
docs = text_splitter.split_documents(pages)

In [11]:
# Show the text of the first chunk as a quick sanity check of the split.
docs[0].page_content

'Environmental Data Science (2023), xx: 1–22\ndoi:10.1017/xxxx\nRESEARCH ARTICLE\nEnvironmental Sensor Placement with Convolutional Gaus-\nsian Neural Processes\nTom R. Andersson 1\n *, Wessel P. Bruinsma 2, Stratis Markou 3, James Requeima 4, Alejandro Coca-\nCastro5, Anna Vaughan 3, Anna-Louise Ellis 6, Matthew A. Lazzara 7,8, Dani Jones †1, J. Scott\nHosking†1,5 and Richard E. Turner†2,3\n1British Antarctic Survey, NERC, UKRI\n2Microsoft Research AI4Science\n3University of Cambridge\n4Vector Institute\n5The Alan Turing Institute\n6Met Office\n7University of Wisconsin-Madison\n8Madison Area Technical College\n†\nJoint senior authors\n*Corresponding author. Email: tomand@bas.ac.uk\nReceived: 1 February 2023; Revised: 29 March 2023; Accepted: 05 May 2023\nKeywords: sensor placement, neural processes, active learning, meta-learning\nAbstract\nEnvironmental sensors are crucial for monitoring weather conditions and the impacts of climate change. However,'

In [12]:
from sentence_transformers import SentenceTransformer

In [None]:
# Instantiate the sentence-embedding model identified by this model id.
st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [14]:
# Embed the text of every chunk; convert_to_numpy=True asks for a NumPy result.
chunk_texts = [chunk.page_content for chunk in docs]
vectors = st_model.encode(chunk_texts, convert_to_numpy=True)

In [15]:
# Inspect the shape of the embeddings array (one row per encoded chunk).
vectors.shape

(139, 384)

#### Create embeddings, store in vector DB, and query to find the most relevant documents

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Deterministic ids keep this cell safe to re-run: the collection lives on disk
# in ./chroma_db, and without explicit ids every run would add the same chunks
# again under new ids, filling the search results with duplicates.
# NOTE(review): relies on the store upserting on an existing id — confirm for the
# installed Chroma version. Entries written by earlier runs (without these ids)
# stay in ./chroma_db until that directory is removed.
chunk_ids = [f"chunk-{idx}" for idx in range(len(docs))]

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

# NOTE(review): newer Chroma releases persist automatically and flag this call as
# deprecated; it is kept for compatibility with older installs — confirm and drop.
vectordb.persist()

query = "What is the JointMI acquisition function?"
results = vectordb.similarity_search_with_score(query, k=3)

# Print each of the k=3 hits together with the score returned by the store.
for doc, score in results:
    print(f"Score: {score:.3f}\n{doc.page_content}\n---\n")


#### Check how LLM works

In [None]:
from langchain_groq import ChatGroq

# Smoke test: send a fixed system + human message pair to the Groq chat model
# and display the reply object.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

system_message = (
    "system",
    "You are a helpful assistant that gives the answer to the question given the context",
)
human_message = ("human", "I love programming.")
messages = [system_message, human_message]

ai_msg = llm.invoke(messages)
ai_msg

#### Create chatbot

In [None]:
# Retriever over the vector store, configured to request k=3 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Chat model used by the chatbot (replaces the `llm` from the smoke test).
llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0.2, max_tokens=512)

# Custom prompt with {context} and {question} placeholders.
prompt_text = """
You are a research assistant. Use the context below to answer the question.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}
"""
prompt_template = PromptTemplate.from_template(prompt_text)

In [None]:
# Wire the chat model, the retriever and the custom prompt into one QA chain
# that also returns the documents it retrieved.
chain_options = {"prompt": prompt_template}
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # or use "map_reduce" if context is long
    retriever=retriever,
    chain_type_kwargs=chain_options,
    return_source_documents=True,
)

In [None]:
query = "What is the JointMI acquisition function?"
# Calling the chain object directly (qa_chain(query)) is deprecated; invoke() is
# the supported entry point. RetrievalQA takes the question under the "query" key.
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])
print("\n Sources:\n")
# Show the first 300 characters of each document the chain returned as a source.
for doc in result["source_documents"]:
    print(doc.page_content[:300], "\n---\n")