Duc Trung committed
Commit · ee00031
Parent(s):
init backend
- .gitignore +22 -0
- __init__.py +0 -0
- main.py +61 -0
- requirements.txt +0 -0
- utils/__init__.py +0 -0
- utils/chunking.py +13 -0
- utils/llm.py +94 -0
- utils/uploadFilePDFtoMD.py +148 -0
- utils/vectorDB.py +30 -0
.gitignore
ADDED
@@ -0,0 +1,22 @@
# Ignore environment variables
.env
.env.*

# Ignore Python cache
__pycache__/
*.pyc
*.pyo
*.pyd

# Ignore virtual environments
.venv/
venv/

# Ignore IDE settings
.vscode/
.idea/
*.swp

# Ignore OS files
.DS_Store
Thumbs.db
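The first rules above keep secrets out of the repo. The backend reads GROQ_API_KEY, PINECONE_API_KEY, and optionally PORT through python-dotenv, so a local, uncommitted .env along these lines is expected; the values below are placeholders, not real keys.

# .env — excluded from git by the rule above
GROQ_API_KEY=your-groq-api-key
PINECONE_API_KEY=your-pinecone-api-key
PORT=8000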
__init__.py
ADDED
File without changes
main.py
ADDED
@@ -0,0 +1,61 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
from utils.uploadFilePDFtoMD import convert_pdf_to_md
from utils.vectorDB import create_retriever, load_retriever
from utils.chunking import split_text_by_markdown
from langchain_community.embeddings import HuggingFaceEmbeddings
from utils.llm import ask_question
from pydantic import BaseModel

class QueryRequest(BaseModel):
    question: str

load_dotenv()
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    # CORSMiddleware does not expand wildcards inside allow_origins,
    # so match *.streamlit.app frontends with a regex instead.
    allow_origin_regex=r"https://.*\.streamlit\.app",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

@app.post("/uploadfile/")
async def upload_file(file: UploadFile = File(...)):
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    # Save uploaded file temporarily
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path = os.path.join(temp_dir, file.filename)
    with open(temp_path, "wb") as f:
        f.write(await file.read())

    try:
        md = convert_pdf_to_md(temp_path)
        chunks = split_text_by_markdown(md)
        create_retriever(chunks, embeddings)  # builds the vector store; a retriever is re-created per query
        return {"message": "File processed and vector store created successfully."}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Remove the temporary file even if conversion fails
        if os.path.exists(temp_path):
            os.remove(temp_path)

@app.post("/query")
async def query(request: QueryRequest):
    try:
        retriever = load_retriever(embeddings)
        retrieved_docs = retriever.invoke(request.question)  # access the query via request.question
        context = "\n\n".join([doc.page_content for doc in retrieved_docs])
        answer = ask_question(request.question, context)
        return {"question": request.question, "answer": answer}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
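For reference, a minimal client sketch for the two endpoints above, assuming the API runs locally on port 8000; the file name sample.pdf and the question are invented for illustration.

import requests

BASE = "http://localhost:8000"

# 1) Upload a PDF so it gets converted, chunked, and indexed
with open("sample.pdf", "rb") as f:
    resp = requests.post(f"{BASE}/uploadfile/", files={"file": ("sample.pdf", f, "application/pdf")})
print(resp.json())

# 2) Ask a question against the indexed document
resp = requests.post(f"{BASE}/query", json={"question": "What does Table 2 report?"})
print(resp.json())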
requirements.txt
ADDED
Binary file (13.6 kB)
utils/__init__.py
ADDED
File without changes
utils/chunking.py
ADDED
@@ -0,0 +1,13 @@
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document

def split_text_by_markdown(input_md: str) -> list:
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunks = splitter.split_text(input_md)
    documents = [Document(page_content=chunk.page_content, metadata=chunk.metadata) for chunk in chunks]
    return documents
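A quick usage sketch of split_text_by_markdown; the sample markdown and the output shown in the trailing comments are only illustrative.

from utils.chunking import split_text_by_markdown

sample_md = "# Intro\nSome overview text.\n## Method\nDetails of the approach."
docs = split_text_by_markdown(sample_md)
for d in docs:
    print(d.metadata, "->", d.page_content)
# e.g. {'Header 1': 'Intro'} -> Some overview text.
#      {'Header 1': 'Intro', 'Header 2': 'Method'} -> Details of the approach.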
utils/llm.py
ADDED
@@ -0,0 +1,94 @@
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# load model from HuggingFace (unused; the Groq API below is used instead)
# def load_model(model_name="context-labs/meta-llama-Llama-3.2-3B-Instruct-FP16"):
#     ## load model and tokenizer
#     # Configure quantization for memory efficiency
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_compute_dtype=torch.float16,
#     )
#
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
#
#     print("✅ Model and tokenizer loaded")
#     return model, tokenizer

# -----------------------------
# Prompt Template
# -----------------------------

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an experienced assistant specializing in question-answering tasks.
Utilize the provided context to respond to the question.

Rules:
- If the question refers to a **specific table**, note that tables may be identified by either:
  • Roman numerals (I, II, III, IV, …)
  • Arabic numerals (1, 2, 3, 4, …)
- Normalize references (e.g., "Table II" = "Table 2"). Always check both forms when matching.
- Only answer using information contained in that table.
- If the table is not found or the requested information is not in the table, respond with: "I don't know."

- If the question is about a **formula**:
  • Extract the formula from the context (in LaTeX).
  • Present it in a clean, readable way:
    - Use a block math display for clarity: $$ ... $$
    - Then rewrite it inline in plain text (e.g., f_final^t = β·f_adapter^t + (1 - β)·f_original^t).
  • Briefly explain what each symbol means if the context provides that information.
  • If the formula is not found, respond with: "I don't know."

- If the question is not about a table or a formula, answer using the context as normal.
- Never provide an answer you are unsure about.
- Keep answers concise, factual, and easy for non-experts to read.

CONTEXT:
{context}

QUESTION: {question}

DETAILED RESPONSE:
""",
)

# call the Groq API
llm = ChatGroq(
    api_key=os.environ.get("GROQ_API_KEY"),
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0.3,
    max_tokens=1024
)

print("✅ Using Groq LLM")

# Function to ask a question
def ask_question(question, context):
    final_prompt = prompt.invoke({"context": context, "question": question})
    answer = llm.invoke(final_prompt)
    return answer.content
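A usage sketch for ask_question. Note that the module builds the ChatGroq client at import time, so GROQ_API_KEY must be set before importing; the context string here is an invented stand-in for retrieved chunks.

from utils.llm import ask_question

context = "Table 2 reports an accuracy of 91.4% for the proposed method."
print(ask_question("What accuracy does Table 2 report?", context))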
utils/uploadFilePDFtoMD.py
ADDED
@@ -0,0 +1,148 @@
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
import time
import base64
import re
from groq import Groq
import os
from dotenv import load_dotenv
from pathlib import Path

load_dotenv()  # Load environment variables from .env file if present

def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert PDF to MD with image summaries. Returns MD string. (Server-adapted from select_file)"""
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    # Enable image extraction in pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    pipeline_options.generate_picture_images = True  # Key: enable image extraction

    converter = DocumentConverter(format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    })

    start_time = time.time()
    result = converter.convert(pdf_path)
    end_time = time.time()

    # Export to Markdown (placeholders like <!-- image --> will be present)
    md = result.document.export_to_markdown()

    # Extract images into a list of dicts
    images_list = []  # List to store dicts with image details

    for item, _ in result.document.iterate_items():
        if item.label == "picture":  # Targets figures/images
            image_data = item.image
            uri = str(image_data.uri)  # Data URI like 'data:image/png;base64,...'

            # Decode the base64 data
            match = re.match(r'data:image/(?P<type>.+);base64,(?P<data>.+)', uri)
            if match:
                img_type = match.group('type')  # e.g., 'png' or 'jpeg'
                img_bytes = base64.b64decode(match.group('data'))

                # Store in list
                images_list.append({
                    'page': item.prov[0].page_no if item.prov else 'Unknown',
                    'label': item.label,
                    'type': img_type,
                    'bytes': img_bytes,
                    'uri': uri
                })

    # Now, summarize images using a VLM (Groq with a Llama model)
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    prompt_template = """
You are an expert research assistant in Artificial Intelligence.
Your task is to analyze and summarize a figure from a scientific paper.

The figure may describe an overall architecture, workflow, plot, chart, or experimental setup.
Provide a clear, detailed summary that helps a reader understand the design without seeing the image.

When summarizing, if the figure is a model architecture, include:
- The main purpose of the figure (what problem it addresses).
- The overall structure (e.g., input/output, branches, modules, flows).
- The key components (e.g., encoders, decoders, adapters, loss functions).
- The interactions or data flow between components.
- Any special innovations or unique design choices.
If the figure is a chart, image, or plot, analyze it.

Format the summary inside **one section only**.
Do not create multiple headers like ## or ###.
Use bold or bullet points if needed.

Now summarize the following figure:
{image_caption_or_context}
"""

    image_summaries = []

    # Prepare list of base64 strings and types from images_list (assuming order matches placeholders)
    images = [(base64.b64encode(img['bytes']).decode('utf-8'), img['type']) for img in images_list]

    for img_b64, img_type in images:
        try:
            # Use the correct MIME type based on the extracted image type
            img_data_url = f"data:image/{img_type};base64,{img_b64}"

            completion = client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt_template},
                            {"type": "image_url", "image_url": {"url": img_data_url}}
                        ]
                    }
                ],
                temperature=0.0,
                max_completion_tokens=512,
                top_p=1,
                stream=False,
            )

            summary = completion.choices[0].message.content
            image_summaries.append(summary)

        except Exception as e:
            print(f"Error processing image: {e}")
            image_summaries.append("Error summarizing image.")

    # Replace placeholders in the Markdown with summaries
    # Assuming placeholders are "<!-- image -->" and appear in the same order as extracted images
    placeholder = "<!-- image -->"
    if len(image_summaries) > 0:
        # Split the Markdown by placeholder
        md_parts = md.split(placeholder)
        if len(md_parts) == len(image_summaries) + 1:
            updated_md = md_parts[0]
            for i in range(len(image_summaries)):
                # Insert summary (formatted nicely in Markdown)
                updated_md += f"\n**Image Summary:**\n{image_summaries[i]}\n" + md_parts[i + 1]
            md = updated_md
        else:
            print("Warning: Number of placeholders doesn't match number of summaries.")

    # Save the paper to a Markdown file
    # Extract the file name from the full file path
    file_name = Path(pdf_path).stem + ".pdf"  # Use stem + .pdf to match original basename logic
    os.makedirs("data", exist_ok=True)  # ensure the output folder used below exists

    # Save the file in the 'data' folder with the extracted file name
    output_path = f"data/{file_name}.md"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(md)
    return md

if __name__ == "__main__":
    # For local testing: Replace with your good PDF path
    pdf_path = r"E:\Study\AI\PE-CLIP.pdf"  # Update this!
    md = convert_pdf_to_md(pdf_path)
    print(md[:1000])  # Print first 1000 characters of the Markdown
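The placeholder-replacement step above matches summaries to figures purely by order, which is why the count check matters. A toy sketch of that splice, with invented markdown and summaries:

md = "Intro text\n<!-- image -->\nMore text\n<!-- image -->\nEnd"
summaries = ["Summary of figure 1.", "Summary of figure 2."]
parts = md.split("<!-- image -->")
assert len(parts) == len(summaries) + 1  # one more text part than placeholders
out = parts[0]
for i, s in enumerate(summaries):
    out += f"\n**Image Summary:**\n{s}\n" + parts[i + 1]
print(out)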
utils/vectorDB.py
ADDED
@@ -0,0 +1,30 @@
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import HuggingFaceEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

# Init client (once)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "rag-chatbot"  # Matches your dashboard name

# Create if not exists (idempotent; only runs first time)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # MiniLM dims
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

def create_retriever(chunks, embeddings):
    vector_store = PineconeVectorStore.from_documents(
        chunks, embeddings, index_name=index_name
    )
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

def load_retriever(embeddings):
    vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
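A usage sketch for the two helpers, assuming PINECONE_API_KEY is set (importing the module creates the rag-chatbot index if it does not already exist) and that the embedding model matches the 384-dimension index; the sample document and question are invented.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from utils.vectorDB import create_retriever, load_retriever

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
docs = [Document(page_content="Adapters add 0.5M parameters.", metadata={"Header 1": "Method"})]

retriever = create_retriever(docs, embeddings)  # index the documents
retriever = load_retriever(embeddings)          # or reconnect to the existing index later
print(retriever.invoke("How many parameters do the adapters add?"))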