Duc Trung committed
Commit ee00031 · 0 Parent(s)

init backend

.gitignore ADDED
@@ -0,0 +1,22 @@
+ # Ignore environment variables
+ .env
+ .env.*
+
+ # Ignore Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Ignore virtual environments
+ .venv/
+ venv/
+
+ # Ignore IDE settings
+ .vscode/
+ .idea/
+ *.swp
+
+ # Ignore OS files
+ .DS_Store
+ Thumbs.db
__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,61 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ import os
+ from dotenv import load_dotenv
+ from utils.uploadFilePDFtoMD import convert_pdf_to_md
+ from utils.vectorDB import create_retriever, load_retriever
+ from utils.chunking import split_text_by_markdown
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from utils.llm import ask_question
+ from pydantic import BaseModel
+
+ class QueryRequest(BaseModel):
+     question: str
+
+ load_dotenv()
+ app = FastAPI()
+
+ # Allow the Streamlit front end to call this API. CORSMiddleware does not
+ # expand wildcards inside allow_origins, so use allow_origin_regex to match
+ # any *.streamlit.app subdomain.
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origin_regex=r"https://.*\.streamlit\.app",
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ @app.post("/uploadfile/")
+ async def upload_file(file: UploadFile = File(...)):
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+
+     # Save the uploaded file temporarily
+     temp_dir = "temp"
+     os.makedirs(temp_dir, exist_ok=True)
+     temp_path = os.path.join(temp_dir, file.filename)
+     with open(temp_path, "wb") as f:
+         f.write(await file.read())
+
+     try:
+         md = convert_pdf_to_md(temp_path)
+         chunks = split_text_by_markdown(md)
+         create_retriever(chunks, embeddings)
+         return {"message": "File processed and vector store created successfully."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+     finally:
+         # Clean up the temporary file even if processing fails
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+ @app.post("/query")
+ async def query(request: QueryRequest):
+     try:
+         retriever = load_retriever(embeddings)
+         retrieved_docs = retriever.invoke(request.question)
+         context = "\n\n".join([doc.page_content for doc in retrieved_docs])
+         answer = ask_question(request.question, context)
+         return {"question": request.question, "answer": answer}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
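
A minimal client sketch (not part of this commit) showing how the two endpoints above can be exercised; the base URL, port, and file name are assumptions for local testing:

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment; adjust host/port as needed

# Index a PDF: POST it to /uploadfile/ as multipart form data
with open("paper.pdf", "rb") as f:
    upload = requests.post(f"{BASE_URL}/uploadfile/",
                           files={"file": ("paper.pdf", f, "application/pdf")})
upload.raise_for_status()
print(upload.json())  # {"message": "File processed and vector store created successfully."}

# Ask a question against the indexed document
reply = requests.post(f"{BASE_URL}/query", json={"question": "What does Table 2 report?"})
reply.raise_for_status()
print(reply.json()["answer"])
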
requirements.txt ADDED
Binary file (13.6 kB)
 
utils/__init__.py ADDED
File without changes
utils/chunking.py ADDED
@@ -0,0 +1,13 @@
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
+ from langchain.schema import Document
+
+ def split_text_by_markdown(input_md: str) -> list:
+     headers_to_split_on = [
+         ("#", "Header 1"),
+         ("##", "Header 2"),
+         ("###", "Header 3"),
+     ]
+     splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+     chunks = splitter.split_text(input_md)
+     documents = [Document(page_content=chunk.page_content, metadata=chunk.metadata) for chunk in chunks]
+     return documents
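
A hypothetical call (not part of this commit) showing the shape of the output: each header section becomes one Document whose metadata records the header path.

from utils.chunking import split_text_by_markdown

sample_md = "# Introduction\nMotivation for the method.\n\n## Method\nDetails of the pipeline."
docs = split_text_by_markdown(sample_md)
for doc in docs:
    print(doc.metadata, "->", doc.page_content)
# Roughly:
# {'Header 1': 'Introduction'} -> Motivation for the method.
# {'Header 1': 'Introduction', 'Header 2': 'Method'} -> Details of the pipeline.
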
utils/llm.py ADDED
@@ -0,0 +1,94 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_groq import ChatGroq
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Load a model from HuggingFace (kept for reference; the Groq API below is used instead)
+ # def load_model(model_name="context-labs/meta-llama-Llama-3.2-3B-Instruct-FP16"):
+ #     # Load the model and tokenizer
+ #     # Configure quantization for memory efficiency
+ #     quantization_config = BitsAndBytesConfig(
+ #         load_in_4bit=True,
+ #         bnb_4bit_quant_type="nf4",
+ #         bnb_4bit_use_double_quant=True,
+ #         bnb_4bit_compute_dtype=torch.float16,
+ #     )
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name)
+ #     model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
+ #
+ #     print("✅ Model and tokenizer loaded")
+ #     return model, tokenizer
+
+ # -----------------------------
+ # Prompt Template
+ # -----------------------------
+
+ prompt = PromptTemplate(
+     input_variables=["context", "question"],
+     template="""
+ You are an experienced assistant specializing in question-answering tasks.
+ Utilize the provided context to respond to the question.
+
+ Rules:
+ - If the question refers to a **specific table**, note that tables may be identified by either:
+   • Roman numerals (I, II, III, IV, …)
+   • Arabic numerals (1, 2, 3, 4, …)
+   - Normalize references (e.g., "Table II" = "Table 2"). Always check both forms when matching.
+   - Only answer using information contained in that table.
+   - If the table is not found or the requested information is not in the table, respond with: "I don't know."
+
+ - If the question is about a **formula**:
+   • Extract the formula from the context (in LaTeX).
+   • Present it in a clean, readable way:
+     - Use a block math display for clarity: $$ ... $$
+     - Then rewrite it inline in plain text (e.g., f_final^t = β·f_adapter^t + (1 - β)·f_original^t).
+   • Briefly explain what each symbol means if the context provides that information.
+   • If the formula is not found, respond with: "I don't know."
+
+ - If the question is not about a table or a formula, answer using the context as normal.
+ - Never provide an answer you are unsure about.
+ - Keep answers concise, factual, and easy for non-experts to read.
+
+ CONTEXT:
+ {context}
+
+ QUESTION: {question}
+
+ DETAILED RESPONSE:
+ """,
+ )
+
+ # Call the Groq API
+ llm = ChatGroq(
+     api_key=os.environ.get("GROQ_API_KEY"),
+     model="meta-llama/llama-4-scout-17b-16e-instruct",
+     temperature=0.3,
+     max_tokens=1024
+ )
+
+ print("✅ Using Groq LLM")
+
+ # Ask a question given a retrieved context
+ def ask_question(question, context):
+     final_prompt = prompt.invoke({"context": context, "question": question})
+     answer = llm.invoke(final_prompt)
+     return answer.content
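
A hypothetical call (not part of this commit) illustrating how ask_question stitches a retrieved context into the prompt; the context string is a made-up toy example and GROQ_API_KEY must be set in the environment:

from utils.llm import ask_question

toy_context = "Table 2: the proposed adapter reaches 91.4% accuracy on the benchmark."  # toy text, not real data
print(ask_question("What accuracy does Table 2 report?", toy_context))
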
utils/uploadFilePDFtoMD.py ADDED
@@ -0,0 +1,148 @@
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from docling.datamodel.base_models import InputFormat
+ import time
+ import base64
+ import re
+ from groq import Groq
+ import os
+ from dotenv import load_dotenv
+ from pathlib import Path
+
+ load_dotenv()  # Load environment variables from a .env file if present
+
+ def convert_pdf_to_md(pdf_path: str) -> str:
+     """Convert a PDF to Markdown with image summaries. Returns the Markdown string. (Server-adapted from select_file)"""
+     if not os.path.exists(pdf_path):
+         raise ValueError(f"PDF not found: {pdf_path}")
+
+     # Enable image extraction in the pipeline options
+     pipeline_options = PdfPipelineOptions()
+     pipeline_options.do_formula_enrichment = True
+     pipeline_options.generate_picture_images = True  # Key: enable image extraction
+
+     converter = DocumentConverter(format_options={
+         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+     })
+
+     start_time = time.time()
+     result = converter.convert(pdf_path)
+     end_time = time.time()
+
+     # Export to Markdown (placeholders like <!-- image --> will be present)
+     md = result.document.export_to_markdown()
+
+     # Extract images into a list of dicts
+     images_list = []  # List to store dicts with image details
+
+     for item, _ in result.document.iterate_items():
+         if item.label == "picture":  # Targets figures/images
+             image_data = item.image
+             uri = str(image_data.uri)  # Data URI like 'data:image/png;base64,...'
+
+             # Decode the base64 data
+             match = re.match(r'data:image/(?P<type>.+);base64,(?P<data>.+)', uri)
+             if match:
+                 img_type = match.group('type')  # e.g., 'png' or 'jpeg'
+                 img_bytes = base64.b64decode(match.group('data'))
+
+                 # Store in the list
+                 images_list.append({
+                     'page': item.prov[0].page_no if item.prov else 'Unknown',
+                     'label': item.label,
+                     'type': img_type,
+                     'bytes': img_bytes,
+                     'uri': uri
+                 })
+
+     # Summarize the images using a VLM (Groq with a Llama model)
+     client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+
+     prompt_template = """
+ You are an expert research assistant in Artificial Intelligence.
+ Your task is to analyze and summarize a figure from a scientific paper.
+
+ The figure may describe an overall architecture, workflow, plot, chart, or experimental setup.
+ Provide a clear, detailed summary that helps a reader understand the design without seeing the image.
+
+ If the figure shows a model architecture, include:
+ - The main purpose of the figure (what problem it addresses).
+ - The overall structure (e.g., input/output, branches, modules, flows).
+ - The key components (e.g., encoders, decoders, adapters, loss functions).
+ - The interactions or data flow between components.
+ - Any special innovations or unique design choices.
+ If the figure is a chart, image, or plot, analyze it.
+
+ Format the summary inside **one section only**.
+ Do not create multiple headers like ## or ###.
+ Use bold or bullet points if needed.
+
+ Now summarize the following figure.
+ """
+
+     image_summaries = []
+
+     # Prepare a list of base64 strings and types from images_list (order matches the placeholders)
+     images = [(base64.b64encode(img['bytes']).decode('utf-8'), img['type']) for img in images_list]
+
+     for img_b64, img_type in images:
+         try:
+             # Use the correct MIME type based on the extracted image type
+             img_data_url = f"data:image/{img_type};base64,{img_b64}"
+
+             completion = client.chat.completions.create(
+                 model="meta-llama/llama-4-scout-17b-16e-instruct",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "text", "text": prompt_template},
+                             {"type": "image_url", "image_url": {"url": img_data_url}}
+                         ]
+                     }
+                 ],
+                 temperature=0.0,
+                 max_completion_tokens=512,
+                 top_p=1,
+                 stream=False,
+             )
+
+             summary = completion.choices[0].message.content
+             image_summaries.append(summary)
+
+         except Exception as e:
+             print(f"Error processing image: {e}")
+             image_summaries.append("Error summarizing image.")
+
+     # Replace placeholders in the Markdown with the summaries.
+     # Placeholders are "<!-- image -->" and appear in the same order as the extracted images.
+     placeholder = "<!-- image -->"
+     if len(image_summaries) > 0:
+         # Split the Markdown by the placeholder
+         md_parts = md.split(placeholder)
+         if len(md_parts) == len(image_summaries) + 1:
+             updated_md = md_parts[0]
+             for i in range(len(image_summaries)):
+                 # Insert the summary (formatted nicely in Markdown)
+                 updated_md += f"\n**Image Summary:**\n{image_summaries[i]}\n" + md_parts[i + 1]
+             md = updated_md
+         else:
+             print("Warning: Number of placeholders doesn't match number of summaries.")
+
+     # Save the paper to a Markdown file in the 'data' folder
+     # Extract the file name from the full file path
+     file_name = Path(pdf_path).stem + ".pdf"  # Use stem + .pdf to match the original basename logic
+     os.makedirs("data", exist_ok=True)  # Create the same 'data' folder the output path below uses
+
+     # Save the file in the 'data' folder with the extracted file name
+     output_path = f"data/{file_name}.md"
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write(md)
+     return md
+
+ if __name__ == "__main__":
+     # For local testing: replace with your own PDF path
+     pdf_path = r"E:\Study\AI\PE-CLIP.pdf"  # Update this!
+     md = convert_pdf_to_md(pdf_path)
+     print(md[:1000])  # Print the first 1000 characters of the Markdown
utils/vectorDB.py ADDED
@@ -0,0 +1,30 @@
+ from langchain_pinecone import PineconeVectorStore
+ from pinecone import Pinecone, ServerlessSpec
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Init the client (once)
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+ index_name = "rag-chatbot"  # Matches the name in the Pinecone dashboard
+
+ # Create the index if it does not exist (idempotent; only runs the first time)
+ if index_name not in pc.list_indexes().names():
+     pc.create_index(
+         name=index_name,
+         dimension=384,  # MiniLM embedding dimension
+         metric="cosine",
+         spec=ServerlessSpec(cloud="aws", region="us-east-1")
+     )
+
+ def create_retriever(chunks, embeddings):
+     vector_store = PineconeVectorStore.from_documents(
+         chunks, embeddings, index_name=index_name
+     )
+     return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+ def load_retriever(embeddings):
+     vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
+     return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
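
A rough end-to-end sketch (not part of this commit) of the index/reconnect cycle these two helpers implement; it assumes PINECONE_API_KEY is set and reuses the same MiniLM embeddings as main.py:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from utils.vectorDB import create_retriever, load_retriever

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
docs = [Document(page_content="Toy chunk about the adapter.", metadata={"Header 1": "Results"})]  # toy data

create_retriever(docs, embeddings)      # upserts the chunks into the "rag-chatbot" index
retriever = load_retriever(embeddings)  # later: reconnect to the existing index
print(retriever.invoke("What is the adapter?"))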