Duc Trung committed
Commit ee00031 · 0 Parent(s)

init backend

.gitignore ADDED
@@ -0,0 +1,22 @@
+ # Ignore environment variables
+ .env
+ .env.*
+
+ # Ignore Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Ignore virtual environments
+ .venv/
+ venv/
+
+ # Ignore IDE settings
+ .vscode/
+ .idea/
+ *.swp
+
+ # Ignore OS files
+ .DS_Store
+ Thumbs.db
__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,61 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ import os
+ from dotenv import load_dotenv
+ from utils.uploadFilePDFtoMD import convert_pdf_to_md
+ from utils.vectorDB import create_retriever, load_retriever
+ from utils.chunking import split_text_by_markdown
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from utils.llm import ask_question
+ from pydantic import BaseModel
+
+ class QueryRequest(BaseModel):
+     question: str
+
+ load_dotenv()
+ app = FastAPI()
+
+ # Allow the Streamlit front end to call this API. CORSMiddleware does not
+ # expand wildcards inside allow_origins, so use allow_origin_regex to match
+ # any *.streamlit.app subdomain.
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origin_regex=r"https://.*\.streamlit\.app",
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ @app.post("/uploadfile/")
+ async def upload_file(file: UploadFile = File(...)):
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+
+     # Save the uploaded file temporarily
+     temp_dir = "temp"
+     os.makedirs(temp_dir, exist_ok=True)
+     temp_path = os.path.join(temp_dir, file.filename)
+     with open(temp_path, "wb") as f:
+         f.write(await file.read())
+
+     try:
+         md = convert_pdf_to_md(temp_path)
+         chunks = split_text_by_markdown(md)
+         create_retriever(chunks, embeddings)
+         return {"message": "File processed and vector store created successfully."}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+     finally:
+         # Clean up the temporary file even if processing fails
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+ @app.post("/query")
+ async def query(request: QueryRequest):
+     try:
+         retriever = load_retriever(embeddings)
+         retrieved_docs = retriever.invoke(request.question)
+         context = "\n\n".join([doc.page_content for doc in retrieved_docs])
+         answer = ask_question(request.question, context)
+         return {"question": request.question, "answer": answer}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
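
A minimal client sketch (not part of this commit) showing how the two endpoints above can be exercised; the base URL, port, and file name are assumptions for local testing:

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment; adjust host/port as needed

# Index a PDF: POST it to /uploadfile/ as multipart form data
with open("paper.pdf", "rb") as f:
    upload = requests.post(f"{BASE_URL}/uploadfile/",
                           files={"file": ("paper.pdf", f, "application/pdf")})
upload.raise_for_status()
print(upload.json())  # {"message": "File processed and vector store created successfully."}

# Ask a question against the indexed document
reply = requests.post(f"{BASE_URL}/query", json={"question": "What does Table 2 report?"})
reply.raise_for_status()
print(reply.json()["answer"])
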
requirements.txt ADDED
Binary file (13.6 kB)
 
utils/__init__.py ADDED
File without changes
utils/chunking.py ADDED
@@ -0,0 +1,13 @@
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
+ from langchain.schema import Document
+
+ def split_text_by_markdown(input_md: str) -> list:
+     headers_to_split_on = [
+         ("#", "Header 1"),
+         ("##", "Header 2"),
+         ("###", "Header 3"),
+     ]
+     splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+     chunks = splitter.split_text(input_md)
+     documents = [Document(page_content=chunk.page_content, metadata=chunk.metadata) for chunk in chunks]
+     return documents
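
A hypothetical call (not part of this commit) showing the shape of the output: each header section becomes one Document whose metadata records the header path.

from utils.chunking import split_text_by_markdown

sample_md = "# Introduction\nMotivation for the method.\n\n## Method\nDetails of the pipeline."
docs = split_text_by_markdown(sample_md)
for doc in docs:
    print(doc.metadata, "->", doc.page_content)
# Roughly:
# {'Header 1': 'Introduction'} -> Motivation for the method.
# {'Header 1': 'Introduction', 'Header 2': 'Method'} -> Details of the pipeline.
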
utils/llm.py ADDED
@@ -0,0 +1,94 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_groq import ChatGroq
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Load a model from HuggingFace (kept for reference; the Groq API below is used instead)
+ # def load_model(model_name="context-labs/meta-llama-Llama-3.2-3B-Instruct-FP16"):
+ #     # Load the model and tokenizer
+ #     # Configure quantization for memory efficiency
+ #     quantization_config = BitsAndBytesConfig(
+ #         load_in_4bit=True,
+ #         bnb_4bit_quant_type="nf4",
+ #         bnb_4bit_use_double_quant=True,
+ #         bnb_4bit_compute_dtype=torch.float16,
+ #     )
+ #
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name)
+ #     model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
+ #
+ #     print("✅ Model and tokenizer loaded")
+ #     return model, tokenizer
+
+ # -----------------------------
+ # Prompt Template
+ # -----------------------------
+
+ prompt = PromptTemplate(
+     input_variables=["context", "question"],
+     template="""
+ You are an experienced assistant specializing in question-answering tasks.
+ Utilize the provided context to respond to the question.
+
+ Rules:
+ - If the question refers to a **specific table**, note that tables may be identified by either:
+   • Roman numerals (I, II, III, IV, …)
+   • Arabic numerals (1, 2, 3, 4, …)
+   - Normalize references (e.g., "Table II" = "Table 2"). Always check both forms when matching.
+   - Only answer using information contained in that table.
+   - If the table is not found or the requested information is not in the table, respond with: "I don't know."
+
+ - If the question is about a **formula**:
+   • Extract the formula from the context (in LaTeX).
+   • Present it in a clean, readable way:
+     - Use a block math display for clarity: $$ ... $$
+     - Then rewrite it inline in plain text (e.g., f_final^t = β·f_adapter^t + (1 - β)·f_original^t).
+   • Briefly explain what each symbol means if the context provides that information.
+   • If the formula is not found, respond with: "I don't know."
+
+ - If the question is not about a table or a formula, answer using the context as normal.
+ - Never provide an answer you are unsure about.
+ - Keep answers concise, factual, and easy for non-experts to read.
+
+ CONTEXT:
+ {context}
+
+ QUESTION: {question}
+
+ DETAILED RESPONSE:
+ """,
+ )
+
+ # Call the Groq API
+ llm = ChatGroq(
+     api_key=os.environ.get("GROQ_API_KEY"),
+     model="meta-llama/llama-4-scout-17b-16e-instruct",
+     temperature=0.3,
+     max_tokens=1024
+ )
+
+ print("✅ Using Groq LLM")
+
+ # Ask a question given a retrieved context
+ def ask_question(question, context):
+     final_prompt = prompt.invoke({"context": context, "question": question})
+     answer = llm.invoke(final_prompt)
+     return answer.content
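
A hypothetical call (not part of this commit) illustrating how ask_question stitches a retrieved context into the prompt; the context string is a made-up toy example and GROQ_API_KEY must be set in the environment:

from utils.llm import ask_question

toy_context = "Table 2: the proposed adapter reaches 91.4% accuracy on the benchmark."  # toy text, not real data
print(ask_question("What accuracy does Table 2 report?", toy_context))
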
utils/uploadFilePDFtoMD.py ADDED
@@ -0,0 +1,148 @@
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from docling.datamodel.base_models import InputFormat
+ import time
+ import base64
+ import re
+ from groq import Groq
+ import os
+ from dotenv import load_dotenv
+ from pathlib import Path
+
+ load_dotenv()  # Load environment variables from a .env file if present
+
+ def convert_pdf_to_md(pdf_path: str) -> str:
+     """Convert a PDF to Markdown with image summaries. Returns the Markdown string. (Server-adapted from select_file)"""
+     if not os.path.exists(pdf_path):
+         raise ValueError(f"PDF not found: {pdf_path}")
+
+     # Enable image extraction in the pipeline options
+     pipeline_options = PdfPipelineOptions()
+     pipeline_options.do_formula_enrichment = True
+     pipeline_options.generate_picture_images = True  # Key: enable image extraction
+
+     converter = DocumentConverter(format_options={
+         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+     })
+
+     start_time = time.time()
+     result = converter.convert(pdf_path)
+     end_time = time.time()
+
+     # Export to Markdown (placeholders like <!-- image --> will be present)
+     md = result.document.export_to_markdown()
+
+     # Extract images into a list of dicts
+     images_list = []  # List to store dicts with image details
+
+     for item, _ in result.document.iterate_items():
+         if item.label == "picture":  # Targets figures/images
+             image_data = item.image
+             uri = str(image_data.uri)  # Data URI like 'data:image/png;base64,...'
+
+             # Decode the base64 data
+             match = re.match(r'data:image/(?P<type>.+);base64,(?P<data>.+)', uri)
+             if match:
+                 img_type = match.group('type')  # e.g., 'png' or 'jpeg'
+                 img_bytes = base64.b64decode(match.group('data'))
+
+                 # Store in the list
+                 images_list.append({
+                     'page': item.prov[0].page_no if item.prov else 'Unknown',
+                     'label': item.label,
+                     'type': img_type,
+                     'bytes': img_bytes,
+                     'uri': uri
+                 })
+
+     # Summarize the images using a VLM (Groq with a Llama model)
+     client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+
+     prompt_template = """
+ You are an expert research assistant in Artificial Intelligence.
+ Your task is to analyze and summarize a figure from a scientific paper.
+
+ The figure may describe an overall architecture, workflow, plot, chart, or experimental setup.
+ Provide a clear, detailed summary that helps a reader understand the design without seeing the image.
+
+ If the figure shows a model architecture, include:
+ - The main purpose of the figure (what problem it addresses).
+ - The overall structure (e.g., input/output, branches, modules, flows).
+ - The key components (e.g., encoders, decoders, adapters, loss functions).
+ - The interactions or data flow between components.
+ - Any special innovations or unique design choices.
+ If the figure is a chart, image, or plot, analyze it.
+
+ Format the summary inside **one section only**.
+ Do not create multiple headers like ## or ###.
+ Use bold or bullet points if needed.
+
+ Now summarize the following figure.
+ """
+
+     image_summaries = []
+
+     # Prepare a list of base64 strings and types from images_list (order matches the placeholders)
+     images = [(base64.b64encode(img['bytes']).decode('utf-8'), img['type']) for img in images_list]
+
+     for img_b64, img_type in images:
+         try:
+             # Use the correct MIME type based on the extracted image type
+             img_data_url = f"data:image/{img_type};base64,{img_b64}"
+
+             completion = client.chat.completions.create(
+                 model="meta-llama/llama-4-scout-17b-16e-instruct",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "text", "text": prompt_template},
+                             {"type": "image_url", "image_url": {"url": img_data_url}}
+                         ]
+                     }
+                 ],
+                 temperature=0.0,
+                 max_completion_tokens=512,
+                 top_p=1,
+                 stream=False,
+             )
+
+             summary = completion.choices[0].message.content
+             image_summaries.append(summary)
+
+         except Exception as e:
+             print(f"Error processing image: {e}")
+             image_summaries.append("Error summarizing image.")
+
+     # Replace placeholders in the Markdown with the summaries.
+     # Placeholders are "<!-- image -->" and appear in the same order as the extracted images.
+     placeholder = "<!-- image -->"
+     if len(image_summaries) > 0:
+         # Split the Markdown by the placeholder
+         md_parts = md.split(placeholder)
+         if len(md_parts) == len(image_summaries) + 1:
+             updated_md = md_parts[0]
+             for i in range(len(image_summaries)):
+                 # Insert the summary (formatted nicely in Markdown)
+                 updated_md += f"\n**Image Summary:**\n{image_summaries[i]}\n" + md_parts[i + 1]
+             md = updated_md
+         else:
+             print("Warning: Number of placeholders doesn't match number of summaries.")
+
+     # Save the paper to a Markdown file in the 'data' folder
+     # Extract the file name from the full file path
+     file_name = Path(pdf_path).stem + ".pdf"  # Use stem + .pdf to match the original basename logic
+     os.makedirs("data", exist_ok=True)  # Create the same 'data' folder the output path below uses
+
+     # Save the file in the 'data' folder with the extracted file name
+     output_path = f"data/{file_name}.md"
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write(md)
+     return md
+
+ if __name__ == "__main__":
+     # For local testing: replace with your own PDF path
+     pdf_path = r"E:\Study\AI\PE-CLIP.pdf"  # Update this!
+     md = convert_pdf_to_md(pdf_path)
+     print(md[:1000])  # Print the first 1000 characters of the Markdown
utils/vectorDB.py ADDED
@@ -0,0 +1,30 @@
+ from langchain_pinecone import PineconeVectorStore
+ from pinecone import Pinecone, ServerlessSpec
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Init the client (once)
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+ index_name = "rag-chatbot"  # Matches the name in the Pinecone dashboard
+
+ # Create the index if it does not exist (idempotent; only runs the first time)
+ if index_name not in pc.list_indexes().names():
+     pc.create_index(
+         name=index_name,
+         dimension=384,  # MiniLM embedding dimension
+         metric="cosine",
+         spec=ServerlessSpec(cloud="aws", region="us-east-1")
+     )
+
+ def create_retriever(chunks, embeddings):
+     vector_store = PineconeVectorStore.from_documents(
+         chunks, embeddings, index_name=index_name
+     )
+     return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
+ def load_retriever(embeddings):
+     vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
+     return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
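
A rough end-to-end sketch (not part of this commit) of the index/reconnect cycle these two helpers implement; it assumes PINECONE_API_KEY is set and reuses the same MiniLM embeddings as main.py:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from utils.vectorDB import create_retriever, load_retriever

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
docs = [Document(page_content="Toy chunk about the adapter.", metadata={"Header 1": "Results"})]  # toy data

create_retriever(docs, embeddings)      # upserts the chunks into the "rag-chatbot" index
retriever = load_retriever(embeddings)  # later: reconnect to the existing index
print(retriever.invoke("What is the adapter?"))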