Spaces:

zamal
/

Multimodal-Chat-Playground

Running on Zero

App Files Files Community

zamal committed on May 30

Commit

ca125f5

verified ·

1 Parent(s): af1ccb2

Update app.py

Browse files

Files changed (1) hide show

app.py +247 -96

app.py CHANGED Viewed

@@ -5,143 +5,294 @@ import gc
 from huggingface_hub.utils import HfHubHTTPError
 from langchain_core.prompts import PromptTemplate
 from langchain_huggingface import HuggingFaceEndpoint
-from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
-from doctr.io import DocumentFile
-from doctr.models import ocr_predictor
-from pypdf import PdfReader
 from PIL import Image
-from utils import extract_images, image_to_bytes, clean_text
 from welcome_text import WELCOME_INTRO
 import chromadb
 from chromadb.utils import embedding_functions
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-import gradio as gr
 # ─────────────────────────────────────────────────────────────────────────────
-# Globals
-CURRENT_VDB = None
 processor = None
 vision_model = None
-# OCR & V+L defaults
-OCR_CHOICES = {
-    "db_resnet50 + crnn_mobilenet_v3_large": ("db_resnet50", "crnn_mobilenet_v3_large"),
-    "db_resnet50 + crnn_resnet31": ("db_resnet50", "crnn_resnet31"),
-}
-SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
-    model_name="all-MiniLM-L6-v2"
 )
-def get_image_description(img: Image.Image) -> str:
     global processor, vision_model
     if processor is None or vision_model is None:
-        # use the same default V+L model everywhere
-        vlm = "llava-hf/llava-v1.6-mistral-7b-hf"
-        processor = LlavaNextProcessor.from_pretrained(vlm)
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
-            vlm, torch_dtype=torch.float16, low_cpu_mem_usage=True
         ).to("cuda")
-    torch.cuda.empty_cache(); gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
-    inputs = processor(prompt, img, return_tensors="pt").to("cuda")
-    out = vision_model.generate(**inputs, max_new_tokens=100)
-    return processor.decode(out[0], skip_special_tokens=True)
 def extract_data_from_pdfs(
-    docs, session, include_images, do_ocr, ocr_choice, vlm_choice, progress=gr.Progress()
 ):
     if not docs:
         raise gr.Error("No documents to process")
-    # 1) Optional OCR
-    local_ocr = None
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
-    # 2) Prepare V+L
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
-    vis = LlavaNextForConditionalGeneration.from_pretrained(
-        vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True
-    ).to("cuda")
-    # 3) Patch get_image_description to use this choice
-    def describe(img: Image.Image) -> str:
         torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inp = proc(prompt, img, return_tensors="pt").to("cuda")
         out = vis.generate(**inp, max_new_tokens=100)
         return proc.decode(out[0], skip_special_tokens=True)
-    global get_image_description, CURRENT_VDB
     get_image_description = describe
-    # 4) Pull text + images
     progress(0.2, "Extracting text and images…")
-    full_text, images, names = "", [], []
-    for p in docs:
         if local_ocr:
-            pdf = DocumentFile.from_pdf(p)
             res = local_ocr(pdf)
-            full_text += " ".join(w.value for blk in res.pages for line in blk.lines for w in line.words) + "\n\n"
         else:
-            full_text += (PdfReader(p).pages[0].extract_text() or "") + "\n\n"
         if include_images == "Include Images":
-            imgs = extract_images([p])
             images.extend(imgs)
-            names.extend([os.path.basename(p)] * len(imgs))
-    # 5) Build in-memory Chroma
     progress(0.6, "Indexing in vector DB…")
-    client = chromadb.EphemeralClient()
-    for col in ("text_db", "image_db"):
-        if col in [c.name for c in client.list_collections()]:
-            client.delete_collection(col)
-    text_col = client.get_or_create_collection("text_db", embedding_function=SHARED_EMB_FN)
-    img_col = client.get_or_create_collection("image_db", embedding_function=SHARED_EMB_FN,
-                                              metadata={"hnsw:space":"cosine"})
-    if images:
-        descs, metas = [], []
-        for i, im in enumerate(images):
-            cap = get_image_description(im)
-            descs.append(f"{names[i]}: {cap}")
-            metas.append({"image": image_to_bytes(im)})
-        img_col.add(ids=[str(i) for i in range(len(images))],
-                    documents=descs, metadatas=metas)
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    docs_ = splitter.create_documents([full_text])
-    text_col.add(ids=[str(i) for i in range(len(docs_))],
-                 documents=[d.page_content for d in docs_])
-    CURRENT_VDB = client
-    session["processed"] = True
-    sample = images[:4] if include_images=="Include Images" else []
-    return session, full_text[:2000]+"...", sample, "<h3>Done!</h3>"
-def conversation(session, question, num_ctx, img_ctx, history, temp, max_tok, model_id):
-    global CURRENT_VDB
-    if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")
-    # a) text retrieval
-    docs = CURRENT_VDB.get_collection("text_db")\
-         .query(query_texts=[question], n_results=int(num_ctx), include=["documents"])["documents"][0]
-    # b) image retrieval
-    img_q = CURRENT_VDB.get_collection("image_db")\
-           .query(query_texts=[question], n_results=int(img_ctx),
-                  include=["metadatas","documents"])
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
-    for m in img_q["metadatas"][0]:
-        b = m.get("image","")
-        try: images.append(Image.open(io.BytesIO(base64.b64decode(b))))
-        except: pass
     img_desc = "\n".join(img_descs)
-    # c) prompt & LLM
     prompt = PromptTemplate(
         template="""
 Context:
@@ -154,23 +305,23 @@ Question:
 {q}
 Answer:
-""", input_variables=["text","img_desc","q"])
     inp = prompt.format(text="\n\n".join(docs), img_desc=img_desc, q=question)
-    llm = HuggingFaceEndpoint(
-        repo_id=model_id, task="text-generation",
-        temperature=temp, max_new_tokens=max_tok,
-        huggingfacehub_api_token=HF_TOKEN
-    )
-    try:    ans = llm.invoke(inp)
     except HfHubHTTPError as e:
-        ans = f"❌ Model `{model_id}` not hosted." if e.response.status_code==404 else f"⚠️ HF API error: {e}"
     except Exception as e:
-        ans = f"⚠️ Unexpected error: {e}"
-    new_hist = history + [{"role":"user","content":question},
-                          {"role":"assistant","content":ans}]
-    return new_hist, docs, images
@@ -258,4 +409,4 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 from huggingface_hub.utils import HfHubHTTPError
 from langchain_core.prompts import PromptTemplate
 from langchain_huggingface import HuggingFaceEndpoint
+import io, base64
+from PIL import Image
+import torch
+import gradio as gr
+import spaces
+import numpy as np
+import pandas as pd
+import pymupdf
 from PIL import Image
+from pypdf import PdfReader
+from dotenv import load_dotenv
+import shutil
+from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
 from welcome_text import WELCOME_INTRO
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 import chromadb
 from chromadb.utils import embedding_functions
+from chromadb.utils.data_loaders import ImageLoader
+from langchain_core.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEndpoint
+from utils import extract_pdfs, extract_images, clean_text, image_to_bytes
+from utils import *
 # ─────────────────────────────────────────────────────────────────────────────
+# Load .env
+load_dotenv()
+HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 processor = None
 vision_model = None
+# OCR + multimodal image description setup
+ocr_model = ocr_predictor(
+    "db_resnet50", "crnn_mobilenet_v3_large", pretrained=True, assume_straight_pages=True
 )
+processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+vision_model = LlavaNextForConditionalGeneration.from_pretrained(
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True
+).to("cuda")
+# Add at the top of your module, alongside your other globals
+PERSIST_DIR = "./chroma_db"
+if os.path.exists(PERSIST_DIR):
+    shutil.rmtree(PERSIST_DIR)
+@spaces.GPU()
+def get_image_description(image: Image.Image) -> str:
+    """
+    Lazy-loads the Llava processor + model inside the GPU worker,
+    runs captioning, and returns a one-sentence description.
+    """
     global processor, vision_model
+    # On first call, instantiate + move to CUDA
     if processor is None or vision_model is None:
+        processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-v1.6-mistral-7b-hf",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True
         ).to("cuda")
+    torch.cuda.empty_cache()
+    gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
+    inputs = processor(prompt, image, return_tensors="pt").to("cuda")
+    output = vision_model.generate(**inputs, max_new_tokens=100)
+    return processor.decode(output[0], skip_special_tokens=True)
+# Vector DB setup
+# at top of file, alongside your other imports
+from chromadb.utils import embedding_functions
+from chromadb.utils.data_loaders import ImageLoader
+import chromadb
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from utils import image_to_bytes  # your helper
+# 1) Create one shared embedding function (defaulting to All-MiniLM-L6-v2, 384-dim)
+SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name="all-MiniLM-L6-v2"
+)
+def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):
+    """
+    Build a *persistent* ChromaDB instance on disk, with two collections:
+      • text_db  (chunks of the PDF text)
+      • image_db (image descriptions + raw image bytes)
+    """
+    # 1) Make or clean the on-disk folder
+    shutil.rmtree(PERSIST_DIR, ignore_errors=True)
+    os.makedirs(PERSIST_DIR, exist_ok=True)
+    client = chromadb.PersistentClient(
+    path=PERSIST_DIR,
+    settings=Settings(),
+    tenant=DEFAULT_TENANT,
+    database=DEFAULT_DATABASE
+    )
+    # 3) Create / wipe collections
+    for col in ("text_db", "image_db"):
+        if col in [c.name for c in client.list_collections()]:
+            client.delete_collection(col)
+    text_col = client.get_or_create_collection(
+        name="text_db",
+        embedding_function=SHARED_EMB_FN
+    )
+    img_col = client.get_or_create_collection(
+        name="image_db",
+        embedding_function=SHARED_EMB_FN,
+        metadata={"hnsw:space": "cosine"}
+    )
+    # 4) Add images
+    if images:
+        descs, metas = [], []
+        for idx, img in enumerate(images):
+            try:
+                cap = get_image_description(img)
+            except:
+                cap = "⚠️ could not describe image"
+            descs.append(f"{img_names[idx]}: {cap}")
+            metas.append({"image": image_to_bytes(img)})
+        img_col.add(ids=[str(i) for i in range(len(images))],
+                    documents=descs,
+                    metadatas=metas)
+    # 5) Chunk & add text
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    docs = splitter.create_documents([text])
+    text_col.add(ids=[str(i) for i in range(len(docs))],
+                 documents=[d.page_content for d in docs])
+    return client
+# Text extraction
+def result_to_text(result, as_text=False):
+    pages = []
+    for pg in result.pages:
+        txt = " ".join(w.value for block in pg.blocks for line in block.lines for w in line.words)
+        pages.append(clean_text(txt))
+    return "\n\n".join(pages) if as_text else pages
+OCR_CHOICES = {
+    "db_resnet50 + crnn_mobilenet_v3_large": ("db_resnet50", "crnn_mobilenet_v3_large"),
+    "db_resnet50 + crnn_resnet31":          ("db_resnet50", "crnn_resnet31"),
+}
+@spaces.GPU()
 def extract_data_from_pdfs(
+    docs: list[str],
+    session: dict,
+    include_images: str,
+    do_ocr: str,
+    ocr_choice: str,
+    vlm_choice: str,
+    progress=gr.Progress()
 ):
     if not docs:
         raise gr.Error("No documents to process")
+    # 1) OCR pipeline if requested
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
+    else:
+        local_ocr = None
+    # 2) Vision–language model
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
+    vis = (LlavaNextForConditionalGeneration
+           .from_pretrained(vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+           .to("cuda"))
+    # 3) Monkey-patch caption fn
+    def describe(img):
         torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inp = proc(prompt, img, return_tensors="pt").to("cuda")
         out = vis.generate(**inp, max_new_tokens=100)
         return proc.decode(out[0], skip_special_tokens=True)
+    global get_image_description
     get_image_description = describe
+    # 4) Extract text & images
     progress(0.2, "Extracting text and images…")
+    all_text = ""
+    images, names = [], []
+    for path in docs:
         if local_ocr:
+            pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
+            all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
+            all_text += (PdfReader(path).pages[0].extract_text() or "") + "\n\n"
         if include_images == "Include Images":
+            imgs = extract_images([path])
             images.extend(imgs)
+            names.extend([os.path.basename(path)] * len(imgs))
+    # 5) Build + persist the vectordb
     progress(0.6, "Indexing in vector DB…")
+    client = get_vectordb(all_text, images, names)
+    # 6) Mark session and return UI outputs
+    session["processed"] = True
+    session["persist_directory"] = PERSIST_DIR
+    sample_imgs = images[:4] if include_images == "Include Images" else []
+    return (
+        session,               # gr.State
+        all_text[:2000] + "...",
+        sample_imgs,
+        "<h3>Done!</h3>"
+    )
+# Chat function
+def conversation(
+    session: dict,
+    question: str,
+    num_ctx: int,
+    img_ctx: int,
+    history: list,
+    temp: float,
+    max_tok: int,
+    model_id: str
+):
+    pd = session.get("persist_directory")
+    if not session.get("processed") or not pd:
         raise gr.Error("Please extract data first")
+    # 1) Reopen the same persistent client (new API)
+    client = chromadb.PersistentClient(
+        path=pd,
+        settings=Settings(),
+        tenant=DEFAULT_TENANT,
+        database=DEFAULT_DATABASE
+    )
+    # 2) Text retrieval
+    text_col = client.get_collection("text_db")
+    docs = text_col.query(query_texts=[question],
+                          n_results=int(num_ctx),
+                          include=["documents"])["documents"][0]
+    # 3) Image retrieval
+    img_col = client.get_collection("image_db")
+    img_q = img_col.query(query_texts=[question],
+                          n_results=int(img_ctx),
+                          include=["metadatas","documents"])
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
+    for meta in img_q["metadatas"][0]:
+        b64 = meta.get("image","")
+        try:
+            images.append(Image.open(io.BytesIO(base64.b64decode(b64))))
+        except:
+            pass
     img_desc = "\n".join(img_descs)
+    # 4) Build prompt & call LLM
+    llm = HuggingFaceEndpoint(
+        repo_id=model_id,
+        task="text-generation",
+        temperature=temp,
+        max_new_tokens=max_tok,
+        huggingfacehub_api_token=HF_TOKEN
+    )
     prompt = PromptTemplate(
         template="""
 Context:
 {q}
 Answer:
+""", input_variables=["text","img_desc","q"]
+    )
     inp = prompt.format(text="\n\n".join(docs), img_desc=img_desc, q=question)
+    try:
+        answer = llm.invoke(inp)
     except HfHubHTTPError as e:
+        answer = "❌ Model not hosted" if e.response.status_code==404 else f"⚠️ HF error: {e}"
     except Exception as e:
+        answer = f"⚠️ Unexpected error: {e}"
+    new_history = history + [
+        {"role":"user", "content":question},
+        {"role":"assistant","content":answer}
+    ]
+    return new_history, docs, images
     )
 if __name__ == "__main__":
+    demo.launch()