Spaces:

trungnd7112004
/

FastAPI-backend-chatbotRAG

Sleeping

File size: 6,048 Bytes

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
import time
import base64
import re
from groq import Groq
import os
from dotenv import load_dotenv
from pathlib import Path

load_dotenv()  # Load environment variables from .env file if present

def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert PDF to MD with image summaries. Returns MD string. (Server-adapted from select_file)"""
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    # Enable image extraction in pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    pipeline_options.generate_picture_images = True  # Key: enable image extraction

    converter = DocumentConverter(format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    })

    start_time = time.time()
    result = converter.convert(pdf_path)
    end_time = time.time()

    # Export to Markdown (placeholders like <!-- image --> will be present)
    md = result.document.export_to_markdown()

    # Extract images in a list of dicts
    images_list = []  # List to store dicts with image details

    for item, _ in result.document.iterate_items():
        if item.label == "picture":  # Targets figures/images
            image_data = item.image
            uri = str(image_data.uri)  # Data URI like 'data:image/png;base64,...'

            # Decode the base64 data
            match = re.match(r'data:image/(?P<type>.+);base64,(?P<data>.+)', uri)
            if match:
                img_type = match.group('type')  # e.g., 'png' or 'jpeg'
                img_bytes = base64.b64decode(match.group('data'))

                # Store in list
                images_list.append({
                    'page': item.prov[0].page_no if item.prov else 'Unknown',
                    'label': item.label,
                    'type': img_type,
                    'bytes': img_bytes,
                    'uri': uri
                })

    # Now, summarize images using VLM (Groq with Llama model)
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    prompt_template = """
    You are an expert research assistant in Artificial Intelligence. 
    Your task is to analyze and summarize a figure from a scientific paper.

    The figure may describe an overall architecture, workflow, plot, charts or experimental setup. 
    Provide a clear, detailed summary that helps a reader understand the design without seeing the image.

    When summarizing if figure is model architecture, include:
    - The main purpose of the figure (what problem it addresses).  
    - The overall structure (e.g., input/output, branches, modules, flows).  
    - The key components (e.g., encoders, decoders, adapters, loss functions).  
    - The interactions or data flow between components.  
    - Any special innovations or unique design choices.  
    if figure is charts, images or plot, analyze it.

    Format the summary inside **one section only**.  
    Do not create multiple headers like ## or ###.  
    Use bold or bullet points if needed.

    Now summarize the following figure:
    {image_caption_or_context}
    """

    image_summaries = []

    # Prepare list of base64 strings and types from images_list (assuming order matches placeholders)
    images = [(base64.b64encode(img['bytes']).decode('utf-8'), img['type']) for img in images_list]

    for img_b64, img_type in images:
        try:
            # Use correct MIME type based on extracted image type
            img_data_url = f"data:image/{img_type};base64,{img_b64}"

            completion = client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt_template},
                            {"type": "image_url", "image_url": {"url": img_data_url}}
                        ]
                    }
                ],
                temperature=0.0,
                max_completion_tokens=512,
                top_p=1,
                stream=False,
            )

            summary = completion.choices[0].message.content
            image_summaries.append(summary)

        except Exception as e:
            print(f"Error processing image: {e}")
            image_summaries.append("Error summarizing image.")

    # Replace placeholders in Markdown with summaries
    # Assuming placeholders are "<!-- image -->" and appear in the same order as extracted images
    placeholder = "<!-- image -->"
    if len(image_summaries) > 0:
        # Split the Markdown by placeholder
        md_parts = md.split(placeholder)
        if len(md_parts) == len(image_summaries) + 1:
            updated_md = md_parts[0]
            for i in range(len(image_summaries)):
                # Insert summary (formatted nicely in Markdown)
                updated_md += f"\n**Image Summary:**\n{image_summaries[i]}\n" + md_parts[i + 1]
            md = updated_md
        else:
            print("Warning: Number of placeholders doesn't match number of summaries.")

    # Save paper to file md
    # Extract the file name from the full file path
    # file_name = Path(pdf_path).stem + ".pdf"  # Use stem + .pdf to match original basename logic
    # os.makedirs("../data", exist_ok=True)
    #
    # # Save the file in the 'data' folder with the extracted file name
    # output_path = f"data/{file_name}.md"
    # with open(output_path, "w", encoding="utf-8") as f:
    #     f.write(md)
    return md

if __name__ == "__main__":
    # For local testing: Replace with your good PDF path
    pdf_path = r"E:\Study\AI\PE-CLIP.pdf"  # Update this!
    md = convert_pdf_to_md(pdf_path)
    print(md[:1000])  # Print first 1000 characters of the Markdown