|
|
import base64
import logging
import os
import re
import sys
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from dotenv import load_dotenv
from groq import Groq
|
|
|
|
|
# Populate os.environ from a local .env file (if one exists) at import time,
# so that GROQ_API_KEY is available when convert_pdf_to_md() reads it.
load_dotenv()
|
|
|
|
|
_logger = logging.getLogger(__name__)

# Marker the Markdown export leaves where a picture was in the document.
_IMAGE_PLACEHOLDER = "<!-- image -->"

# Inline base64 image data URI, e.g. "data:image/png;base64,iVBORw0...".
_DATA_URI_PATTERN = re.compile(r"data:image/(?P<type>.+);base64,(?P<data>.+)")

_VISION_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

# Text inserted in place of a summary when the model call fails.
_SUMMARY_FAILED = "Error summarizing image."

_FIGURE_PROMPT_TEMPLATE = """
You are an expert research assistant in Artificial Intelligence.
Your task is to analyze and summarize a figure from a scientific paper.

The figure may describe an overall architecture, workflow, plot, charts or experimental setup.
Provide a clear, detailed summary that helps a reader understand the design without seeing the image.

When summarizing if figure is model architecture, include:
- The main purpose of the figure (what problem it addresses).
- The overall structure (e.g., input/output, branches, modules, flows).
- The key components (e.g., encoders, decoders, adapters, loss functions).
- The interactions or data flow between components.
- Any special innovations or unique design choices.
if figure is charts, images or plot, analyze it.

Format the summary inside **one section only**.
Do not create multiple headers like ## or ###.
Use bold or bullet points if needed.

Now summarize the following figure:
{image_caption_or_context}
"""


def _collect_picture_slots(document) -> list:
    """Return one entry per picture item, in document iteration order.

    Each entry is a ``(page, data_uri)`` tuple, or ``None`` when the picture
    carries no inline base64 image. Keeping a ``None`` slot (instead of
    dropping the picture) keeps the list positionally aligned with the
    picture placeholders in the exported Markdown.
    """
    slots = []
    for item, _level in document.iterate_items():
        if item.label != "picture":
            continue
        image = getattr(item, "image", None)
        uri = str(image.uri) if image is not None else ""
        if _DATA_URI_PATTERN.match(uri) is None:
            slots.append(None)
            continue
        page = item.prov[0].page_no if item.prov else "Unknown"
        slots.append((page, uri))
    return slots


def _summarize_figure(client, page, data_url: str) -> str:
    """Ask the vision model for a summary of one figure; never raises."""
    prompt = _FIGURE_PROMPT_TEMPLATE.format(
        image_caption_or_context=(
            f"The figure is attached as an image (source page: {page})."
        )
    )
    try:
        completion = client.chat.completions.create(
            model=_VISION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            temperature=0.0,
            max_completion_tokens=512,
            top_p=1,
            stream=False,
        )
        summary = completion.choices[0].message.content
    except Exception:
        # Deliberately broad: summarizing is best-effort, and one failed
        # figure must not abort the conversion of the whole document.
        _logger.exception("Error processing image on page %s", page)
        return _SUMMARY_FAILED
    return summary or _SUMMARY_FAILED


def _insert_summaries(md: str, summaries: list) -> str:
    """Replace the i-th image placeholder in *md* with the i-th summary.

    A ``None`` summary leaves its placeholder untouched. If the number of
    placeholders does not match ``len(summaries)``, *md* is returned
    unchanged, because the positions cannot be matched up reliably.
    """
    parts = md.split(_IMAGE_PLACEHOLDER)
    if len(parts) != len(summaries) + 1:
        _logger.warning(
            "Number of placeholders (%d) doesn't match number of pictures (%d); "
            "leaving Markdown unchanged.",
            len(parts) - 1,
            len(summaries),
        )
        return md

    pieces = [parts[0]]
    for summary, tail in zip(summaries, parts[1:]):
        if summary is None:
            pieces.append(_IMAGE_PLACEHOLDER)
        else:
            pieces.append(f"\n**Image Summary:**\n{summary}\n")
        pieces.append(tail)
    return "".join(pieces)


def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert a PDF to Markdown, replacing pictures with text summaries.

    The PDF is converted with docling (formula enrichment and picture image
    generation enabled) and exported to Markdown. Every picture that carries
    an inline base64 image is sent to a Groq vision model, and its summary
    replaces the corresponding ``<!-- image -->`` placeholder.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text. It is returned without summaries when the document
        has no usable pictures or when the placeholder count does not match
        the picture count.

    Raises:
        ValueError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    pipeline_options.generate_picture_images = True
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )

    started = time.perf_counter()
    result = converter.convert(pdf_path)
    _logger.info(
        "Converted %s in %.2fs", pdf_path, time.perf_counter() - started
    )

    document = result.document
    md = document.export_to_markdown()

    slots = _collect_picture_slots(document)
    if not any(slot is not None for slot in slots):
        # Nothing to summarize: skip creating the API client altogether.
        return md

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    summaries = [
        None if slot is None else _summarize_figure(client, *slot)
        for slot in slots
    ]
    return _insert_summaries(md, summaries)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: convert one PDF and print the start of the result.
    # The PDF path may be given as the first command-line argument; without
    # one, the original development sample path is used.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else r"E:\Study\AI\PE-CLIP.pdf"
    md = convert_pdf_to_md(pdf_path)
    print(md[:1000])