File size: 6,048 Bytes
ee00031 a07893d ee00031 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
import time
import base64
import re
from groq import Groq
import os
from dotenv import load_dotenv
from pathlib import Path
load_dotenv() # Load environment variables from .env file if present
# Matches an inline base64 image data URI and captures its image subtype
# (e.g. 'png', 'jpeg') and its base64 payload.
_DATA_URI_RE = re.compile(r'data:image/(?P<type>.+);base64,(?P<data>.+)')

# Marker assumed to stand in for each picture in the exported Markdown, in the
# same order in which the pictures are iterated.
_IMAGE_PLACEHOLDER = "<!-- image -->"

# Instruction text sent to the vision model together with each figure.
# The figure itself travels as a separate image part of the same message, so
# the prompt carries no per-figure template field.
_FIGURE_PROMPT = """
You are an expert research assistant in Artificial Intelligence.
Your task is to analyze and summarize a figure from a scientific paper.
The figure may describe an overall architecture, workflow, plot, charts or experimental setup.
Provide a clear, detailed summary that helps a reader understand the design without seeing the image.
When summarizing if figure is model architecture, include:
- The main purpose of the figure (what problem it addresses).
- The overall structure (e.g., input/output, branches, modules, flows).
- The key components (e.g., encoders, decoders, adapters, loss functions).
- The interactions or data flow between components.
- Any special innovations or unique design choices.
if figure is charts, images or plot, analyze it.
Format the summary inside **one section only**.
Do not create multiple headers like ## or ###.
Use bold or bullet points if needed.
Now summarize the following figure:
"""


def _picture_data_url(item):
    """Return the picture's ``data:image/...;base64,...`` URL, or None if it has none."""
    image_ref = getattr(item, "image", None)
    if image_ref is None:
        # No rendered image is attached to this picture item.
        return None
    match = _DATA_URI_RE.match(str(image_ref.uri))
    if match is None:
        # The URI is not an inline base64 image, so there is nothing to send.
        return None
    return f"data:image/{match.group('type')};base64,{match.group('data')}"


def _summarize_figure(client, data_url):
    """Ask the vision model for a summary of one figure; never raises."""
    try:
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _FIGURE_PROMPT},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            temperature=0.0,
            max_completion_tokens=512,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Deliberate best effort: one failed figure must not abort the whole
        # conversion, so the failure is reported and a stub summary is used.
        print(f"Error processing image: {e}")
        return "Error summarizing image."


def _insert_summaries(md, summaries):
    """Replace the i-th image placeholder in *md* with the i-th summary.

    A ``None`` entry leaves its placeholder in place. If the number of
    placeholders does not match the number of entries, *md* is returned
    unchanged after printing a warning.
    """
    if not summaries:
        return md
    parts = md.split(_IMAGE_PLACEHOLDER)
    if len(parts) != len(summaries) + 1:
        print("Warning: Number of placeholders doesn't match number of summaries.")
        return md
    pieces = [parts[0]]
    for summary, tail in zip(summaries, parts[1:]):
        if summary is None:
            pieces.append(_IMAGE_PLACEHOLDER)
        else:
            pieces.append(f"\n**Image Summary:**\n{summary}\n")
        pieces.append(tail)
    return "".join(pieces)


def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert a PDF to Markdown, replacing figures with text summaries.

    The PDF is converted with docling (formula enrichment and picture image
    generation enabled) and exported to Markdown. Each picture that carries an
    inline base64 image is sent to a Groq-hosted vision model, and the returned
    summary replaces the matching ``<!-- image -->`` placeholder. Nothing is
    written to disk; the Markdown is only returned.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text. Pictures without a usable image keep their
        placeholder. If no picture has a usable image, or the placeholder
        count does not match the picture count, the exported Markdown is
        returned unmodified.

    Raises:
        ValueError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    pipeline_options.generate_picture_images = True  # Key: enable image extraction
    converter = DocumentConverter(format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    })
    result = converter.convert(pdf_path)

    # Export to Markdown (placeholders like <!-- image --> will be present)
    md = result.document.export_to_markdown()

    # One entry per picture item, in document order. Pictures without a usable
    # image stay in the list as None so later entries remain aligned with
    # their placeholders.
    data_urls = [
        _picture_data_url(item)
        for item, _ in result.document.iterate_items()
        if item.label == "picture"
    ]
    if not any(data_urls):
        # Nothing to summarise, so the Groq client (and its API key) is not needed.
        return md

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    summaries = [
        None if url is None else _summarize_figure(client, url)
        for url in data_urls
    ]
    return _insert_summaries(md, summaries)
if __name__ == "__main__":
    import sys

    # For local testing: pass a PDF path as the first command-line argument,
    # or edit the fallback below to point at a PDF on this machine.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else r"E:\Study\AI\PE-CLIP.pdf"
    md = convert_pdf_to_md(pdf_path)
    print(md[:1000])  # Preview only the first 1000 characters of the Markdown