AWS update + image description
- __pycache__/inference_svm_model.cpython-310.pyc +0 -0
- __pycache__/mineru_single.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- download_models_hf.py +1 -1
- mineru.log +0 -0
- mineru_single.py +119 -47
__pycache__/inference_svm_model.cpython-310.pyc
CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
download_models_hf.py
CHANGED
@@ -60,7 +60,7 @@ if __name__ == '__main__':
     json_mods = {
         'models-dir': model_dir,
         'layoutreader-model-dir': layoutreader_model_dir,
-        'device-mode': 'cuda'
+        'device-mode': 'cuda',
     }

     download_and_modify_json(json_url, config_file, json_mods)
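For context, download_and_modify_json (defined elsewhere in this repo, not shown in this diff) fetches the template config from json_url, overlays the json_mods entries, and writes the result to config_file. A minimal sketch of that behaviour, assuming the merge is a shallow dict update (the helper name below is hypothetical):

import json
import requests

def download_and_modify_json_sketch(json_url: str, config_file: str, json_mods: dict) -> None:
    # Fetch the template magic-pdf config, overlay the local overrides
    # (model paths plus 'device-mode': 'cuda'), and persist it.
    config = requests.get(json_url).json()
    config.update(json_mods)
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)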
mineru.log
ADDED
File without changes
mineru_single.py
CHANGED
@@ -6,52 +6,52 @@ import requests
 import logging
 import torch
 import gc
-
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.data.io.s3 import S3Writer
 from magic_pdf.data.data_reader_writer.base import DataWriter
-
 from inference_svm_model import SVMModel
+import concurrent.futures
+import boto3
+from io import BytesIO

 logging.basicConfig(
     level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s"
+    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+    handlers=[
+        logging.StreamHandler(),  # This will output to console
+        logging.FileHandler('mineru.log')  # This will save to a file
+    ]
 )
+
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)  # Ensure logger level is set to INFO

 class Processor:
     def __init__(self):
         try:
-            self.s3_writer = S3Writer(
+            self.s3_writer = s3Writer(
                 ak=os.getenv("S3_ACCESS_KEY"),
                 sk=os.getenv("S3_SECRET_KEY"),
                 bucket=os.getenv("S3_BUCKET_NAME"),
                 endpoint_url=os.getenv("S3_ENDPOINT"),
             )
-
             # self.svm_model = SVMModel()
             # logger.info("Classification model initialized successfully")
-
             with open("/home/user/magic-pdf.json", "r") as f:
                 config = json.load(f)
-
             # self.layout_mode = "doclayout_yolo"
-
             self.layout_mode = config["layout-config"]["model"]
             self.formula_enable = config["formula-config"]["enable"]
-            self.table_enable = config["table-config"]["enable"]
+            self.table_enable = False
             self.language = "en"
-
             endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
             bucket = os.getenv("S3_BUCKET_NAME", "")
-            self.prefix = "
-
+            self.prefix = "document-extracts/"
             logger.info("Processor initialized successfully")
         except Exception as e:
             logger.error("Failed to initialize Processor: %s", str(e))
             raise
-
     def cleanup_gpu(self):
         """
         Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
@@ -63,22 +63,18 @@ class Processor:
             logger.info("GPU memory cleaned up.")
         except Exception as e:
             logger.error("Error during GPU cleanup: %s", e)
-
     def process(self, file_url: str, key: str) -> str:
         """
         Process a single PDF, returning final Markdown with irrelevant images removed.
         """
         logger.info("Processing file: %s", file_url)
-
         try:
             response = requests.get(file_url)
             if response.status_code != 200:
                 logger.error("Failed to download PDF from %s. Status code: %d", file_url, response.status_code)
                 raise Exception(f"Failed to download PDF: {file_url}")
-
             pdf_bytes = response.content
             logger.info("Downloaded %d bytes for file_url='%s'", len(pdf_bytes), file_url)
-
             # Analyze PDF with OCR
             dataset = PymuDocDataset(pdf_bytes)
             inference = doc_analyze(
@@ -90,60 +86,136 @@ class Processor:
                 table_enable=self.table_enable
             )
             logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
-
             # Classify images and remove irrelevant ones
             # image_writer = ImageWriter(self.s3_writer)
-
-            image_writer = ImageWriter(self.s3_writer, image_base_path) # Pass base path to ImageWriter
-
+            image_writer = ImageWriter(self.s3_writer, f"{self.prefix}{key}/") # Pass base path to ImageWriter
             pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
             logger.info("OCR pipeline completed for key='%s'.", key)
-
-
-            final_markdown = image_writer.remove_redundant_images(md_content)
+            md_content = pipe_result.get_markdown(f"{self.prefix}{key}/")
+            final_markdown = image_writer.post_process(f"{self.prefix}{key}/", md_content)
             logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
             return final_markdown
         finally:
             # GPU memory is cleaned up after each processing.
             self.cleanup_gpu()
+class s3Writer:
+    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
+        self.bucket = bucket
+        self.client = boto3.client('s3',
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=endpoint_url
+        )
+
+    def write(self, path: str, data: bytes) -> None:
+        """Upload data to S3 using proper keyword arguments"""
+        try:
+            # Convert bytes to file-like object
+            file_obj = BytesIO(data)
+
+            # Upload using upload_fileobj
+            self.client.upload_fileobj(
+                file_obj,
+                self.bucket,
+                path
+            )
+        except Exception as e:
+            logger.error(f"Failed to upload to S3: {str(e)}")
+            raise

 class ImageWriter(DataWriter):
     """
     Receives each extracted image. Classifies it, uploads if relevant, or flags
     it for removal if irrelevant.
     """
-    def __init__(self, s3_writer: S3Writer, base_path: str):
+    def __init__(self, s3_writer: s3Writer, base_path: str):
         self.s3_writer = s3_writer
         self.base_path = base_path
         # self.svm_model = svm_model
         self._redundant_images_paths = []
-
+        self.descriptions = {}
+        """
+        {
+            "{path}": {
+                "description": "{description}",
+                "full_path": "{full_path}"
+            }
+        }
+        """
+
     def write(self, path: str, data: bytes) -> None:
         """
         Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
         """
-
-        full_path = f"{self.base_path}{path}"
+        full_path = f"{self.base_path}" + path.split("/")[-1]
         self.s3_writer.write(full_path, data)
-
-
-
-
-
-
-
-
-
-
+        self.descriptions[path] = {
+            "data": data,
+            "full_path": full_path
+        }
+
+    def post_process(self, key: str, md_content: str) -> str:
+        max_workers = len(self.descriptions)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_file = {
+                executor.submit(
+                    call_gemini_for_image_description,
+                    self.descriptions[path]['data']
+                ): path for path in self.descriptions.keys()
+            }
+            for future in concurrent.futures.as_completed(future_to_file):
+                path = future_to_file[future]
+                try:
+                    description = future.result()
+                    if description:
+                        self.descriptions[path]['description'] = description
+                except Exception as e:
+                    logger.error(f"[ERROR] Processing {path}: {str(e)}")
+
+        for path, info in self.descriptions.items():
+            description = info['description']
+            full_path = info['full_path']
+            md_content = md_content.replace(f"![]({path})", f"![{description}]({full_path})")
         return md_content
-
+
+def call_gemini_for_image_description(image_data: bytes) -> str:
+    """Convert image bytes to Gemini-compatible format and get description"""
+    from google import genai
+    import base64
+
+    try:
+        # Initialize Gemini client
+        client = genai.Client(api_key="AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
+
+        # Generate content with proper image format
+        response = client.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[
+                {
+                    "parts": [
+                        {"text": """The provided image is a part of a question paper or markscheme. Extract all the necessary information from the image to be able to identify the question.
+                            For example, if there is an image that contains text like: "Q1 Part A Answer: Life on earth was created by deity..." you should return "Q1 Part A Answer"
+                            If there is no text on this image, return the description of the image. 20 words max."""},
+                        {
+                            "inline_data": {
+                                "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode('utf-8')
+                            }
+                        }
+                    ]
+                }
+            ]
+        )
+
+        # Get the response text
+        description = response.text.strip() if response and response.text else "Image description unavailable"
+        return description
+
+    except Exception as e:
+        logger.error(f"Error getting image description: {str(e)}")
+        return ("error", "Error describing image", None)
 if __name__ == "__main__":
     processor = Processor()
-
-
-
-    print("Single file Markdown:\n", markdown_result)
-
-    multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
-    batch_results = processor.process_batch(multiple_urls)
-    print("Batch results:", batch_results)
+    single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
+    markdown_result = processor.process(single_url, key="1234323")
+    print("Single file Markdown:\n", markdown_result)
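A rough usage sketch (not part of the commit): the rewritten module reads its S3 credentials from environment variables when Processor is constructed, uploads every extracted image to document-extracts/<key>/ through the new s3Writer, asks Gemini for a short description of each image in parallel, and returns Markdown that references the uploaded copies. All credential values and the PDF URL below are placeholders.

import os

# Placeholder credentials -- export real values before constructing Processor.
os.environ.setdefault("S3_ACCESS_KEY", "<access-key>")
os.environ.setdefault("S3_SECRET_KEY", "<secret-key>")
os.environ.setdefault("S3_BUCKET_NAME", "<bucket-name>")
os.environ.setdefault("S3_ENDPOINT", "https://s3.eu-west-2.amazonaws.com")  # placeholder endpoint

from mineru_single import Processor

processor = Processor()
# The key becomes the S3 folder under document-extracts/; the returned
# Markdown references the uploaded images together with their descriptions.
markdown = processor.process("https://example.com/sample.pdf", key="sample-run")
print(markdown[:500])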