AlanXian committed on
Commit
e8c1452
·
1 Parent(s): 76eb9fc

update: use nougat-transformer

Browse files
Files changed (2) hide show
  1. app.py +58 -112
  2. requirements.txt +3 -3
app.py CHANGED
@@ -10,11 +10,17 @@ import sys
10
  import importlib.util
11
  from tqdm import tqdm
12
 
13
- # Check if nougat-ocr is installed
14
- NOUGAT_AVAILABLE = importlib.util.find_spec("nougat") is not None
15
- if not NOUGAT_AVAILABLE:
16
- print("Warning: nougat-ocr is not installed. PDF to Markdown conversion will not be available.")
17
- print("To install, run: pip install -U 'git+https://github.com/facebookresearch/nougat.git'")
 
 
 
 
 
 
18
 
19
  # Set an environment variable
20
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -28,7 +34,7 @@ DESCRIPTION = '''
28
  <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
29
  <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
30
  <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
31
- <p>📝 <b>PDF处理功能:</b> 本应用使用<a href="https://github.com/facebookresearch/nougat">Nougat</a>进行高质量PDF到Markdown的转换。该工具能够很好地保留原始布局、数学公式和表格,提供最佳的PDF文档处理体验。</p>
32
  </div>
33
  '''
34
 
@@ -87,115 +93,69 @@ except:
87
  if not terminators:
88
  terminators = [2] # 使用常见的</s>标记ID作为默认值
89
 
90
- # 使用CUDA运行NougatPDF处理函数
91
- def process_pdf_with_nougat_gpu(pdf_path, output_dir=None):
92
- """使用GPU运行Nougat处理PDF文件"""
93
- try:
94
- # 如果未指定输出目录,使用PDF所在目录
95
- if output_dir is None:
96
- output_dir = os.path.dirname(pdf_path)
97
-
98
- # 设置CUDA环境变量
99
- env = os.environ.copy()
100
- env["CUDA_VISIBLE_DEVICES"] = "0" # 使用第一个GPU
101
- env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
102
-
103
- # 执行带有GPU支持的Nougat命令
104
- print(f"使用GPU运行Nougat: {pdf_path}")
105
- cmd = ["nougat", pdf_path, "-o", output_dir, "--device", "cuda"]
106
-
107
- # 执行命令并捕获输出
108
- result = subprocess.run(
109
- cmd,
110
- stdout=subprocess.PIPE,
111
- stderr=subprocess.PIPE,
112
- text=True,
113
- env=env,
114
- timeout=300 # 5分钟超时
115
- )
116
-
117
- # 检查命令执行结果
118
- if result.returncode != 0:
119
- print(f"Nougat GPU处理失败: {result.stderr}")
120
- return None, result.stderr
121
-
122
- # 获取生成的markdown文件路径
123
- base_name = os.path.basename(pdf_path)
124
- name_without_ext = os.path.splitext(base_name)[0]
125
- markdown_path = os.path.join(output_dir, f"{name_without_ext}.mmd")
126
-
127
- # 检查markdown文件是否生成
128
- if not os.path.exists(markdown_path):
129
- return None, "Nougat处理完成,但未找到生成的Markdown文件"
130
-
131
- # 读取markdown内容
132
- with open(markdown_path, "r", encoding="utf-8") as f:
133
- markdown_content = f.read()
134
-
135
- return markdown_content, None
136
-
137
- except subprocess.TimeoutExpired:
138
- return None, "Nougat处理超时"
139
-
140
- except Exception as e:
141
- import traceback
142
- error = f"Nougat处理异常: {str(e)}\n{traceback.format_exc()}"
143
- print(error)
144
- return None, error
145
-
146
- # 使用Python API的GPU处理方式
147
  @spaces.GPU(stateless=True)
148
- def process_pdf_with_nougat_api(pdf_path):
149
- """使用Nougat Python API与GPU处理PDF文件"""
150
  try:
151
- # 导入必要的库
152
- from nougat import NougatModel
153
- from nougat.utils.checkpoint import get_checkpoint
154
- from nougat.dataset.rasterize import rasterize_paper
155
- import torch
156
-
157
  # 确保GPU可用
158
  if not torch.cuda.is_available():
159
- return None, "GPU不可用,无法使用Nougat API处理PDF"
160
 
161
  # 显示GPU信息
162
  device_count = torch.cuda.device_count()
163
  device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
164
  print(f"使用GPU: {device_name}, 可用GPU数量: {device_count}")
165
 
166
- # 初始化模型并移至GPU
167
- ckpt = get_checkpoint()
168
- model = NougatModel.from_pretrained(ckpt)
169
- device = torch.device("cuda")
 
 
 
170
  model = model.to(device)
171
 
172
- # 处理PDF
 
 
 
 
173
  markdown_content = ""
174
- pages = list(rasterize_paper(pdf_path))
175
 
176
- # 使用tqdm显示进度
177
- for page_idx, page in enumerate(tqdm(pages, desc="处理PDF页面")):
178
- page = page.to(device)
179
- markdown = model.inference(page)
180
- markdown_content += f"--- Page {page_idx+1} ---\n{markdown}\n\n"
 
 
 
 
 
 
 
 
 
 
181
 
182
  return markdown_content, None
183
 
184
  except Exception as e:
185
  import traceback
186
- error = f"Nougat API处理异常: {str(e)}\n{traceback.format_exc()}"
187
  print(error)
188
  return None, error
189
 
190
  # 添加PDF转换为Markdown函数
191
  def convert_pdf_to_markdown(pdf_file):
192
- """使用Nougat将PDF转换为Markdown (GPU优化版)"""
193
  if pdf_file is None:
194
  return "", "未上传PDF"
195
 
196
- # 检查Nougat是否可用
197
- if not NOUGAT_AVAILABLE:
198
- return "", "错误: Nougat未安装。请执行 'pip install -U \"git+https://github.com/facebookresearch/nougat.git\"' 安装后重试。"
199
 
200
  try:
201
  # 创建临时目录用于存储PDF和输出文件
@@ -205,39 +165,25 @@ def convert_pdf_to_markdown(pdf_file):
205
  with open(temp_pdf_path, "wb") as f:
206
  f.write(pdf_file)
207
 
208
- # 方法1: 首先尝试使用命令行GPU方式
209
- print("方法1: 尝试使用命令行GPU方式处理PDF...")
210
- markdown_content, error = process_pdf_with_nougat_gpu(temp_pdf_path, temp_dir)
211
-
212
- if markdown_content is not None:
213
- # 限制文本长度
214
- if len(markdown_content) > 20000:
215
- markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
216
-
217
- status = f"PDF已成功转换为Markdown (GPU命令行): 生成了{len(markdown_content)}个字符"
218
- return markdown_content, status
219
-
220
- # 方法2: 如果命令行方式失败,尝试使用Python API方式
221
- print(f"方法1失败: {error}")
222
- print("方法2: 尝试使用Python API GPU方式处理PDF...")
223
-
224
- markdown_content, api_error = process_pdf_with_nougat_api(temp_pdf_path)
225
 
226
  if markdown_content is not None:
227
  # 限制文本长度
228
  if len(markdown_content) > 20000:
229
  markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
230
 
231
- status = f"PDF已成功转换为Markdown (GPU API): 生成了{len(markdown_content)}个字符"
232
  return markdown_content, status
233
 
234
- # 所有方法都失败
235
- return "", f"PDF转换失败: 所有GPU方法都失败了\n命令行错误: {error}\nAPI错误: {api_error}"
236
 
237
  except Exception as e:
238
  import traceback
239
  error_details = traceback.format_exc()
240
- print(f"Nougat转换错误: {str(e)}\n{error_details}")
241
  return "", f"Markdown转换错误: {str(e)}"
242
 
243
  @spaces.GPU(duration=120, stateless=True)
@@ -362,16 +308,16 @@ with gr.Blocks(fill_height=True, css=css) as demo:
362
 
363
  clear_pdf_btn = gr.Button("清除PDF")
364
 
365
- if NOUGAT_AVAILABLE:
366
  nougat_info = """
367
  <div style="margin-top: 10px; margin-bottom: 10px;">
368
- <p><b>Nougat PDF处理:</b> 系统将使用Nougat将上传的PDF转换为高质量Markdown。Nougat能够很好地保留原始布局、数学公式和表格,远优于传统的PDF文本提取。</p>
369
  </div>
370
  """
371
  else:
372
  nougat_info = """
373
  <div style="margin-top: 10px; margin-bottom: 10px; color: #d32f2f;">
374
- <p><b>Nougat未安装:</b> PDF处理功能需要Nougat。请执行 <code>pip install -U 'git+https://github.com/facebookresearch/nougat.git'</code> 安装后重试。</p>
375
  </div>
376
  """
377
 
 
10
  import importlib.util
11
  from tqdm import tqdm
12
 
13
+ # Updated imports for transformers-based Nougat
14
+ TRANSFORMERS_NOUGAT_AVAILABLE = importlib.util.find_spec("transformers") is not None
15
+ try:
16
+ from transformers import VisionEncoderDecoderModel, NougatProcessor, NougatImageProcessor
17
+ from PIL import Image
18
+ import pdf2image
19
+ TRANSFORMERS_NOUGAT_AVAILABLE = True
20
+ except ImportError:
21
+ TRANSFORMERS_NOUGAT_AVAILABLE = False
22
+ print("Warning: transformers with Nougat support is not installed. PDF to Markdown conversion will not be available.")
23
+ print("To install required packages, run: pip install transformers pdf2image Pillow")
24
 
25
  # Set an environment variable
26
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
34
  <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
35
  <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
36
  <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
37
+ <p>📝 <b>PDF处理功能:</b> 本应用使用<a href="https://huggingface.co/docs/transformers/model_doc/nougat">Transformers Nougat</a>进行高质量PDF到Markdown的转换。该工具能够很好地保留原始布局、数学公式和表格,提供最佳的PDF文档处理体验。</p>
38
  </div>
39
  '''
40
 
 
93
  if not terminators:
94
  terminators = [2] # 使用常见的</s>标记ID作为默认值
95
 
96
+ # 使用transformers库中的Nougat模型处理PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  @spaces.GPU(stateless=True)
98
+ def process_pdf_with_transformers_nougat(pdf_path):
99
+ """使用transformers库中的Nougat模型将PDF转换为Markdown"""
100
  try:
 
 
 
 
 
 
101
  # 确保GPU可用
102
  if not torch.cuda.is_available():
103
+ return None, "GPU不可用,无法使用Nougat处理PDF"
104
 
105
  # 显示GPU信息
106
  device_count = torch.cuda.device_count()
107
  device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
108
  print(f"使用GPU: {device_name}, 可用GPU数量: {device_count}")
109
 
110
+ # 加载Nougat模型和处理器
111
+ processor = NougatProcessor.from_pretrained("facebook/nougat-base")
112
+ image_processor = NougatImageProcessor.from_pretrained("facebook/nougat-base")
113
+ model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
114
+
115
+ # 将模型移到GPU
116
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
117
  model = model.to(device)
118
 
119
+ # PDF转换为图像
120
+ print(f"将PDF转换为图像: {pdf_path}")
121
+ images = pdf2image.convert_from_path(pdf_path)
122
+
123
+ # 处理每一页并生成Markdown
124
  markdown_content = ""
 
125
 
126
+ for page_idx, image in enumerate(tqdm(images, desc="处理PDF页面")):
127
+ # 处理图像
128
+ pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)
129
+
130
+ # 生成文本
131
+ outputs = model.generate(
132
+ pixel_values,
133
+ max_length=1024,
134
+ num_beams=4,
135
+ early_stopping=True
136
+ )
137
+
138
+ # 解码输出
139
+ page_markdown = processor.decode(outputs[0], skip_special_tokens=True)
140
+ markdown_content += f"--- Page {page_idx+1} ---\n{page_markdown}\n\n"
141
 
142
  return markdown_content, None
143
 
144
  except Exception as e:
145
  import traceback
146
+ error = f"Transformers Nougat处理异常: {str(e)}\n{traceback.format_exc()}"
147
  print(error)
148
  return None, error
149
 
150
  # 添加PDF转换为Markdown函数
151
  def convert_pdf_to_markdown(pdf_file):
152
+ """使用Transformers Nougat将PDF转换为Markdown"""
153
  if pdf_file is None:
154
  return "", "未上传PDF"
155
 
156
+ # 检查Transformers Nougat是否可用
157
+ if not TRANSFORMERS_NOUGAT_AVAILABLE:
158
+ return "", "错误: Transformers Nougat未安装。请执行 'pip install transformers pdf2image Pillow' 安装后重试。"
159
 
160
  try:
161
  # 创建临时目录用于存储PDF和输出文件
 
165
  with open(temp_pdf_path, "wb") as f:
166
  f.write(pdf_file)
167
 
168
+ # 使用Transformers Nougat处理PDF
169
+ print("使用Transformers Nougat处理PDF...")
170
+ markdown_content, error = process_pdf_with_transformers_nougat(temp_pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if markdown_content is not None:
173
  # 限制文本长度
174
  if len(markdown_content) > 20000:
175
  markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
176
 
177
+ status = f"PDF已成功转换为Markdown (Transformers Nougat): 生成了{len(markdown_content)}个字符"
178
  return markdown_content, status
179
 
180
+ # 处理失败
181
+ return "", f"PDF转换失败: Transformers Nougat处理失败\n错误: {error}"
182
 
183
  except Exception as e:
184
  import traceback
185
  error_details = traceback.format_exc()
186
+ print(f"Transformers Nougat转换错误: {str(e)}\n{error_details}")
187
  return "", f"Markdown转换错误: {str(e)}"
188
 
189
  @spaces.GPU(duration=120, stateless=True)
 
308
 
309
  clear_pdf_btn = gr.Button("清除PDF")
310
 
311
+ if TRANSFORMERS_NOUGAT_AVAILABLE:
312
  nougat_info = """
313
  <div style="margin-top: 10px; margin-bottom: 10px;">
314
+ <p><b>Transformers Nougat PDF处理:</b> 系统将使用Transformers库中的Nougat模型将上传的PDF转换为高质量Markdown。Nougat能够很好地保留原始布局、数学公式和表格,远优于传统的PDF文本提取。</p>
315
  </div>
316
  """
317
  else:
318
  nougat_info = """
319
  <div style="margin-top: 10px; margin-bottom: 10px; color: #d32f2f;">
320
+ <p><b>Transformers Nougat未安装:</b> PDF处理功能需要Transformers Nougat。请执行 <code>pip install transformers pdf2image Pillow</code> 安装后重试。</p>
321
  </div>
322
  """
323
 
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
  huggingface_hub
2
  pydantic==2.10.6
3
- transformers[torch]
4
  torch
5
  tqdm
6
  accelerate
7
  gradio
8
  python-dotenv
9
  albumentations==1.3.1
10
- # 直接从GitHub源码安装Nougat,绕过版本兼容性问题
11
- git+https://github.com/facebookresearch/nougat.git
 
1
  huggingface_hub
2
  pydantic==2.10.6
3
+ transformers[torch]>=4.36.0
4
  torch
5
  tqdm
6
  accelerate
7
  gradio
8
  python-dotenv
9
  albumentations==1.3.1
10
+ pdf2image
11
+ Pillow