Spaces:

AlanXian
/

shuishanllm_chat

Runtime error

App Files Files Community

AlanXian committed on Apr 7

Commit

e8c1452

1 Parent(s): 76eb9fc

update: use nougat-transformer

Browse files

Files changed (2) hide show

app.py +58 -112
requirements.txt +3 -3

app.py CHANGED Viewed

@@ -10,11 +10,17 @@ import sys
 import importlib.util
 from tqdm import tqdm
-# Check if nougat-ocr is installed
-NOUGAT_AVAILABLE = importlib.util.find_spec("nougat") is not None
-if not NOUGAT_AVAILABLE:
-    print("Warning: nougat-ocr is not installed. PDF to Markdown conversion will not be available.")
-    print("To install, run: pip install -U 'git+https://github.com/facebookresearch/nougat.git'")
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
@@ -28,7 +34,7 @@ DESCRIPTION = '''
 <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
 <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
 <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
-<p>📝 <b>PDF处理功能:</b> 本应用使用<a href="https://github.com/facebookresearch/nougat">Nougat</a>进行高质量PDF到Markdown的转换。该工具能够很好地保留原始布局、数学公式和表格，提供最佳的PDF文档处理体验。</p>
 </div>
 '''
@@ -87,115 +93,69 @@ except:
 if not terminators:
     terminators = [2]  # 使用常见的</s>标记ID作为默认值
-# 使用CUDA运行Nougat的PDF处理函数
-def process_pdf_with_nougat_gpu(pdf_path, output_dir=None):
-    """使用GPU运行Nougat处理PDF文件"""
-    try:
-        # 如果未指定输出目录，使用PDF所在目录
-        if output_dir is None:
-            output_dir = os.path.dirname(pdf_path)
-        # 设置CUDA环境变量
-        env = os.environ.copy()
-        env["CUDA_VISIBLE_DEVICES"] = "0"  # 使用第一个GPU
-        env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-        # 执行带有GPU支持的Nougat命令
-        print(f"使用GPU运行Nougat: {pdf_path}")
-        cmd = ["nougat", pdf_path, "-o", output_dir, "--device", "cuda"]
-        # 执行命令并捕获输出
-        result = subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            env=env,
-            timeout=300  # 5分钟超时
-        )
-        # 检查命令执行结果
-        if result.returncode != 0:
-            print(f"Nougat GPU处理失败: {result.stderr}")
-            return None, result.stderr
-        # 获取生成的markdown文件路径
-        base_name = os.path.basename(pdf_path)
-        name_without_ext = os.path.splitext(base_name)[0]
-        markdown_path = os.path.join(output_dir, f"{name_without_ext}.mmd")
-        # 检查markdown文件是否生成
-        if not os.path.exists(markdown_path):
-            return None, "Nougat处理完成，但未找到生成的Markdown文件"
-        # 读取markdown内容
-        with open(markdown_path, "r", encoding="utf-8") as f:
-            markdown_content = f.read()
-        return markdown_content, None
-    except subprocess.TimeoutExpired:
-        return None, "Nougat处理超时"
-    except Exception as e:
-        import traceback
-        error = f"Nougat处理异常: {str(e)}\n{traceback.format_exc()}"
-        print(error)
-        return None, error
-# 使用Python API的GPU处理方式
 @spaces.GPU(stateless=True)
-def process_pdf_with_nougat_api(pdf_path):
-    """使用Nougat Python API与GPU处理PDF文件"""
     try:
-        # 导入必要的库
-        from nougat import NougatModel
-        from nougat.utils.checkpoint import get_checkpoint
-        from nougat.dataset.rasterize import rasterize_paper
-        import torch
         # 确保GPU可用
         if not torch.cuda.is_available():
-            return None, "GPU不可用，无法使用Nougat API处理PDF"
         # 显示GPU信息
         device_count = torch.cuda.device_count()
         device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
         print(f"使用GPU: {device_name}, 可用GPU数量: {device_count}")
-        # 初始化模型并移至GPU
-        ckpt = get_checkpoint()
-        model = NougatModel.from_pretrained(ckpt)
-        device = torch.device("cuda")
         model = model.to(device)
-        # 处理PDF
         markdown_content = ""
-        pages = list(rasterize_paper(pdf_path))
-        # 使用tqdm显示进度
-        for page_idx, page in enumerate(tqdm(pages, desc="处理PDF页面")):
-            page = page.to(device)
-            markdown = model.inference(page)
-            markdown_content += f"--- Page {page_idx+1} ---\n{markdown}\n\n"
         return markdown_content, None
     except Exception as e:
         import traceback
-        error = f"Nougat API处理异常: {str(e)}\n{traceback.format_exc()}"
         print(error)
         return None, error
 # 添加PDF转换为Markdown函数
 def convert_pdf_to_markdown(pdf_file):
-    """使用Nougat将PDF转换为Markdown (GPU优化版)"""
     if pdf_file is None:
         return "", "未上传PDF"
-    # 检查Nougat是否可用
-    if not NOUGAT_AVAILABLE:
-        return "", "错误: Nougat未安装。请执行 'pip install -U \"git+https://github.com/facebookresearch/nougat.git\"' 安装后重试。"
     try:
         # 创建临时目录用于存储PDF和输出文件
@@ -205,39 +165,25 @@ def convert_pdf_to_markdown(pdf_file):
             with open(temp_pdf_path, "wb") as f:
                 f.write(pdf_file)
-            # 方法1: 首先尝试使用命令行GPU方式
-            print("方法1: 尝试使用命令行GPU方式处理PDF...")
-            markdown_content, error = process_pdf_with_nougat_gpu(temp_pdf_path, temp_dir)
-            if markdown_content is not None:
-                # 限制文本长度
-                if len(markdown_content) > 20000:
-                    markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
-                status = f"PDF已成功转换为Markdown (GPU命令行): 生成了{len(markdown_content)}个字符"
-                return markdown_content, status
-            # 方法2: 如果命令行方式失败，尝试使用Python API方式
-            print(f"方法1失败: {error}")
-            print("方法2: 尝试使用Python API GPU方式处理PDF...")
-            markdown_content, api_error = process_pdf_with_nougat_api(temp_pdf_path)
             if markdown_content is not None:
                 # 限制文本长度
                 if len(markdown_content) > 20000:
                     markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
-                status = f"PDF已成功转换为Markdown (GPU API): 生成了{len(markdown_content)}个字符"
                 return markdown_content, status
-            # 所有方法都失败
-            return "", f"PDF转换失败: 所有GPU方法都失败了\n命令行错误: {error}\nAPI错误: {api_error}"
     except Exception as e:
         import traceback
         error_details = traceback.format_exc()
-        print(f"Nougat转换错误: {str(e)}\n{error_details}")
         return "", f"Markdown转换错误: {str(e)}"
 @spaces.GPU(duration=120, stateless=True)
@@ -362,16 +308,16 @@ with gr.Blocks(fill_height=True, css=css) as demo:
             clear_pdf_btn = gr.Button("清除PDF")
-            if NOUGAT_AVAILABLE:
                 nougat_info = """
                 <div style="margin-top: 10px; margin-bottom: 10px;">
-                    <p><b>Nougat PDF处理:</b> 系统将使用Nougat将上传的PDF转换为高质量Markdown。Nougat能够很好地保留原始布局、数学公式和表格，远优于传统的PDF文本提取。</p>
                 </div>
                 """
             else:
                 nougat_info = """
                 <div style="margin-top: 10px; margin-bottom: 10px; color: #d32f2f;">
-                    <p><b>Nougat未安装:</b> PDF处理功能需要Nougat。请执行 <code>pip install -U 'git+https://github.com/facebookresearch/nougat.git'</code> 安装后重试。</p>
                 </div>
                 """

 import importlib.util
 from tqdm import tqdm
+# Updated imports for transformers-based Nougat
+TRANSFORMERS_NOUGAT_AVAILABLE = importlib.util.find_spec("transformers") is not None
+try:
+    from transformers import VisionEncoderDecoderModel, NougatProcessor, NougatImageProcessor
+    from PIL import Image
+    import pdf2image
+    TRANSFORMERS_NOUGAT_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_NOUGAT_AVAILABLE = False
+    print("Warning: transformers with Nougat support is not installed. PDF to Markdown conversion will not be available.")
+    print("To install required packages, run: pip install transformers pdf2image Pillow")
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate to run privately!</p>
 <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
 <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b</p>
+<p>📝 <b>PDF处理功能:</b> 本应用使用<a href="https://huggingface.co/docs/transformers/model_doc/nougat">Transformers Nougat</a>进行高质量PDF到Markdown的转换。该工具能够很好地保留原始布局、数学公式和表格，提供最佳的PDF文档处理体验。</p>
 </div>
 '''
 if not terminators:
     terminators = [2]  # 使用常见的</s>标记ID作为默认值
+# 使用transformers库中的Nougat模型处理PDF
 @spaces.GPU(stateless=True)
+def process_pdf_with_transformers_nougat(pdf_path):
+    """使用transformers库中的Nougat模型将PDF转换为Markdown"""
     try:
         # 确保GPU可用
         if not torch.cuda.is_available():
+            return None, "GPU不可用，无法使用Nougat处理PDF"
         # 显示GPU信息
         device_count = torch.cuda.device_count()
         device_name = torch.cuda.get_device_name(0) if device_count > 0 else "Unknown"
         print(f"使用GPU: {device_name}, 可用GPU数量: {device_count}")
+        # 加载Nougat模型和处理器
+        processor = NougatProcessor.from_pretrained("facebook/nougat-base")
+        image_processor = NougatImageProcessor.from_pretrained("facebook/nougat-base")
+        model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
+        # 将模型移到GPU
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model = model.to(device)
+        # 将PDF转换为图像
+        print(f"将PDF转换为图像: {pdf_path}")
+        images = pdf2image.convert_from_path(pdf_path)
+        # 处理每一页并生成Markdown
         markdown_content = ""
+        for page_idx, image in enumerate(tqdm(images, desc="处理PDF页面")):
+            # 处理图像
+            pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)
+            # 生成文本
+            outputs = model.generate(
+                pixel_values,
+                max_length=1024,
+                num_beams=4,
+                early_stopping=True
+            )
+            # 解码输出
+            page_markdown = processor.decode(outputs[0], skip_special_tokens=True)
+            markdown_content += f"--- Page {page_idx+1} ---\n{page_markdown}\n\n"
         return markdown_content, None
     except Exception as e:
         import traceback
+        error = f"Transformers Nougat处理异常: {str(e)}\n{traceback.format_exc()}"
         print(error)
         return None, error
 # 添加PDF转换为Markdown函数
 def convert_pdf_to_markdown(pdf_file):
+    """使用Transformers Nougat将PDF转换为Markdown"""
     if pdf_file is None:
         return "", "未上传PDF"
+    # 检查Transformers Nougat是否可用
+    if not TRANSFORMERS_NOUGAT_AVAILABLE:
+        return "", "错误: Transformers Nougat未安装。请执行 'pip install transformers pdf2image Pillow' 安装后重试。"
     try:
         # 创建临时目录用于存储PDF和输出文件
             with open(temp_pdf_path, "wb") as f:
                 f.write(pdf_file)
+            # 使用Transformers Nougat处理PDF
+            print("使用Transformers Nougat处理PDF...")
+            markdown_content, error = process_pdf_with_transformers_nougat(temp_pdf_path)
             if markdown_content is not None:
                 # 限制文本长度
                 if len(markdown_content) > 20000:
                     markdown_content = markdown_content[:20000] + "\n\n...(Markdown内容已截断)"
+                status = f"PDF已成功转换为Markdown (Transformers Nougat): 生成了{len(markdown_content)}个字符"
                 return markdown_content, status
+            # 处理失败
+            return "", f"PDF转换失败: Transformers Nougat处理失败\n错误: {error}"
     except Exception as e:
         import traceback
         error_details = traceback.format_exc()
+        print(f"Transformers Nougat转换错误: {str(e)}\n{error_details}")
         return "", f"Markdown转换错误: {str(e)}"
 @spaces.GPU(duration=120, stateless=True)
             clear_pdf_btn = gr.Button("清除PDF")
+            if TRANSFORMERS_NOUGAT_AVAILABLE:
                 nougat_info = """
                 <div style="margin-top: 10px; margin-bottom: 10px;">
+                    <p><b>Transformers Nougat PDF处理:</b> 系统将使用Transformers库中的Nougat模型将上传的PDF转换为高质量Markdown。Nougat能够很好地保留原始布局、数学公式和表格，远优于传统的PDF文本提取。</p>
                 </div>
                 """
             else:
                 nougat_info = """
                 <div style="margin-top: 10px; margin-bottom: 10px; color: #d32f2f;">
+                    <p><b>Transformers Nougat未安装:</b> PDF处理功能需要Transformers Nougat。请执行 <code>pip install transformers pdf2image Pillow</code> 安装后重试。</p>
                 </div>
                 """

requirements.txt CHANGED Viewed

@@ -1,11 +1,11 @@
 huggingface_hub
 pydantic==2.10.6
-transformers[torch]
 torch
 tqdm
 accelerate
 gradio
 python-dotenv
 albumentations==1.3.1
-# 直接从GitHub源码安装Nougat，绕过版本兼容性问题
-git+https://github.com/facebookresearch/nougat.git

 huggingface_hub
 pydantic==2.10.6
+transformers[torch]>=4.36.0
 torch
 tqdm
 accelerate
 gradio
 python-dotenv
 albumentations==1.3.1
+pdf2image
+Pillow