Spaces:
Runtime error
Runtime error
Allen Park
committed on
Commit
·
901a87e
1
Parent(s):
e504a30
fix(pdfplumber): replace the pdfplumber package and implementation with pymupdf
Browse files
- app.py +7 -9
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import List, Tuple, Union
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gradio as gr
|
| 7 |
import openai
|
| 8 |
-
import pdfplumber
|
| 9 |
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
@@ -136,11 +136,11 @@ def model_call(question, document, answer, client_base_url):
|
|
| 136 |
def get_filetype(filename):
|
| 137 |
return filename.split(".")[-1]
|
| 138 |
|
| 139 |
-
def
|
| 140 |
-
with
|
| 141 |
text = ""
|
| 142 |
-
for page in
|
| 143 |
-
text += page.
|
| 144 |
return text
|
| 145 |
|
| 146 |
def upload_file(filepath):
|
|
@@ -151,10 +151,8 @@ def upload_file(filepath):
|
|
| 151 |
print("FILEPATH type & file name type", type(filepath), type(name))
|
| 152 |
filetype = get_filetype(name)
|
| 153 |
# conditionals for filetype and function call
|
| 154 |
-
if filetype == "pdf":
|
| 155 |
-
extracted_file_text =
|
| 156 |
-
elif filetype == "txt":
|
| 157 |
-
extracted_file_text = filepath.read().decode("utf-8")
|
| 158 |
elif filetype == "docx" or filetype == "doc":
|
| 159 |
extracted_file_text = filepath.read().decode("utf-8")
|
| 160 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
import gradio as gr
|
| 7 |
import openai
|
| 8 |
+
import pymupdf
|
| 9 |
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
|
|
| 136 |
def get_filetype(filename):
|
| 137 |
return filename.split(".")[-1]
|
| 138 |
|
| 139 |
+
def extract_text_pymupdf(file):
|
| 140 |
+
with pymupdf.open(file) as pdf_or_txt:
|
| 141 |
text = ""
|
| 142 |
+
for page in pdf_or_txt:
|
| 143 |
+
text += page.get_text()
|
| 144 |
return text
|
| 145 |
|
| 146 |
def upload_file(filepath):
|
|
|
|
| 151 |
print("FILEPATH type & file name type", type(filepath), type(name))
|
| 152 |
filetype = get_filetype(name)
|
| 153 |
# conditionals for filetype and function call
|
| 154 |
+
if filetype == "pdf" or filetype == "txt":
|
| 155 |
+
extracted_file_text = extract_text_pymupdf(filepath)
|
|
|
|
|
|
|
| 156 |
elif filetype == "docx" or filetype == "doc":
|
| 157 |
extracted_file_text = filepath.read().decode("utf-8")
|
| 158 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
requirements.txt
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
openai
|
| 2 |
-
pdfplumber
|
|
|
|
| 1 |
openai
|
| 2 |
+
PyMuPDF
|