Spaces:
Runtime error
Runtime error
Allen Park
committed on
Commit
·
e504a30
1
Parent(s):
1230f78
feat(pdf text extraction): extract all the text from the uploaded pdf file
Browse files
* feat: add simple filetype extraction helper function
* feat: add pdfplumber text extraction from pdf helper function
* chore: add conditionals to ensure filetypes are only pdf, txt, doc, or docx
---------
Co-authored-by: Allen Park <parknella19@gmail.com>
- app.py +26 -7
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
|
|
|
| 3 |
from typing import List, Tuple, Union
|
| 4 |
from pathlib import Path
|
| 5 |
import gradio as gr
|
| 6 |
import openai
|
| 7 |
-
|
| 8 |
-
|
| 9 |
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
@@ -133,17 +133,36 @@ def model_call(question, document, answer, client_base_url):
|
|
| 133 |
combined_reasoning = " ".join(reasoning)[1:-1]
|
| 134 |
return combined_reasoning, score
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def upload_file(filepath):
|
|
|
|
| 137 |
if filepath is not None:
|
| 138 |
name = Path(filepath).name
|
| 139 |
print("FILEPATH & file name", filepath, name)
|
| 140 |
print("FILEPATH type & file name type", type(filepath), type(name))
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
else:
|
| 143 |
-
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
|
| 144 |
# return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
|
| 145 |
def reset_buttons():
|
| 146 |
-
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
|
| 147 |
|
| 148 |
# def download_file():
|
| 149 |
# return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
|
|
@@ -185,8 +204,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 185 |
score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
|
| 186 |
|
| 187 |
model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
|
| 188 |
-
u.upload(upload_file, u, [u, file_group, file_name])
|
| 189 |
-
c.click(reset_buttons, None, [u, file_group, file_name])
|
| 190 |
# d.click(download_file, None, [u, d])
|
| 191 |
|
| 192 |
submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
+
import io
|
| 4 |
from typing import List, Tuple, Union
|
| 5 |
from pathlib import Path
|
| 6 |
import gradio as gr
|
| 7 |
import openai
|
| 8 |
+
import pdfplumber
|
|
|
|
| 9 |
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
|
|
| 133 |
combined_reasoning = " ".join(reasoning)[1:-1]
|
| 134 |
return combined_reasoning, score
|
| 135 |
|
| 136 |
+
def get_filetype(filename):
    """Return the extension of *filename*, lower-cased and without the dot.

    Args:
        filename: A file name such as ``"report.PDF"``.

    Returns:
        The text after the last ``"."`` in lower case (``"pdf"``), or an
        empty string when the name contains no dot at all.
    """
    # rpartition reports whether a dot was actually found, so a dot-less name
    # like "pdf" is not mistaken for an extension.
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""
|
| 138 |
+
|
| 139 |
+
def extract_text_pdfplumber(file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        file: Either a binary file-like object exposing ``read()`` or a
            filesystem path to the PDF.

    Returns:
        The text of all pages joined with newlines. Pages that yield no text
        contribute nothing to the result.
    """
    # A readable object is buffered into memory exactly as before; anything
    # else is handed to pdfplumber as a path.
    source = io.BytesIO(file.read()) if hasattr(file, "read") else file
    with pdfplumber.open(source) as pdf:
        # extract_text() may give None or "" for a page without a text layer
        # (depends on the pdfplumber version), so normalise to "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages so the last word of one page is not glued to the first
    # word of the next.
    return "\n".join(text for text in page_texts if text)
|
| 145 |
+
|
| 146 |
def _extract_text_docx(path):
    """Return the paragraph text of a .docx file, one paragraph per line."""
    import zipfile
    import xml.etree.ElementTree as ET

    word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    try:
        # A .docx file is a zip archive; the body lives in word/document.xml.
        with zipfile.ZipFile(path) as archive:
            root = ET.fromstring(archive.read("word/document.xml"))
    except (zipfile.BadZipFile, KeyError, ET.ParseError) as err:
        # Not a valid .docx: report it and fall back to no text instead of
        # crashing the upload callback.
        print("Could not read docx file", path, err)
        return ""
    paragraphs = (
        "".join(run.text or "" for run in paragraph.iter(word_ns + "t"))
        for paragraph in root.iter(word_ns + "p")
    )
    return "\n".join(paragraphs)


def upload_file(filepath):
    """Handle an upload event: extract the file's text and update the UI.

    Args:
        filepath: Path of the uploaded file (anything ``pathlib.Path``
            accepts), or ``None`` when there is no file.

    Returns:
        A four-item list: the upload button update, the file group update,
        the file-name markdown, and the extracted text (``""`` when nothing
        could be extracted).
    """
    if filepath is None:
        return [
            gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES),
            gr.Group(visible=False),
            gr.Markdown(""),
            "",
        ]

    path = Path(filepath)
    name = path.name
    print("FILEPATH & file name", filepath, name)
    print("FILEPATH type & file name type", type(filepath), type(name))

    # Lower-case here so "FILE.PDF" is handled like "file.pdf".
    filetype = get_filetype(name).lower()
    extracted_file_text = ""
    if filetype == "pdf":
        # filepath is a path, not an open file: open it so the extractor
        # receives a readable binary stream.
        with path.open("rb") as pdf_file:
            extracted_file_text = extract_text_pdfplumber(pdf_file)
    elif filetype == "txt":
        # errors="replace" keeps a non-UTF-8 text file from crashing the callback.
        extracted_file_text = path.read_text(encoding="utf-8", errors="replace")
    elif filetype == "docx":
        extracted_file_text = _extract_text_docx(path)
    elif filetype == "doc":
        # Legacy binary .doc cannot be decoded as UTF-8 text and has no
        # standard-library parser, so no text is extracted.
        print("Text extraction is not supported for .doc files:", name)

    return [
        gr.UploadButton(visible=False),
        gr.Group(visible=True),
        gr.Markdown("**Uploaded file:** {name}".format(name=name)),
        extracted_file_text,
    ]
|
| 163 |
# return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
|
| 164 |
def reset_buttons():
    """Build the component updates for the cleared, nothing-uploaded state.

    Returns:
        A four-item list: a visible single-file upload button, a hidden
        group, an empty markdown label, and a textbox with an empty value.
    """
    upload_button = gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES)
    hidden_group = gr.Group(visible=False)
    blank_label = gr.Markdown("")
    blank_document = gr.Textbox(value="")
    return [upload_button, hidden_group, blank_label, blank_document]
|
| 166 |
|
| 167 |
# def download_file():
|
| 168 |
# return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
|
|
|
|
| 204 |
score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
|
| 205 |
|
| 206 |
model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
|
| 207 |
+
u.upload(upload_file, u, [u, file_group, file_name, document])
|
| 208 |
+
c.click(reset_buttons, None, [u, file_group, file_name, document])
|
| 209 |
# d.click(download_file, None, [u, d])
|
| 210 |
|
| 211 |
submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
openai
|
|
|
|
|
|
| 1 |
openai
|
| 2 |
+
pdfplumber
|