Spaces:

PatronusAI
/

LynxDemo

Runtime error

Allen Park committed on Jul 29, 2024

Commit

e504a30

1 Parent(s): 1230f78

feat(pdf text extraction): extract all the text from the uploaded pdf file

* feat: add simple filetype extraction helper function
* feat: add pdfplumber text extraction from pdf helper function
* chore: add conditionals to ensure filetypes are only pdf, txt, doc, or docx
---------
Co-authored-by: Allen Park <parknella19@gmail.com>

Files changed (2) hide show

app.py +26 -7
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import os
 import re
 from typing import List, Tuple, Union
 from pathlib import Path
 import gradio as gr
 import openai
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
@@ -133,17 +133,36 @@ def model_call(question, document, answer, client_base_url):
     combined_reasoning = " ".join(reasoning)[1:-1]
     return combined_reasoning, score
 def upload_file(filepath):
     if filepath is not None:
         name = Path(filepath).name
         print("FILEPATH & file name", filepath, name)
         print("FILEPATH type & file name type", type(filepath), type(name))
-        return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name))]
     else:
-        return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
     # return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
 def reset_buttons():
-    return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown("")]
 # def download_file():
 #     return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
@@ -185,8 +204,8 @@ with gr.Blocks(css=css) as demo:
             score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
     model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
-    u.upload(upload_file, u, [u, file_group, file_name])
-    c.click(reset_buttons, None, [u, file_group, file_name])
     # d.click(download_file, None, [u, d])
     submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])

 import os
 import re
+import io
 from typing import List, Tuple, Union
 from pathlib import Path
 import gradio as gr
 import openai
+import pdfplumber
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
     combined_reasoning = " ".join(reasoning)[1:-1]
     return combined_reasoning, score
+def get_filetype(filename):
+    """Return the lowercased extension of *filename*, without the dot.
+
+    Returns "" when the name contains no dot, so an extension-less file whose
+    name happens to be e.g. "pdf" is not mistaken for that type.
+    """
+    _, dot, extension = filename.rpartition(".")
+    # Lowercase so "REPORT.PDF" and "report.pdf" are treated alike.
+    return extension.lower() if dot else ""
+def extract_text_pdfplumber(file):
+    """Return the text of every page of a PDF, concatenated in page order.
+
+    Args:
+        file: Either a binary file-like object (anything with ``read()``) or a
+            filesystem path to the PDF.
+
+    Returns:
+        The extracted text of all pages joined together; a page that yields
+        no text contributes an empty string.
+    """
+    # A file-like object is buffered into memory as before; a path is handed
+    # to pdfplumber directly, since a plain path has no ``read()`` method.
+    source = io.BytesIO(file.read()) if hasattr(file, "read") else file
+    with pdfplumber.open(source) as pdf:
+        # extract_text() may return None for a page without a text layer
+        # (version-dependent), which would break string concatenation.
+        return "".join(page.extract_text() or "" for page in pdf.pages)
 def upload_file(filepath):
+    """Handle a finished upload: extract the file's text and update the UI.
+
+    Args:
+        filepath: Path of the uploaded file, or None when nothing was uploaded.
+
+    Returns:
+        A four-item list: the upload button, the file group, the "Uploaded
+        file" markdown label, and the extracted text ("" when nothing could
+        be extracted).
+    """
+    extracted_file_text = ""
     if filepath is not None:
         name = Path(filepath).name
         print("FILEPATH & file name", filepath, name)
         print("FILEPATH type & file name type", type(filepath), type(name))
+        # Normalise case so "REPORT.PDF" is handled like "report.pdf".
+        filetype = get_filetype(name).lower()
+        # conditionals for filetype and function call
+        if filetype == "pdf":
+            # `filepath` is a path, not a file object, so open it before
+            # handing it to the extractor (which reads from a binary stream).
+            with open(filepath, "rb") as pdf_file:
+                extracted_file_text = extract_text_pdfplumber(pdf_file)
+        elif filetype == "txt":
+            # errors="replace" keeps a stray non-UTF-8 byte from aborting the upload.
+            extracted_file_text = Path(filepath).read_text(encoding="utf-8", errors="replace")
+        elif filetype == "docx" or filetype == "doc":
+            # NOTE(review): Word files are binary containers, not UTF-8 text, so
+            # decoding them raises or yields garbage. Leave the text empty until
+            # a real .doc/.docx parser is wired in.
+            print("Text extraction is not supported yet for file type:", filetype)
+        return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
     else:
+        return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
     # return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {name}", value=filepath, visible=True)]
 def reset_buttons():
+    """Return the widgets in their initial state: upload button shown, file group hidden, label and text cleared."""
+    upload_button = gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES)
+    hidden_group = gr.Group(visible=False)
+    empty_label = gr.Markdown("")
+    empty_text = gr.Textbox(value="")
+    return [upload_button, hidden_group, empty_label, empty_text]
 # def download_file():
 #     return [gr.UploadButton(visible=True), gr.DownloadButton(visible=False)]
             score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
     model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
+    u.upload(upload_file, u, [u, file_group, file_name, document])
+    c.click(reset_buttons, None, [u, file_group, file_name, document])
     # d.click(download_file, None, [u, d])
     submit_button.click(fn=model_call, inputs=[question, document, answer, base_url_state], outputs=[reasoning, score])

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	openai


1	openai
2	+ pdfplumber