Spaces:
Runtime error
Runtime error
Allen Park
committed on
Commit
·
6efea88
1
Parent(s):
901a87e
feat(docx text extraction): extract all the text from the uploaded docx file
Browse files* feat: add python-docx text extraction from pdf helper function
---------
Co-authored-by: Allen Park <parknella19@gmail.com>
- app.py +10 -2
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
| 6 |
import gradio as gr
|
| 7 |
import openai
|
| 8 |
import pymupdf
|
|
|
|
| 9 |
|
| 10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
@@ -143,6 +144,13 @@ def extract_text_pymupdf(file):
|
|
| 143 |
text += page.get_text()
|
| 144 |
return text
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def upload_file(filepath):
|
| 147 |
extracted_file_text = ""
|
| 148 |
if filepath is not None:
|
|
@@ -153,8 +161,8 @@ def upload_file(filepath):
|
|
| 153 |
# conditionals for filetype and function call
|
| 154 |
if filetype == "pdf" or filetype == "txt":
|
| 155 |
extracted_file_text = extract_text_pymupdf(filepath)
|
| 156 |
-
elif filetype == "docx"
|
| 157 |
-
extracted_file_text = filepath
|
| 158 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
| 159 |
else:
|
| 160 |
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
import openai
|
| 8 |
import pymupdf
|
| 9 |
+
from docx import Document
|
| 10 |
|
| 11 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 12 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
|
|
| 144 |
text += page.get_text()
|
| 145 |
return text
|
| 146 |
|
| 147 |
+
def extract_text_python_docx(file):
|
| 148 |
+
doc = Document(io.BytesIO(file))
|
| 149 |
+
text = ""
|
| 150 |
+
for paragraph in doc.paragraphs:
|
| 151 |
+
text += paragraph.text + '\n'
|
| 152 |
+
return text.strip()
|
| 153 |
+
|
| 154 |
def upload_file(filepath):
|
| 155 |
extracted_file_text = ""
|
| 156 |
if filepath is not None:
|
|
|
|
| 161 |
# conditionals for filetype and function call
|
| 162 |
if filetype == "pdf" or filetype == "txt":
|
| 163 |
extracted_file_text = extract_text_pymupdf(filepath)
|
| 164 |
+
elif filetype == "docx":
|
| 165 |
+
extracted_file_text = extract_text_python_docx(filepath)
|
| 166 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
| 167 |
else:
|
| 168 |
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
openai
|
| 2 |
-
PyMuPDF
|
|
|
|
|
|
| 1 |
openai
|
| 2 |
+
PyMuPDF
|
| 3 |
+
python-docx
|