Spaces:
Sleeping
Sleeping
Added TTS
Browse files- app.py +37 -8
- header.html +124 -115
- tts.py +46 -0
app.py
CHANGED
|
@@ -1,34 +1,55 @@
|
|
| 1 |
# app.py
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
|
|
|
|
|
|
| 4 |
from gradio_pdf import PDF
|
|
|
|
| 5 |
from model import model_initialized
|
| 6 |
from pdf_processor import to_pdf, to_markdown
|
| 7 |
-
from
|
| 8 |
-
import logging
|
| 9 |
|
| 10 |
# Set up logging
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
| 12 |
|
| 13 |
# Load header HTML content
|
| 14 |
-
with open("header.html", "r") as file:
|
| 15 |
header = file.read()
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
|
|
|
|
|
|
|
|
|
| 19 |
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
| 20 |
-
cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
| 23 |
|
| 24 |
all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
|
| 25 |
|
| 26 |
-
#
|
| 27 |
def file_to_pdf(file_obj):
|
| 28 |
if file_obj is not None:
|
| 29 |
return to_pdf(file_obj.name)
|
| 30 |
return None
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
with gr.Blocks() as demo:
|
| 33 |
gr.HTML(header)
|
| 34 |
with gr.Row():
|
|
@@ -65,13 +86,21 @@ with gr.Blocks() as demo:
|
|
| 65 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
| 66 |
with gr.Tab("Markdown text"):
|
| 67 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
|
|
|
|
|
|
|
|
|
| 68 |
|
|
|
|
| 69 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
|
|
|
| 70 |
convert_button.click(
|
| 71 |
fn=to_markdown,
|
| 72 |
inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
| 73 |
outputs=[md_render, md_text, output_file, pdf_display]
|
| 74 |
)
|
|
|
|
|
|
|
|
|
|
| 75 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
| 76 |
|
| 77 |
if __name__ == "__main__":
|
|
|
|
| 1 |
# app.py
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
| 4 |
+
import logging
|
| 5 |
+
import tempfile
|
| 6 |
from gradio_pdf import PDF
|
| 7 |
+
from config import config
|
| 8 |
from model import model_initialized
|
| 9 |
from pdf_processor import to_pdf, to_markdown
|
| 10 |
+
from tts import text_to_speech_openai, text_to_speech_gtts
|
|
|
|
| 11 |
|
| 12 |
# Set up logging
|
| 13 |
logging.basicConfig(level=logging.INFO)
|
| 14 |
|
| 15 |
# Load header HTML content
|
| 16 |
+
with open("header.html", "r", encoding="utf-8") as file:
|
| 17 |
header = file.read()
|
| 18 |
|
| 19 |
+
# Define language options (could also be moved to config.yaml)
|
| 20 |
+
latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
| 21 |
+
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
|
| 22 |
+
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
|
| 23 |
+
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
|
| 24 |
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
| 25 |
+
cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
| 26 |
+
'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
|
| 27 |
+
devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
|
| 28 |
+
'sa', 'bgc']
|
| 29 |
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
| 30 |
|
| 31 |
all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
|
| 32 |
|
| 33 |
+
# Define a function to convert a file to a PDF (if not already)
|
| 34 |
def file_to_pdf(file_obj):
    """Convert an uploaded file to PDF.

    Args:
        file_obj: The value delivered by the upload component: an object
            exposing its path as ``.name``, a plain path string, or ``None``
            when there is no upload.

    Returns:
        Whatever ``to_pdf`` returns for the uploaded path, or ``None`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is None:
        return None
    # Accept both file wrappers (path stored in ``.name``) and bare path strings.
    return to_pdf(getattr(file_obj, "name", file_obj))
|
| 38 |
|
| 39 |
+
# Define a function to handle TTS using OpenAI (with fallback)
|
| 40 |
+
def read_text(text, language="en"):
    """Read ``text`` aloud and return a status message for the UI.

    OpenAI TTS is tried first; on any error the failure is logged and gTTS is
    used instead.

    Args:
        text: Text to synthesize. ``None``, empty or whitespace-only text is
            rejected without calling a TTS backend.
        language: Language code. The UI passes the OCR language selector,
            whose options include ``''``, ``'auto'`` and OCR-specific names
            such as ``'ch'`` or ``'japan'``; those are translated to TTS
            language codes before use.

    Returns:
        A human-readable status string. Failures of both backends are
        reported in the string instead of being raised.
    """
    if not text or not text.strip():
        return "No text to read"

    # OCR selector values that are not valid TTS language codes.
    # NOTE(review): targets follow gTTS naming -- confirm against
    # gtts.lang.tts_langs() for the installed version.
    ocr_to_tts = {
        "": "en",
        "auto": "en",
        "ch": "zh-CN",
        "chinese_cht": "zh-TW",
        "korean": "ko",
        "japan": "ja",
        "french": "fr",
        "german": "de",
    }
    language = ocr_to_tts.get(language or "", language)

    try:
        text_to_speech_openai(text, language)
    except Exception as e:
        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
        try:
            text_to_speech_gtts(text, language)
        except Exception as fallback_error:
            # Surface the failure in the status box rather than crashing the
            # event handler.
            logging.exception("gTTS fallback failed")
            return f"TTS failed: {fallback_error}"
    return "Audio played successfully"
|
| 51 |
+
|
| 52 |
+
# Set up the Gradio Blocks interface
|
| 53 |
with gr.Blocks() as demo:
|
| 54 |
gr.HTML(header)
|
| 55 |
with gr.Row():
|
|
|
|
| 86 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
| 87 |
with gr.Tab("Markdown text"):
|
| 88 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 89 |
+
# TTS components
|
| 90 |
+
read_button = gr.Button("Read Out Loud")
|
| 91 |
+
read_status = gr.Textbox(label="TTS Status")
|
| 92 |
|
| 93 |
+
# Define interactions
|
| 94 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
| 95 |
+
|
| 96 |
convert_button.click(
|
| 97 |
fn=to_markdown,
|
| 98 |
inputs=[file_input, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
| 99 |
outputs=[md_render, md_text, output_file, pdf_display]
|
| 100 |
)
|
| 101 |
+
|
| 102 |
+
read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
|
| 103 |
+
|
| 104 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
| 105 |
|
| 106 |
if __name__ == "__main__":
|
header.html
CHANGED
|
@@ -1,132 +1,141 @@
|
|
| 1 |
<html>
|
| 2 |
<head>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
<style>
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
</style>
|
| 32 |
</head>
|
| 33 |
|
| 34 |
-
<body>
|
| 35 |
-
<div style="
|
| 36 |
-
display: flex;
|
| 37 |
-
flex-direction: column;
|
| 38 |
-
justify-content: center;
|
| 39 |
-
align-items: center;
|
| 40 |
-
text-align: center;
|
| 41 |
-
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
| 42 |
-
padding: 24px;
|
| 43 |
-
gap: 24px;
|
| 44 |
-
border-radius: 8px;
|
| 45 |
-
">
|
| 46 |
<div style="
|
| 47 |
display: flex;
|
| 48 |
flex-direction: column;
|
|
|
|
| 49 |
align-items: center;
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
">
|
| 52 |
-
<div style="
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
"
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
</div>
|
| 63 |
-
</div>
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
</div>
|
| 126 |
-
</div>
|
| 127 |
-
|
| 128 |
-
<!-- New Demo Links -->
|
| 129 |
-
</div>
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
<html>
|
| 2 |
<head>
|
| 3 |
+
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
|
| 4 |
+
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
| 5 |
+
<style>
|
| 6 |
+
.link-block {
|
| 7 |
+
border: 1px solid transparent;
|
| 8 |
+
border-radius: 24px;
|
| 9 |
+
background-color: rgba(54, 54, 54, 1);
|
| 10 |
+
cursor: pointer !important;
|
| 11 |
+
}
|
| 12 |
+
.link-block:hover {
|
| 13 |
+
background-color: rgba(54, 54, 54, 0.75) !important;
|
| 14 |
+
cursor: pointer !important;
|
| 15 |
+
}
|
| 16 |
+
.external-link {
|
| 17 |
+
display: inline-flex;
|
| 18 |
+
align-items: center;
|
| 19 |
+
height: 36px;
|
| 20 |
+
line-height: 36px;
|
| 21 |
+
padding: 0 16px;
|
| 22 |
+
cursor: pointer !important;
|
| 23 |
+
}
|
| 24 |
+
.external-link,
|
| 25 |
+
.external-link:hover {
|
| 26 |
+
cursor: pointer !important;
|
| 27 |
+
}
|
| 28 |
+
a {
|
| 29 |
+
text-decoration: none;
|
| 30 |
+
}
|
| 31 |
+
</style>
|
| 32 |
</head>
|
| 33 |
|
| 34 |
+
<body>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
<div style="
|
| 36 |
display: flex;
|
| 37 |
flex-direction: column;
|
| 38 |
+
justify-content: center;
|
| 39 |
align-items: center;
|
| 40 |
+
text-align: center;
|
| 41 |
+
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
| 42 |
+
padding: 24px;
|
| 43 |
+
gap: 24px;
|
| 44 |
+
border-radius: 8px;
|
| 45 |
">
|
| 46 |
+
<div style="
|
| 47 |
+
display: flex;
|
| 48 |
+
flex-direction: column;
|
| 49 |
+
align-items: center;
|
| 50 |
+
gap: 16px;
|
| 51 |
+
">
|
| 52 |
+
<div style="display: flex; flex-direction: column; gap: 8px">
|
| 53 |
+
<h1 style="
|
| 54 |
+
font-size: 48px;
|
| 55 |
+
color: #fafafa;
|
| 56 |
+
margin: 0;
|
| 57 |
+
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
| 58 |
+
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
| 59 |
+
">
|
| 60 |
+
MinerU: PDF Extraction & Voice Reading Demo
|
| 61 |
+
</h1>
|
| 62 |
+
</div>
|
| 63 |
</div>
|
|
|
|
| 64 |
|
| 65 |
+
<p style="
|
| 66 |
+
margin: 0;
|
| 67 |
+
line-height: 1.6rem;
|
| 68 |
+
font-size: 16px;
|
| 69 |
+
color: #fafafa;
|
| 70 |
+
opacity: 0.8;
|
| 71 |
+
">
|
| 72 |
+
A one-stop, open-source, high-quality tool for data extraction and PDF voice reading,<br>
|
| 73 |
+
supporting PDF, webpage, and e-book extraction.
|
| 74 |
+
</p>
|
| 75 |
+
<style>
|
| 76 |
+
.link-block {
|
| 77 |
+
display: inline-block;
|
| 78 |
+
}
|
| 79 |
+
.link-block + .link-block {
|
| 80 |
+
margin-left: 20px;
|
| 81 |
+
}
|
| 82 |
+
</style>
|
| 83 |
|
| 84 |
+
<div class="column has-text-centered">
|
| 85 |
+
<div class="publication-links">
|
| 86 |
+
<!-- Code Link. -->
|
| 87 |
+
<span class="link-block">
|
| 88 |
+
<a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 89 |
+
<span class="icon" style="margin-right: 4px">
|
| 90 |
+
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
| 91 |
+
</span>
|
| 92 |
+
<span style="color: white">Code</span>
|
| 93 |
+
</a>
|
| 94 |
+
</span>
|
| 95 |
|
| 96 |
+
<!-- arXiv Link. -->
|
| 97 |
+
<span class="link-block">
|
| 98 |
+
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 99 |
+
<span class="icon" style="margin-right: 8px">
|
| 100 |
+
<i class="fas fa-file" style="color: white"></i>
|
| 101 |
+
</span>
|
| 102 |
+
<span style="color: white">Paper</span>
|
| 103 |
+
</a>
|
| 104 |
+
</span>
|
| 105 |
|
| 106 |
+
<!-- Homepage Link. -->
|
| 107 |
+
<span class="link-block">
|
| 108 |
+
<a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 109 |
+
<span class="icon" style="margin-right: 8px">
|
| 110 |
+
<i class="fas fa-home" style="color: white"></i>
|
| 111 |
+
</span>
|
| 112 |
+
<span style="color: white">Homepage</span>
|
| 113 |
+
</a>
|
| 114 |
+
</span>
|
| 115 |
|
| 116 |
+
<!-- Client Link. -->
|
| 117 |
+
<span class="link-block">
|
| 118 |
+
<a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 119 |
+
<span class="icon" style="margin-right: 8px">
|
| 120 |
+
<i class="fas fa-download" style="color: white"></i>
|
| 121 |
+
</span>
|
| 122 |
+
<span style="color: white">Download</span>
|
| 123 |
+
</a>
|
| 124 |
+
</span>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
<!-- Voice Reading Demo Link. -->
|
| 127 |
+
<span class="link-block">
|
| 128 |
+
<a href="https://mineru.org.cn/voice?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 129 |
+
<span class="icon" style="margin-right: 8px">
|
| 130 |
+
<i class="fas fa-volume-up" style="color: white"></i>
|
| 131 |
+
</span>
|
| 132 |
+
<span style="color: white">Voice Reading Demo</span>
|
| 133 |
+
</a>
|
| 134 |
+
</span>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
|
| 138 |
+
<!-- New Demo Links -->
|
| 139 |
+
</div>
|
| 140 |
+
</body>
|
| 141 |
+
</html>
|
tts.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tts.py
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
import requests
|
| 5 |
+
from playsound import playsound
|
| 6 |
+
|
| 7 |
+
def text_to_speech_openai(text, language="en"):
    """Synthesize ``text`` with the OpenAI speech endpoint and play it.

    Args:
        text: Text to speak.
        language: Accepted for signature parity with ``text_to_speech_gtts``;
            the speech endpoint takes no language argument, so it is unused.

    Raises:
        ValueError: If the ``api_key_oai`` environment variable is unset.
        RuntimeError: If the synthesis request fails (the original error is
            chained as ``__cause__``).
    """
    api_key = os.getenv("api_key_oai")
    if not api_key:
        raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")

    # Imported lazily, and only after the key check, so a missing key is
    # reported as such even when the openai package is not installed.
    import openai

    try:
        # Uses the openai>=1.0 client; ``openai.Audio.synthesize`` is not part
        # of the SDK, so the previous call could never succeed.
        client = openai.OpenAI(api_key=api_key)
        response = client.audio.speech.create(model="tts-1", voice="alloy", input=text)
        audio_data = response.content
    except Exception as e:
        raise RuntimeError(f"OpenAI TTS synthesis failed: {e}") from e

    # delete=False so the player can open the file by path after it is closed;
    # the finally clause removes it so temp files do not accumulate.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        with tmp_file:
            tmp_file.write(audio_data)
        # NOTE(review): playsound plays through the audio device of the
        # machine running this process, so a remote browser user would hear
        # nothing -- confirm this is intended.
        playsound(tmp_file.name)
    finally:
        os.remove(tmp_file.name)
|
| 36 |
+
|
| 37 |
+
def text_to_speech_gtts(text, language="en"):
    """Fallback text-to-speech: synthesize ``text`` with gTTS and play it.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        language: Language code passed to gTTS as ``lang``.

    Raises:
        ValueError: If ``text`` is ``None``, empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text to speak")

    from gtts import gTTS

    tts = gTTS(text=text, lang=language)
    # Reserve a path and close the handle before gTTS writes to it: saving to
    # a path that is still held open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_file_path = tmp_file.name
    try:
        tts.save(tmp_file_path)
        playsound(tmp_file_path)
    finally:
        # delete=False means nobody else cleans this up.
        os.remove(tmp_file_path)
|