pdfreader / APP
Tejha's picture
Create APP
297bcca verified
import base64
import os
import tempfile
from io import BytesIO

import pdfplumber
import requests
import streamlit as st
from docx import Document
from gtts import gTTS
from streamlit_lottie import st_lottie
# --- Load Assets ---
def load_lottieurl(url, timeout=10):
    """Download a Lottie animation and return its parsed JSON.

    Args:
        url: Address of the Lottie JSON document.
        timeout: Seconds to wait for the server before giving up, so a
            slow or unreachable host cannot hang the app at start-up.

    Returns:
        The decoded JSON payload, or None when the request fails, the
        server answers with a non-200 status, or the body is not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # The animation is decorative; a network failure must not be fatal.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Raised when the body cannot be decoded as JSON.
        return None
# Animation shown under the page title; load_lottieurl returns None when the
# download does not answer with HTTP 200, so this value may be None.
lottie_astronaut = load_lottieurl("https://lottie.host/b86c724d-556d-4a7a-a9b2-277f8099687b/J5c91vW5qS.json")
# --- Functions ---
def read_docx(file):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file: Path or file-like object handed to ``Document``.

    Returns:
        The paragraph texts joined with newlines. If anything goes wrong
        while opening or reading, a string of the form
        "Error reading DOCX file: <reason>" is returned instead.
    """
    try:
        paragraphs = Document(file).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as exc:
        return f"Error reading DOCX file: {exc}"
def read_pdf(file):
    """Return the text of a PDF file, each page followed by a newline.

    Args:
        file: Path or file-like object handed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts. Pages that yield no text (for example
        image-only pages) contribute an empty line rather than aborting the
        whole document. If opening or reading fails, a string of the form
        "Error reading PDF file: <reason>" is returned instead.
    """
    try:
        with pdfplumber.open(file) as pdf:
            # extract_text() may yield None for a page without a text layer;
            # coalesce to "" so one such page does not raise TypeError.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
    except Exception as e:
        return f"Error reading PDF file: {e}"
def analyze_text(text):
    """Summarise basic statistics of *text* as a three-line report.

    Args:
        text: The document text to analyse.

    Returns:
        A string with the word count (whitespace-separated tokens), the
        character count (including whitespace) and the sentence count,
        one per line.
    """
    # Basic analysis - you can expand this with more sophisticated NLP techniques
    word_count = len(text.split())
    char_count = len(text)
    # Treat '!' and '?' as sentence terminators too; splitting on '.' alone
    # merges questions and exclamations into the neighbouring sentence.
    normalized = text.replace("!", ".").replace("?", ".")
    # Empty fragments (from "..." or trailing punctuation) are not sentences.
    sentence_count = sum(1 for part in normalized.split(".") if part.strip())
    return f"Word Count: {word_count}\nCharacter Count: {char_count}\nSentence Count: {sentence_count}"
def text_to_speech(text, language='en'):
    """Synthesise *text* to an MP3 file with gTTS.

    Args:
        text: The text to be spoken.
        language: Language code passed to gTTS as ``lang``.

    Returns:
        Path of the generated MP3 file, or None when synthesis fails (the
        failure is reported with ``st.error``). The caller owns the returned
        file and is responsible for deleting it.
    """
    audio_file = None
    try:
        tts = gTTS(text=text, lang=language, slow=False)
        # Reserve a unique path per call: a fixed file name would be shared
        # by every concurrent session, letting one overwrite or delete
        # another session's audio. The handle is closed before gTTS writes.
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as handle:
            audio_file = handle.name
        tts.save(audio_file)
        return audio_file
    except Exception as e:
        # Do not leave a partial or empty file behind when synthesis fails.
        if audio_file is not None:
            try:
                os.remove(audio_file)
            except OSError:
                pass
        st.error(f"Error generating speech: {e}")
        return None
def play_audio(audio_file):
    """Embed an auto-playing HTML audio player for the given file.

    Args:
        audio_file: Path of the audio file to read; its bytes are inlined
            into the page as a base64 data URI.
    """
    with open(audio_file, "rb") as stream:
        encoded = base64.b64encode(stream.read()).decode()
    # The markup is kept flush-left on purpose: it is handed to markdown.
    player_html = f"""
<audio controls autoplay="true">
<source src="data:audio/mp3;base64,{encoded}" type="audio/mp3">
</audio>
"""
    st.markdown(player_html, unsafe_allow_html=True)
# --- Streamlit App ---
# Page layout: title, optional animation, uploader, then the extracted text,
# its statistics and a text-to-speech control for the uploaded document.
st.set_page_config(page_title="AI Document Reader & Analyzer", page_icon=":book:")
st.subheader("🚀 AI Document Reader & Analyzer")
# load_lottieurl returns None when the animation could not be fetched; skip
# the decoration instead of handing None to st_lottie.
if lottie_astronaut is not None:
    st_lottie(lottie_astronaut, height=150)

uploaded_file = st.file_uploader("Upload a DOCX or PDF file", type=["docx", "pdf"])
if uploaded_file is not None:
    file_extension = uploaded_file.name.split(".")[-1].lower()
    document_text = ""
    with st.spinner(f"Reading and processing your {file_extension.upper()} file..."):
        if file_extension == "docx":
            document_text = read_docx(uploaded_file)
        elif file_extension == "pdf":
            document_text = read_pdf(uploaded_file)

    # read_docx / read_pdf signal failure by returning a message with one of
    # these prefixes; show it as an error rather than treating it as content.
    reader_failed = document_text.startswith(
        ("Error reading DOCX file:", "Error reading PDF file:")
    )
    if reader_failed:
        st.error(document_text)
    elif document_text.strip():
        # .strip() so a document that yields only blank lines counts as empty.
        st.subheader("Document Content:")
        st.text_area("Text from the document", document_text, height=300)

        st.subheader("Document Analysis:")
        analysis = analyze_text(document_text)
        # st.text keeps the line breaks; markdown rendering would collapse
        # the newline-separated statistics into a single line.
        st.text(analysis)

        st.subheader("Virtual Voice Reader:")
        language_choice = st.selectbox("Select language for voice:", ["en", "hi", "es", "fr", "de", "ja", "ko", "pt", "ru", "zh-cn"])
        if st.button("Read with Virtual Voice"):
            with st.spinner("Generating and playing audio..."):
                audio_file = text_to_speech(document_text, language=language_choice)
                if audio_file:
                    try:
                        play_audio(audio_file)
                    finally:
                        # Clean up the temporary audio file even if playback
                        # raised; its bytes are already embedded in the page.
                        os.remove(audio_file)
    else:
        st.error("Could not extract text from the uploaded file.")

st.markdown("---")
st.info("This AI Space can read DOCX and PDF files, analyze basic statistics, and read the content using a virtual voice. You can expand the analysis capabilities with more advanced Natural Language Processing (NLP) techniques.")