Spaces:

Tejha
/

pdfreader

Sleeping

App Files Files Community

Tejha committed on Mar 25

Commit

f7b8004

verified ·

1 Parent(s): 297bcca

Create app.py

Browse files

Files changed (1) hide show

app.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import streamlit as st
+from streamlit_lottie import st_lottie
+import requests
+from io import BytesIO
+from docx import Document
+import pdfplumber
+from gtts import gTTS
+import os
+import base64
+# --- Load Assets ---
+def load_lottieurl(url):
+    r = requests.get(url)
+    if r.status_code != 200:
+        return None
+    return r.json()
+# Fetched once while the script loads; load_lottieurl returns None for a non-200 response.
+lottie_astronaut = load_lottieurl("https://lottie.host/b86c724d-556d-4a7a-a9b2-277f8099687b/J5c91vW5qS.json")
+# --- Functions ---
+def read_docx(file):
+    try:
+        document = Document(file)
+        full_text = []
+        for para in document.paragraphs:
+            full_text.append(para.text)
+        return "\n".join(full_text)
+    except Exception as e:
+        return f"Error reading DOCX file: {e}"
+def read_pdf(file):
+    try:
+        text = ""
+        with pdfplumber.open(file) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() + "\n"
+        return text
+    except Exception as e:
+        return f"Error reading PDF file: {e}"
+def analyze_text(text):
+    # Basic analysis - you can expand this with more sophisticated NLP techniques
+    word_count = len(text.split())
+    char_count = len(text)
+    sentences = text.split('.')
+    sentence_count = len([s.strip() for s in sentences if s.strip()])
+    return f"Word Count: {word_count}\nCharacter Count: {char_count}\nSentence Count: {sentence_count}"
+def text_to_speech(text, language='en'):
+    try:
+        tts = gTTS(text=text, lang=language, slow=False)
+        audio_file = 'temp_audio.mp3'
+        tts.save(audio_file)
+        return audio_file
+    except Exception as e:
+        st.error(f"Error generating speech: {e}")
+        return None
+def play_audio(audio_file):
+    with open(audio_file, "rb") as f:
+        data = f.read()
+        b64 = base64.b64encode(data).decode()
+        md = f"""
+            <audio controls autoplay="true">
+            <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
+            </audio>
+            """
+        st.markdown(md, unsafe_allow_html=True)
+# --- Streamlit App ---
+st.set_page_config(page_title="AI Document Reader & Analyzer", page_icon=":book:")
+st.subheader("🚀 AI Document Reader & Analyzer")
+st_lottie(lottie_astronaut, height=150)
+uploaded_file = st.file_uploader("Upload a DOCX or PDF file", type=["docx", "pdf"])
+if uploaded_file is not None:
+    file_extension = uploaded_file.name.split(".")[-1].lower()
+    document_text = ""
+    with st.spinner(f"Reading and processing your {file_extension.upper()} file..."):
+        if file_extension == "docx":
+            document_text = read_docx(uploaded_file)
+        elif file_extension == "pdf":
+            document_text = read_pdf(uploaded_file)
+    if document_text:
+        st.subheader("Document Content:")
+        st.text_area("Text from the document", document_text, height=300)
+        st.subheader("Document Analysis:")
+        analysis = analyze_text(document_text)
+        st.write(analysis)
+        st.subheader("Virtual Voice Reader:")
+        language_choice = st.selectbox("Select language for voice:", ["en", "hi", "es", "fr", "de", "ja", "ko", "pt", "ru", "zh-cn"])
+        if st.button("Read with Virtual Voice"):
+            with st.spinner("Generating and playing audio..."):
+                audio_file = text_to_speech(document_text, language=language_choice)
+                if audio_file:
+                    play_audio(audio_file)
+                    # Clean up the temporary audio file
+                    os.remove(audio_file)
+    else:
+        st.error("Could not extract text from the uploaded file.")
+st.markdown("---")
+st.info("This AI Space can read DOCX and PDF files, analyze basic statistics, and read the content using a virtual voice. You can expand the analysis capabilities with more advanced Natural Language Processing (NLP) techniques.")