# Spaces: Tejha / pdfreader
# Sleeping
# File size: 3,847 Bytes
# 297bcca

import base64
import os
import re
import tempfile
from io import BytesIO

import pdfplumber
import requests
import streamlit as st
from docx import Document
from gtts import gTTS
from streamlit_lottie import st_lottie

# --- Load Assets ---
def load_lottieurl(url, timeout=10):
    """Download a Lottie animation and return its decoded JSON.

    Args:
        url: Address of the Lottie JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely, which would stall
            the whole script because this is called at module level.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server does not answer with HTTP 200, or the body is not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors, DNS failures, timeouts: the animation is
        # optional, so report "not available" instead of crashing the app.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # requests' JSON decode errors derive from ValueError.
        return None

# Lottie JSON for the animation rendered with st_lottie further down.
# load_lottieurl returns None when it cannot fetch the file, so this may be None.
lottie_astronaut = load_lottieurl("https://lottie.host/b86c724d-556d-4a7a-a9b2-277f8099687b/J5c91vW5qS.json")

# --- Functions ---
def read_docx(file):
    """Extract the paragraph text of a DOCX file.

    Args:
        file: Whatever ``Document`` accepts (the app passes the uploaded file).

    Returns:
        The text of every paragraph joined with newlines. If anything goes
        wrong, a string of the form ``"Error reading DOCX file: <reason>"``
        is returned instead of raising.
    """
    try:
        paragraphs = Document(file).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        return f"Error reading DOCX file: {err}"

def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``pdfplumber.open`` accepts (the app passes the
            uploaded file).

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. If the file cannot be processed, a string of the form
        ``"Error reading PDF file: <reason>"`` is returned instead of raising.
    """
    try:
        with pdfplumber.open(file) as pdf:
            # extract_text() can return None for a page without extractable
            # text (e.g. a scanned image) on some pdfplumber versions; treat
            # that as an empty page so one such page does not turn the whole
            # document into an error.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Single join instead of repeated string concatenation.
        return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception as e:
        return f"Error reading PDF file: {e}"

def analyze_text(text):
    """Compute basic statistics for a piece of text.

    Args:
        text: The text to analyse.

    Returns:
        A three-line string with the word count (whitespace-separated
        tokens), the character count (``len(text)``) and the sentence count.

    Sentences are counted heuristically: the text is split on runs of
    ``.``, ``!`` or ``?`` and every non-blank piece counts as one sentence.
    A run such as ``...`` or ``?!`` therefore ends a single sentence, while
    abbreviations and decimal numbers are still split at their dots.
    """
    # Basic analysis - you can expand this with more sophisticated NLP techniques
    word_count = len(text.split())
    char_count = len(text)
    # Splitting on "." alone missed sentences ending in "!" or "?".
    fragments = re.split(r"[.!?]+", text)
    sentence_count = sum(1 for fragment in fragments if fragment.strip())
    return f"Word Count: {word_count}\nCharacter Count: {char_count}\nSentence Count: {sentence_count}"

def text_to_speech(text, language='en'):
    """Synthesise ``text`` with gTTS and save it as an MP3 file.

    Args:
        text: The text to read aloud.
        language: Language code passed to gTTS as ``lang``.

    Returns:
        The path of the generated MP3 file, or ``None`` if synthesis failed
        (the failure is shown with ``st.error``). The file is not deleted
        here; the caller removes it once it is no longer needed.
    """
    audio_file = None
    try:
        tts = gTTS(text=text, lang=language, slow=False)
        # Use a unique file per call: a fixed name in the working directory
        # would be shared by every concurrent session of the app, letting one
        # session overwrite or delete another session's audio.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            audio_file = tmp.name
        # The handle is closed before gTTS reopens the path for writing.
        tts.save(audio_file)
        return audio_file
    except Exception as e:
        # Do not leave a partial or empty temp file behind on failure.
        if audio_file is not None:
            try:
                os.remove(audio_file)
            except OSError:
                pass
        st.error(f"Error generating speech: {e}")
        return None

def play_audio(audio_file):
    """Render an autoplaying HTML audio player for an MP3 file.

    The file at ``audio_file`` is read in binary mode, base64-encoded and
    embedded as a ``data:`` URI inside an ``<audio>`` element, which is then
    emitted through ``st.markdown`` with raw HTML enabled.
    """
    with open(audio_file, "rb") as audio_stream:
        encoded_audio = base64.b64encode(audio_stream.read()).decode()

    player_html = f"""
            <audio controls autoplay="true">
            <source src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3">
            </audio>
            """
    st.markdown(player_html, unsafe_allow_html=True)

# --- Streamlit App ---
# Page flow: header, file upload, text extraction, statistics, optional
# text-to-speech playback of the extracted text.
st.set_page_config(page_title="AI Document Reader & Analyzer", page_icon=":book:")

st.subheader("🚀 AI Document Reader & Analyzer")
# The animation is decorative and load_lottieurl returns None when it cannot
# be fetched, so skip it instead of handing None to st_lottie.
if lottie_astronaut is not None:
    st_lottie(lottie_astronaut, height=150)

uploaded_file = st.file_uploader("Upload a DOCX or PDF file", type=["docx", "pdf"])

if uploaded_file is not None:
    file_extension = uploaded_file.name.split(".")[-1].lower()
    document_text = ""

    with st.spinner(f"Reading and processing your {file_extension.upper()} file..."):
        if file_extension == "docx":
            document_text = read_docx(uploaded_file)
        elif file_extension == "pdf":
            document_text = read_pdf(uploaded_file)

    # read_docx / read_pdf signal failure by returning a message with one of
    # these prefixes rather than raising; show it as an error instead of
    # displaying, analysing and reading it aloud as document content.
    read_failed = document_text.startswith(
        ("Error reading DOCX file:", "Error reading PDF file:")
    )

    if read_failed:
        st.error(document_text)
    elif document_text.strip():
        # .strip(): a document that yields only whitespace/newlines (e.g. a
        # scanned PDF) has no usable text and falls through to the error below.
        st.subheader("Document Content:")
        st.text_area("Text from the document", document_text, height=300)

        st.subheader("Document Analysis:")
        analysis = analyze_text(document_text)
        # st.text keeps the line breaks between the statistics; st.write
        # renders Markdown, which folds single newlines into one line.
        st.text(analysis)

        st.subheader("Virtual Voice Reader:")
        language_choice = st.selectbox("Select language for voice:", ["en", "hi", "es", "fr", "de", "ja", "ko", "pt", "ru", "zh-cn"])
        if st.button("Read with Virtual Voice"):
            with st.spinner("Generating and playing audio..."):
                audio_file = text_to_speech(document_text, language=language_choice)
                if audio_file:
                    try:
                        play_audio(audio_file)
                    finally:
                        # Clean up the temporary audio file even if playback fails
                        os.remove(audio_file)
    else:
        st.error("Could not extract text from the uploaded file.")

st.markdown("---")
st.info("This AI Space can read DOCX and PDF files, analyze basic statistics, and read the content using a virtual voice. You can expand the analysis capabilities with more advanced Natural Language Processing (NLP) techniques.")