Spaces:

Tejha
/

pdfreader

Sleeping

App Files Files Community

pdfreader / APP

Tejha

Create APP

297bcca verified 8 months ago

raw

history blame contribute delete

3.85 kB

	import base64
	import os
	import re
	import tempfile
	from io import BytesIO

	import pdfplumber
	import requests
	import streamlit as st
	from docx import Document
	from gtts import gTTS
	from streamlit_lottie import st_lottie

	# --- Load Assets ---
	def load_lottieurl(url, timeout=10):
	    """Download a Lottie animation definition and return it as parsed JSON.

	    Args:
	        url: Address of the Lottie JSON document.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The decoded JSON payload, or ``None`` when the request fails, the
	        server answers with a status other than 200, or the body is not
	        valid JSON.
	    """
	    try:
	        # Without a timeout an unresponsive host would block the script
	        # indefinitely, because this runs at import time.
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        return None
	    if response.status_code != 200:
	        return None
	    try:
	        return response.json()
	    except ValueError:
	        # The JSON decode errors raised by requests derive from ValueError.
	        return None

	# Animation data for the page header; None when the download did not succeed.
	LOTTIE_ASTRONAUT_URL = "https://lottie.host/b86c724d-556d-4a7a-a9b2-277f8099687b/J5c91vW5qS.json"
	lottie_astronaut = load_lottieurl(LOTTIE_ASTRONAUT_URL)

	# --- Functions ---
	def read_docx(file):
	    """Return the text of a DOCX document, one paragraph per line.

	    ``file`` is handed straight to ``Document``. Failures are not raised:
	    the function returns a message starting with "Error reading DOCX file:"
	    instead.
	    """
	    try:
	        paragraphs = Document(file).paragraphs
	        return "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as e:
	        return f"Error reading DOCX file: {e}"

	def read_pdf(file):
	    """Return the text of every page of a PDF, each page followed by a newline.

	    ``file`` is handed straight to ``pdfplumber.open``. A page that yields no
	    text contributes an empty line. Failures are not raised: the function
	    returns a message starting with "Error reading PDF file:" instead.
	    """
	    try:
	        with pdfplumber.open(file) as pdf:
	            # extract_text() can return None for a page without a text layer
	            # (depending on the pdfplumber version). Concatenating None with
	            # "\n" raised TypeError and threw away the text of all pages.
	            page_texts = [page.extract_text() or "" for page in pdf.pages]
	        return "".join(f"{page_text}\n" for page_text in page_texts)
	    except Exception as e:
	        return f"Error reading PDF file: {e}"

	# Runs of sentence-ending punctuation: ASCII . ! ? plus the ideographic full
	# stop (U+3002), full-width ! and ? (U+FF01, U+FF1F) and the Devanagari
	# danda (U+0964).
	_SENTENCE_END = re.compile(r"[.!?\u3002\uff01\uff1f\u0964]+")


	def analyze_text(text):
	    """Return basic statistics about ``text`` as a three-line summary string.

	    Words are whitespace-separated tokens, characters are counted with
	    ``len``, and sentences are the non-blank fragments left after splitting
	    on sentence-ending punctuation. This is a deliberately simple heuristic:
	    a decimal such as "3.14" still counts as a sentence boundary.

	    Args:
	        text: The document text to analyse.

	    Returns:
	        A string of the form
	        "Word Count: W", "Character Count: C", "Sentence Count: S",
	        joined by newlines.
	    """
	    word_count = len(text.split())
	    char_count = len(text)
	    # Splitting on "." alone missed sentences ending in "!" or "?".
	    sentence_count = sum(
	        1 for fragment in _SENTENCE_END.split(text) if fragment.strip()
	    )
	    return f"Word Count: {word_count}\nCharacter Count: {char_count}\nSentence Count: {sentence_count}"

	def text_to_speech(text, language='en'):
	    """Synthesize ``text`` to an MP3 file with gTTS and return the file's path.

	    The audio is written to a uniquely named temporary file, so sessions
	    running at the same time cannot overwrite or delete each other's audio
	    (a fixed file name in the working directory was shared by everyone).
	    The caller is responsible for deleting the returned file.

	    Args:
	        text: The text to speak.
	        language: gTTS language code for the voice.

	    Returns:
	        Path of the generated MP3 file, or ``None`` if generation failed, in
	        which case the error is reported through ``st.error``.
	    """
	    audio_file = None
	    try:
	        handle, audio_file = tempfile.mkstemp(suffix=".mp3")
	        # gTTS opens the path itself; only the reserved unique name is needed.
	        os.close(handle)
	        gTTS(text=text, lang=language, slow=False).save(audio_file)
	        return audio_file
	    except Exception as e:
	        # Do not leave an empty or partial file behind on failure.
	        if audio_file is not None and os.path.exists(audio_file):
	            os.remove(audio_file)
	        st.error(f"Error generating speech: {e}")
	        return None

	def play_audio(audio_file):
	    """Render the audio stored at ``audio_file`` as an autoplaying player.

	    The file's bytes are base64-encoded into a data URI inside an HTML
	    ``<audio>`` element, which is passed to ``st.markdown`` with raw HTML
	    enabled.
	    """
	    with open(audio_file, "rb") as audio_stream:
	        audio_bytes = audio_stream.read()
	    b64 = base64.b64encode(audio_bytes).decode()
	    audio_html = f"""
	<audio controls autoplay="true">
	<source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
	</audio>
	"""
	    st.markdown(audio_html, unsafe_allow_html=True)

	# --- Streamlit App ---
	st.set_page_config(page_title="AI Document Reader & Analyzer", page_icon=":book:")

	st.subheader("🚀 AI Document Reader & Analyzer")
	# load_lottieurl returns None when the animation could not be fetched; None is
	# not valid animation data, so skip the widget instead of failing the page.
	if lottie_astronaut is not None:
	    st_lottie(lottie_astronaut, height=150)

	uploaded_file = st.file_uploader("Upload a DOCX or PDF file", type=["docx", "pdf"])

	if uploaded_file is not None:
	    file_extension = uploaded_file.name.split(".")[-1].lower()
	    document_text = ""

	    with st.spinner(f"Reading and processing your {file_extension.upper()} file..."):
	        if file_extension == "docx":
	            document_text = read_docx(uploaded_file)
	        elif file_extension == "pdf":
	            document_text = read_pdf(uploaded_file)

	    # read_pdf emits one newline per page even when no text was found, so a
	    # plain truthiness test would treat a text-less document as content.
	    if document_text.strip():
	        st.subheader("Document Content:")
	        st.text_area("Text from the document", document_text, height=300)

	        st.subheader("Document Analysis:")
	        analysis = analyze_text(document_text)
	        st.write(analysis)

	        st.subheader("Virtual Voice Reader:")
	        language_choice = st.selectbox("Select language for voice:", ["en", "hi", "es", "fr", "de", "ja", "ko", "pt", "ru", "zh-cn"])
	        if st.button("Read with Virtual Voice"):
	            with st.spinner("Generating and playing audio..."):
	                audio_file = text_to_speech(document_text, language=language_choice)
	                if audio_file:
	                    try:
	                        play_audio(audio_file)
	                    finally:
	                        # Clean up the temporary audio file even if embedding
	                        # the player raised.
	                        os.remove(audio_file)
	    else:
	        st.error("Could not extract text from the uploaded file.")

	# Footer: a horizontal rule followed by a short description of the app.
	footer_text = (
	    "This AI Space can read DOCX and PDF files, analyze basic statistics, "
	    "and read the content using a virtual voice. "
	    "You can expand the analysis capabilities with more advanced "
	    "Natural Language Processing (NLP) techniques."
	)
	st.markdown("---")
	st.info(footer_text)