HR_Doc_RAG

Sleeping

App Files Files Community

HR_Doc_RAG / app.py

SnehaAkula

Update app.py

ff02b0e verified over 1 year ago

raw

history blame contribute delete

4.96 kB

	import os
	import streamlit as st
	import fitz
	from PIL import Image
	import tempfile
	from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
	from langchain.chains.question_answering import load_qa_chain
	from docx import Document
	import io
	# from langchain_community.llms import HuggingFaceHub
	from langchain_huggingface import HuggingFaceEndpoint

	# The Hugging Face API token is read from the environment; fail fast with a
	# clear error when it is missing rather than later during a model call.
	huggingface_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

	if huggingface_token is None:
	    raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")

	# Shared LLM client handed to the question-answering chain.
	llm = HuggingFaceEndpoint(
	    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
	    huggingfacehub_api_token=huggingface_token,
	)

	# Give the session an empty conversation history if it does not have one yet.
	if "conversation_history" not in st.session_state:
	    st.session_state.conversation_history = []

	# Function to load document and perform question answering (cached)

	@st.cache_data
	def process_document(uploaded_file, query):
	    """Answer ``query`` against an uploaded PDF or Word document.

	    The upload is written to a temporary file because the document loaders
	    are given a filesystem path. The temporary file is always removed before
	    returning, including when loading or the LLM call raises.

	    Args:
	        uploaded_file: The object returned by the Streamlit file uploader; it
	            must expose ``name`` and ``getvalue()``.
	        query: The question to ask about the document. When blank, no LLM
	            call is made and the answer is the empty string.

	    Returns:
	        A ``(answer, document_text)`` tuple. ``document_text`` is the
	        newline-joined paragraph text for ``.docx`` files and ``None`` for
	        PDFs. Unsupported file types yield ``("", None)`` after showing an
	        error in the UI.
	    """
	    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
	    if file_extension not in (".pdf", ".docx"):
	        # Reject before touching the filesystem so no temp file is left behind.
	        st.error("Unsupported file format. Please upload a PDF file (.pdf) or a Word document (.docx).")
	        return "", None

	    # getvalue() returns the whole upload regardless of the current stream
	    # position, so an earlier read of the same object cannot truncate it.
	    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
	        tmp_file.write(uploaded_file.getvalue())

	    try:
	        if file_extension == ".pdf":
	            loader = PyPDFLoader(tmp_file.name)
	            document_text = None
	        else:
	            loader = Docx2txtLoader(tmp_file.name)
	            document = Document(tmp_file.name)
	            document_text = "\n".join(paragraph.text for paragraph in document.paragraphs)

	        # A blank question has nothing to answer; skip the model call.
	        if not (query and query.strip()):
	            return "", document_text

	        documents = loader.load()
	        chain = load_qa_chain(llm=llm, verbose=True)
	        response = chain.invoke({"input_documents": documents, "question": query})
	    finally:
	        # delete=False means nothing else removes the file for us.
	        os.unlink(tmp_file.name)

	    return response["output_text"], document_text





	# Function to update conversation history
	def update_conversation(query, response):
	    """Record one question/answer exchange in the session's conversation history."""
	    exchange = {"question": query, "answer": response}
	    st.session_state.conversation_history.append(exchange)

	# Function to convert PDF pages to images
	def pdf_to_images(pdf_bytes):
	    """Render every page of an in-memory PDF to a PIL image.

	    Args:
	        pdf_bytes: Raw bytes of a PDF file.

	    Returns:
	        A list of RGB ``PIL.Image`` objects, one per page in page order.
	        The list is empty when ``pdf_bytes`` is empty.
	    """
	    # Nothing to render; avoid handing an empty stream to the PDF parser.
	    if not pdf_bytes:
	        return []

	    # The context manager closes the document once rendering is done. Each
	    # image is built from the pixmap's sample bytes before the document closes.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        return [
	            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
	            for pixmap in (page.get_pixmap() for page in doc)
	        ]

	# Streamlit UI
	def main():
	    """Render the Streamlit page: a document preview plus sidebar Q&A controls.

	    The sidebar holds the file uploader, the question box and the "Ask"
	    button. The main area shows the uploaded document (text for ``.docx``,
	    a selectable page image with a download button for ``.pdf``) and the
	    latest question/answer pair.
	    """
	    st.sidebar.title("7steps.AI")
	    st.sidebar.markdown("---")

	    uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])

	    if uploaded_file is not None:
	        st.title("Document Content")
	        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
	        if file_extension == ".docx":
	            # Read the paragraphs straight from the upload. Going through
	            # process_document with an empty question would also run the
	            # QA chain just to display text.
	            document = Document(io.BytesIO(uploaded_file.getvalue()))
	            document_text = "\n".join(paragraph.text for paragraph in document.paragraphs)
	            st.text_area("Document Text", value=document_text, height=300)
	        elif file_extension == ".pdf":
	            images = pdf_to_images(uploaded_file.getvalue())
	            if images:
	                page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
	                # The widget is 1-based; the list of page images is 0-based.
	                current_page = images[page_number - 1]
	                st.image(current_page, caption=f"Page {page_number}", use_column_width=True)

	                # Offer the displayed page as a PNG download.
	                img_bytes = io.BytesIO()
	                current_page.save(img_bytes, format='PNG')
	                st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')

	    query = st.sidebar.text_input("Enter your question:")

	    if st.sidebar.button("Ask"):
	        if uploaded_file is None:
	            st.sidebar.write("Please upload a document first.")
	        elif not query.strip():
	            # Do not spend a model call on a blank question.
	            st.sidebar.write("Please enter a question.")
	        else:
	            response, _ = process_document(uploaded_file, query)
	            if response:  # Only show and record non-empty answers
	                st.write("You:", query)
	                st.write("AI:", response)
	                update_conversation(query, response)

	# # Display conversation history
	# st.title("Conversation History")
	# for item in st.session_state.conversation_history:
	# st.write("You:", item["question"])
	# st.write("AI:", item["answer"])

	# Launch the Streamlit UI only when this file is executed as the entry script.
	if __name__ == "__main__":
	    main()