MagicDash committed on
Commit
ec29974
·
verified ·
1 Parent(s): 6a5e10b

Upload 4 files

Browse files
Files changed (4) hide show
  1. dockerfile +17 -0
  2. requirements.txt +19 -0
  3. templates/analyze.html +143 -0
  4. webapp.py +279 -0
dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy only the dependency manifest first so the expensive pip layer is
# cached and only rebuilt when requirements.txt itself changes, not on
# every source-code edit.
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application source into the container at /app
COPY . /app

# Expose port 8000
EXPOSE 8000

# Run the application using uvicorn
CMD ["uvicorn", "webapp:app", "--host", "0.0.0.0", "--port", "8000"]
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ google-generativeai
3
+ langchain-google-genai
4
+ langchain
5
+ pypdf
6
+ langchain-community
7
+ unstructured
8
+ openpyxl
9
+ docx2txt
10
+ python-magic
11
+ python-pptx
12
+ jinja2
13
+ nest-asyncio
14
+ faiss-cpu
15
+ tiktoken
16
+ networkx
17
+ pandas
18
+ uvicorn
19
+ python-multipart
templates/analyze.html ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<!-- Jinja2 template rendered by webapp.py. Context variables:
     summary (HTML string or None), show_conversation (bool),
     question_responses (list of (question, response) pairs). -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>File Analysis</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css" rel="stylesheet">
    <style>
        body {
            background-color: #f8f9fa;
            font-family: 'Arial', sans-serif;
        }
        .container {
            margin-top: 50px;
            margin-bottom: 50px;
            border-radius: 10px;
            background: white;
            padding: 30px;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1);
        }
        h2, h3 {
            color: #343a40;
            margin-bottom: 20px;
        }
        .form-control, .form-select {
            margin-bottom: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }
        .form-control:focus, .form-select:focus {
            border-color: #007bff;
            box-shadow: 0 0 5px rgba(0, 123, 255, 0.5);
        }
        .btn {
            border-radius: 8px;
            transition: background-color 0.3s ease, transform 0.2s ease;
        }
        .btn-primary {
            background-color: #0d6efd;
            border: none;
        }
        .btn-primary:hover {
            background-color: #0b5ed7;
            transform: translateY(-2px);
        }
        .btn-secondary {
            background-color: #6c757d;
            border: none;
        }
        .btn-secondary:hover {
            background-color: #5a6268;
            transform: translateY(-2px);
        }
        .summary {
            background-color: #ffffff;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
            margin-top: 20px;
        }
        .list-group-item {
            background-color: #ffffff;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            margin-bottom: 10px;
        }
        .list-group-item:hover {
            background-color: #f1f1f1;
        }
        .conversation-history {
            margin-top: 20px;
        }
    </style>
</head>
<body>

<div class="container">
    <h2 class="text-center">File Analysis</h2>

    {% if not summary %}
    <!-- Upload form is hidden once a summary exists for the session. -->
    <form action="/" method="post" enctype="multipart/form-data" class="bg-light p-4 border rounded shadow-sm">
        <h5>Upload File</h5>
        <!-- "required" added: the server declares file: UploadFile = File(...),
             so an empty submission would otherwise fail with a 422. -->
        <input type="file" name="file" accept=".pdf,.pptx,.csv,.xlsx,.mp3,.docx" class="form-control" required>

        <label>Select Summary Length:</label>
        <select name="summary_length" class="form-select">
            <option value="2 sentences">Short</option>
            <option value="5 sentences">Medium</option>
            <option value="10 sentences">Long</option>
        </select>
        <br>

        <label>Who are you?</label>
        <input type="text" name="iam" id="iam" class="form-control" required>

        <label>What's the document context about?</label>
        <input type="text" name="context" id="context" class="form-control" required>

        <label>Output Expectation (What you want to analyze?)</label>
        <input type="text" name="output" id="output" class="form-control" required>

        <label>Input your Google Gemini API Key</label>
        <input type="text" name="api_key" id="api_key" class="form-control">

        <input type="submit" value="Analyze" class="btn btn-primary mt-3">
    </form>
    {% endif %}

    {% if summary %}
    <div class="summary">
        <h3>Summary:</h3>
        <p>{{ summary|safe }}</p>

        {% if show_conversation %}
        <h3>Conversation</h3>
        <form action="/ask" method="post" class="mb-3">
            <input type="text" name="question" class="form-control" placeholder="Ask your question">
            <input type="submit" value="Ask" class="btn btn-secondary mt-2">
        </form>
        {% endif %}
    </div>
    {% endif %}

    {% if question_responses %}
    <br>
    <h3>Conversation History:</h3>
    <ul class="list-group conversation-history">
        {% for question, response in question_responses %}
        <li class="list-group-item">
            <strong>Question:</strong> {{ question }}<br>
            <strong>Response:</strong> {{ response|safe }}
        </li>
        {% endfor %}
    </ul>
    {% endif %}
</div>

<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<!-- Fixed: stackpath.bootstrapcdn.com does not host Bootstrap 5.3.0 (the old
     URL 404'd). The jsdelivr bundle also ships Popper, so no separate
     popper.min.js script is needed. -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
</body>
</html>
webapp.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# webapp.py -- FastAPI front end that summarizes uploaded documents with
# Google Gemini via LangChain and answers follow-up questions about them.
from fastapi import FastAPI, File, UploadFile, Form, Request, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from typing import List, Optional  # NOTE(review): unused in this file
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, UnstructuredCSVLoader, UnstructuredExcelLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from langchain.chains import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import json
import os
import google.generativeai as genai
import re
import nest_asyncio
import nltk
from langchain.text_splitter import CharacterTextSplitter


app = FastAPI()
# Templates are looked up relative to the working directory (see Dockerfile WORKDIR).
templates = Jinja2Templates(directory="templates")

# nest_asyncio patches the running event loop so nested loops work; only
# applied in development (e.g. when run inside a notebook-style environment).
if os.getenv("FASTAPI_ENV") == "development":
    nest_asyncio.apply()


# Downloads NLTK data at import time; needed by the "unstructured" loaders.
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize  # NOTE(review): imported but never used

# Initialize your model and other variables.
# NOTE(review): module-level mutable state means all clients share one
# session (one uploaded file, one conversation history) -- confirm this
# single-user design is intended.
uploaded_file_path = None   # path of the most recently uploaded file
document_analyzed = False   # True once a summary has been produced
summary = None              # HTML-formatted summary shown in the template
question_responses = []     # list of (question, response) pairs for the history view
api = None                  # Gemini API key supplied via the upload form
llm = None                  # ChatGoogleGenerativeAI instance built per upload

# Disable all Gemini safety filters for generate_content calls.
safety_settings = [
    {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
]
46
+
def format_text(text: str) -> str:
    """Convert minimal Markdown to inline HTML.

    ``**bold**`` spans become ``<b>...</b>``; every remaining lone ``*``
    (Gemini's bullet marker) is turned into a ``<br>`` line break.
    """
    bolded = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
    return bolded.replace('*', '<br>')
51
+
52
+
53
+
# Route for main page
@app.get("/", response_class=HTMLResponse)
async def read_main(request: Request):
    """Render the landing page with whatever session state exists so far."""
    page_context = {
        "request": request,
        "summary": summary,
        "show_conversation": document_analyzed,
        "question_responses": question_responses,
    }
    return templates.TemplateResponse("analyze.html", page_context)
63
+
# Route for analyzing documents
@app.post("/", response_class=HTMLResponse)
async def analyze_document(
    request: Request,
    api_key: str = Form(...),
    iam: str = Form(...),
    context: str = Form(...),
    output: str = Form(...),
    summary_length: str = Form(...),
    file: UploadFile = File(...)
):
    """Save the uploaded file, summarize it with Gemini, and render the page.

    Documents (.pdf/.csv/.xlsx/.docx/.pptx) go through a LangChain
    "stuff" summarization chain; .mp3 files are uploaded straight to the
    Gemini file API and summarized there. The resulting HTML summary is
    stored in module globals and echoed back through analyze.html.
    """
    global uploaded_file_path, document_analyzed, summary, question_responses, api, llm
    loader = None

    try:
        # Initialize or update API key and models
        api = api_key
        genai.configure(api_key=api)
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key=api)

        # Save the uploaded file under a fixed name, keeping only the extension.
        # NOTE(review): a second concurrent upload overwrites the first.
        uploaded_file_path = "uploaded_file" + os.path.splitext(file.filename)[1]
        with open(uploaded_file_path, "wb") as f:
            f.write(file.file.read())

        # Determine the file type and load accordingly
        file_extension = os.path.splitext(uploaded_file_path)[1].lower()
        print(f"File extension: {file_extension}")  # Debugging statement

        if file_extension == ".pdf":
            loader = PyPDFLoader(uploaded_file_path)
        elif file_extension == ".csv":
            loader = UnstructuredCSVLoader(uploaded_file_path, mode="elements", encoding="utf8")
        elif file_extension == ".xlsx":
            loader = UnstructuredExcelLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".docx":
            loader = Docx2txtLoader(uploaded_file_path)
        elif file_extension == ".pptx":
            loader = UnstructuredPowerPointLoader(uploaded_file_path)
        elif file_extension == ".mp3":
            # Process audio files differently: send the file itself to Gemini
            # and return early with the generated summary.
            audio_file = genai.upload_file(path=uploaded_file_path)
            model = genai.GenerativeModel(model_name="gemini-1.5-flash")
            prompt = f"I am an {iam}. This file is about {context}. Answer the question based on this file: {output}. Write a {summary_length} concise summary."
            response = model.generate_content([prompt, audio_file], safety_settings=safety_settings)
            summary = format_text(response.text)
            document_analyzed = True
            outputs = {"summary": summary}
            with open("output_summary.json", "w") as outfile:
                json.dump(outputs, outfile)
            return templates.TemplateResponse("analyze.html", {
                "request": request,
                "summary": summary,
                "show_conversation": document_analyzed,
                "question_responses": question_responses
            })

        # If no loader is set, raise an exception
        # NOTE(review): HTTPException subclasses Exception, so the broad
        # `except` below converts this intended 400 into a 500.
        if loader is None:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        docs = loader.load()
        # NOTE(review): iam/context/output are interpolated via f-string, so any
        # user-supplied "{" or "}" becomes a template variable and will break
        # PromptTemplate -- confirm inputs are trusted. `{{text}}` survives as
        # the chain's {text} placeholder.
        prompt_template = PromptTemplate.from_template(
            f"I am an {iam}. This file is about {context}. Answer the question based on this file: {output}. Write a {summary_length} concise summary of the following text: {{text}}"
        )
        llm_chain = LLMChain(llm=llm, prompt=prompt_template)
        stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
        response = stuff_chain.invoke(docs)
        summary = format_text(response["output_text"])
        document_analyzed = True
        outputs = {"summary": summary}
        # NOTE(review): this branch writes "output.json" while the mp3 branch
        # and save_to_json() write "output_summary.json" -- likely unintended.
        with open("output.json", "w") as outfile:
            json.dump(outputs, outfile)
        return templates.TemplateResponse("analyze.html", {
            "request": request,
            "summary": summary,
            "show_conversation": document_analyzed,
            "question_responses": question_responses
        })

    except Exception as e:
        # NOTE(review): echoing the raw exception to the client may leak
        # internal details (paths, key errors).
        raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
146
+
# Route for asking questions
from langchain.text_splitter import CharacterTextSplitter  # Ensure this is imported
# NOTE(review): duplicate of the module-top import; harmless but redundant.

@app.post("/ask", response_class=HTMLResponse)
async def ask_question(request: Request, question: str = Form(...)):
    """Answer a follow-up question about the previously uploaded file.

    Re-loads the stored file, asks the LLM chunk by chunk, then runs a FAISS
    similarity search over the combined answer and shows the best match.
    The latest Q/A pair is persisted in a cookie to give the next question
    some conversational context.
    """
    global uploaded_file_path, question_responses, llm, api

    loader = None

    if uploaded_file_path:
        # Determine the file type and load accordingly
        file_extension = os.path.splitext(uploaded_file_path)[1].lower()
        if file_extension == ".pdf":
            loader = PyPDFLoader(uploaded_file_path)
        elif file_extension == ".csv":
            # NOTE(review): no encoding="utf8" here, unlike analyze_document.
            loader = UnstructuredCSVLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".xlsx":
            loader = UnstructuredExcelLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".docx":
            loader = Docx2txtLoader(uploaded_file_path)
        elif file_extension == ".pptx":
            loader = UnstructuredPowerPointLoader(uploaded_file_path)
        elif file_extension == ".mp3":
            # Audio path: ask Gemini about the file directly, then run the
            # answer through the same embedding/FAISS step as documents.
            audio_file = genai.upload_file(path=uploaded_file_path)
            model = genai.GenerativeModel(model_name="gemini-1.5-flash")
            latest_conversation = request.cookies.get("latest_question_response", "")
            prompt = "Answer the question based on the speech: " + question + (f" Latest conversation: {latest_conversation}" if latest_conversation else "")
            response = model.generate_content([prompt, audio_file], safety_settings=safety_settings)
            current_response = response.text
            current_question = f"You asked: {question}"

            # Save the latest question and response to the session
            # NOTE(review): this pair is appended AGAIN after the FAISS search
            # below, so each mp3 question shows up twice in the history --
            # confirm whether the double append is intended.
            question_responses.append((current_question, current_response))

            # Perform vector embedding and search
            text = current_response  # Use the summary generated from the MP3 content
            os.environ["GOOGLE_API_KEY"] = api
            embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
            summary_embedding = embeddings.embed_query(text)  # NOTE(review): unused
            document_search = FAISS.from_texts([text], embeddings)

            # NOTE(review): the index holds exactly one text, so the search
            # can only ever return that same text back.
            if document_search:
                query_embedding = embeddings.embed_query(question)
                results = document_search.similarity_search_by_vector(query_embedding, k=1)

                if results:
                    current_response = results[0].page_content
                else:
                    current_response = "No matching document found in the database."
            else:
                current_response = "Vector database not initialized."

            # Append the question and response from FAISS search
            question_responses.append((current_question, current_response))

            # Save all results including FAISS response to output.json
            save_to_json(summary, question_responses)

            # Save the latest question and response to the session
            response = templates.TemplateResponse("analyze.html", {"request": request, "summary": summary, "show_conversation": document_analyzed, "question_responses": question_responses})
            response.set_cookie(key="latest_question_response", value=current_response)
            return response

        # If no loader is set, raise an exception
        if loader is None:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        docs = loader.load()
        text = "\n".join([doc.page_content for doc in docs])
        os.environ["GOOGLE_API_KEY"] = api

        # Split the text into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_text(text)

        # Define the Summarize Chain for the question
        latest_conversation = request.cookies.get("latest_question_response", "")
        template1 = question + """ answer the question based on the following:
"{text}"
:""" + (f" Answer the Question with no more than 3 sentences. Latest conversation: {latest_conversation}" if latest_conversation else "")

        current_response = ""
        for chunk in chunks:
            # NOTE(review): template1.format(text=chunk) substitutes {text}
            # BEFORE building the PromptTemplate, so the later
            # invoke({"text": chunk}) has nothing to fill in, and a chunk
            # containing "{" or "}" will raise -- confirm intent.
            prompt1 = PromptTemplate.from_template(template1.format(text=chunk))
            # Initialize the LLMChain with the prompt
            llm_chain1 = LLMChain(llm=llm, prompt=prompt1)
            response1 = llm_chain1.invoke({"text": chunk})
            current_response += response1["text"] + "\n"

        # Generate embeddings for the combined responses
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        summary_embedding = embeddings.embed_query(current_response)  # NOTE(review): unused
        document_search = FAISS.from_texts([current_response], embeddings)

        # Perform a search on the FAISS vector database if it's initialized
        # NOTE(review): single-entry index; the search returns the combined
        # response itself (see mp3 branch note).
        if document_search:
            query_embedding = embeddings.embed_query(question)
            results = document_search.similarity_search_by_vector(query_embedding, k=1)

            if results:
                current_response = format_text(results[0].page_content)
            else:
                current_response = "No matching document found in the database."
        else:
            current_response = "Vector database not initialized."

        # Append the question and response from FAISS search
        current_question = f"You asked: {question}"
        question_responses.append((current_question, current_response))

        # Save all results to output.json
        save_to_json(summary, question_responses)

        # Save the latest question and response to the session
        response = templates.TemplateResponse("analyze.html", {"request": request, "summary": summary, "show_conversation": document_analyzed, "question_responses": question_responses})
        response.set_cookie(key="latest_question_response", value=current_response)
        return response
    else:
        raise HTTPException(status_code=400, detail="No file has been uploaded yet.")
266
+
267
+
268
+
def save_to_json(summary, question_responses):
    """Write the current summary and Q/A history to output_summary.json."""
    payload = {
        "summary": summary,
        "question_responses": question_responses,
    }
    with open("output_summary.json", "w") as fh:
        json.dump(payload, fh)
276
+
if __name__ == "__main__":
    # Local development entry point; the Dockerfile starts uvicorn directly
    # (binding 0.0.0.0:8000) instead of going through this guard.
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)