MagicDash committed on
Commit
ec29974
·
verified ·
1 Parent(s): 6a5e10b

Upload 4 files

Browse files
Files changed (4) hide show
  1. dockerfile +17 -0
  2. requirements.txt +19 -0
  3. templates/analyze.html +143 -0
  4. webapp.py +279 -0
dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy only the dependency manifest first so the expensive pip layer is
# cached and only rebuilt when requirements.txt itself changes, not on
# every source-code edit.
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application source into the container at /app
COPY . /app

# Expose port 8000
EXPOSE 8000

# Run the application using uvicorn
CMD ["uvicorn", "webapp:app", "--host", "0.0.0.0", "--port", "8000"]
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ google-generativeai
3
+ langchain-google-genai
4
+ langchain
5
+ pypdf
6
+ langchain-community
7
+ unstructured
8
+ openpyxl
9
+ docx2txt
10
+ python-magic
11
+ python-pptx
12
+ jinja2
13
+ nest-asyncio
14
+ faiss-cpu
15
+ tiktoken
16
+ networkx
17
+ pandas
18
+ uvicorn
19
+ python-multipart
templates/analyze.html ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<!-- Jinja2 template rendered by webapp.py. Context variables:
     summary (HTML string or None), show_conversation (bool),
     question_responses (list of (question, response) pairs). -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>File Analysis</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css" rel="stylesheet">
    <style>
        body {
            background-color: #f8f9fa;
            font-family: 'Arial', sans-serif;
        }
        .container {
            margin-top: 50px;
            margin-bottom: 50px;
            border-radius: 10px;
            background: white;
            padding: 30px;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1);
        }
        h2, h3 {
            color: #343a40;
            margin-bottom: 20px;
        }
        .form-control, .form-select {
            margin-bottom: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }
        .form-control:focus, .form-select:focus {
            border-color: #007bff;
            box-shadow: 0 0 5px rgba(0, 123, 255, 0.5);
        }
        .btn {
            border-radius: 8px;
            transition: background-color 0.3s ease, transform 0.2s ease;
        }
        .btn-primary {
            background-color: #0d6efd;
            border: none;
        }
        .btn-primary:hover {
            background-color: #0b5ed7;
            transform: translateY(-2px);
        }
        .btn-secondary {
            background-color: #6c757d;
            border: none;
        }
        .btn-secondary:hover {
            background-color: #5a6268;
            transform: translateY(-2px);
        }
        .summary {
            background-color: #ffffff;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
            margin-top: 20px;
        }
        .list-group-item {
            background-color: #ffffff;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            margin-bottom: 10px;
        }
        .list-group-item:hover {
            background-color: #f1f1f1;
        }
        .conversation-history {
            margin-top: 20px;
        }
    </style>
</head>
<body>

<div class="container">
    <h2 class="text-center">File Analysis</h2>

    {% if not summary %}
    <!-- Upload form is hidden once a summary exists for the session. -->
    <form action="/" method="post" enctype="multipart/form-data" class="bg-light p-4 border rounded shadow-sm">
        <h5>Upload File</h5>
        <!-- "required" added: the server declares file: UploadFile = File(...),
             so an empty submission would otherwise fail with a 422. -->
        <input type="file" name="file" accept=".pdf,.pptx,.csv,.xlsx,.mp3,.docx" class="form-control" required>

        <label>Select Summary Length:</label>
        <select name="summary_length" class="form-select">
            <option value="2 sentences">Short</option>
            <option value="5 sentences">Medium</option>
            <option value="10 sentences">Long</option>
        </select>
        <br>

        <label>Who are you?</label>
        <input type="text" name="iam" id="iam" class="form-control" required>

        <label>What's the document context about?</label>
        <input type="text" name="context" id="context" class="form-control" required>

        <label>Output Expectation (What you want to analyze?)</label>
        <input type="text" name="output" id="output" class="form-control" required>

        <label>Input your Google Gemini API Key</label>
        <input type="text" name="api_key" id="api_key" class="form-control">

        <input type="submit" value="Analyze" class="btn btn-primary mt-3">
    </form>
    {% endif %}

    {% if summary %}
    <div class="summary">
        <h3>Summary:</h3>
        <p>{{ summary|safe }}</p>

        {% if show_conversation %}
        <h3>Conversation</h3>
        <form action="/ask" method="post" class="mb-3">
            <input type="text" name="question" class="form-control" placeholder="Ask your question">
            <input type="submit" value="Ask" class="btn btn-secondary mt-2">
        </form>
        {% endif %}
    </div>
    {% endif %}

    {% if question_responses %}
    <br>
    <h3>Conversation History:</h3>
    <ul class="list-group conversation-history">
        {% for question, response in question_responses %}
        <li class="list-group-item">
            <strong>Question:</strong> {{ question }}<br>
            <strong>Response:</strong> {{ response|safe }}
        </li>
        {% endfor %}
    </ul>
    {% endif %}
</div>

<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<!-- Fixed: stackpath.bootstrapcdn.com does not host Bootstrap 5.3.0 (the old
     URL 404'd). The jsdelivr bundle also ships Popper, so no separate
     popper.min.js script is needed. -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
</body>
</html>
webapp.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# webapp.py -- FastAPI front end that summarizes uploaded documents with
# Google Gemini via LangChain and answers follow-up questions about them.
from fastapi import FastAPI, File, UploadFile, Form, Request, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from typing import List, Optional  # NOTE(review): unused in this file
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, UnstructuredCSVLoader, UnstructuredExcelLoader, Docx2txtLoader, UnstructuredPowerPointLoader
from langchain.chains import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import json
import os
import google.generativeai as genai
import re
import nest_asyncio
import nltk
from langchain.text_splitter import CharacterTextSplitter


app = FastAPI()
# Templates are looked up relative to the working directory (see Dockerfile WORKDIR).
templates = Jinja2Templates(directory="templates")

# nest_asyncio patches the running event loop so nested loops work; only
# applied in development (e.g. when run inside a notebook-style environment).
if os.getenv("FASTAPI_ENV") == "development":
    nest_asyncio.apply()


# Downloads NLTK data at import time; needed by the "unstructured" loaders.
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize  # NOTE(review): imported but never used

# Initialize your model and other variables.
# NOTE(review): module-level mutable state means all clients share one
# session (one uploaded file, one conversation history) -- confirm this
# single-user design is intended.
uploaded_file_path = None   # path of the most recently uploaded file
document_analyzed = False   # True once a summary has been produced
summary = None              # HTML-formatted summary shown in the template
question_responses = []     # list of (question, response) pairs for the history view
api = None                  # Gemini API key supplied via the upload form
llm = None                  # ChatGoogleGenerativeAI instance built per upload

# Disable all Gemini safety filters for generate_content calls.
safety_settings = [
    {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
]
46
+
def format_text(text: str) -> str:
    """Convert minimal Markdown to inline HTML.

    ``**bold**`` spans become ``<b>...</b>``; every remaining lone ``*``
    (Gemini's bullet marker) is turned into a ``<br>`` line break.
    """
    bolded = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
    return bolded.replace('*', '<br>')
51
+
52
+
53
+
# Route for main page
@app.get("/", response_class=HTMLResponse)
async def read_main(request: Request):
    """Render the landing page with whatever session state exists so far."""
    page_context = {
        "request": request,
        "summary": summary,
        "show_conversation": document_analyzed,
        "question_responses": question_responses,
    }
    return templates.TemplateResponse("analyze.html", page_context)
63
+
# Route for analyzing documents
@app.post("/", response_class=HTMLResponse)
async def analyze_document(
    request: Request,
    api_key: str = Form(...),
    iam: str = Form(...),
    context: str = Form(...),
    output: str = Form(...),
    summary_length: str = Form(...),
    file: UploadFile = File(...)
):
    """Save the uploaded file, summarize it with Gemini, and render the page.

    Documents (.pdf/.csv/.xlsx/.docx/.pptx) go through a LangChain
    "stuff" summarization chain; .mp3 files are uploaded straight to the
    Gemini file API and summarized there. The resulting HTML summary is
    stored in module globals and echoed back through analyze.html.
    """
    global uploaded_file_path, document_analyzed, summary, question_responses, api, llm
    loader = None

    try:
        # Initialize or update API key and models
        api = api_key
        genai.configure(api_key=api)
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key=api)

        # Save the uploaded file under a fixed name, keeping only the extension.
        # NOTE(review): a second concurrent upload overwrites the first.
        uploaded_file_path = "uploaded_file" + os.path.splitext(file.filename)[1]
        with open(uploaded_file_path, "wb") as f:
            f.write(file.file.read())

        # Determine the file type and load accordingly
        file_extension = os.path.splitext(uploaded_file_path)[1].lower()
        print(f"File extension: {file_extension}")  # Debugging statement

        if file_extension == ".pdf":
            loader = PyPDFLoader(uploaded_file_path)
        elif file_extension == ".csv":
            loader = UnstructuredCSVLoader(uploaded_file_path, mode="elements", encoding="utf8")
        elif file_extension == ".xlsx":
            loader = UnstructuredExcelLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".docx":
            loader = Docx2txtLoader(uploaded_file_path)
        elif file_extension == ".pptx":
            loader = UnstructuredPowerPointLoader(uploaded_file_path)
        elif file_extension == ".mp3":
            # Process audio files differently: send the file itself to Gemini
            # and return early with the generated summary.
            audio_file = genai.upload_file(path=uploaded_file_path)
            model = genai.GenerativeModel(model_name="gemini-1.5-flash")
            prompt = f"I am an {iam}. This file is about {context}. Answer the question based on this file: {output}. Write a {summary_length} concise summary."
            response = model.generate_content([prompt, audio_file], safety_settings=safety_settings)
            summary = format_text(response.text)
            document_analyzed = True
            outputs = {"summary": summary}
            with open("output_summary.json", "w") as outfile:
                json.dump(outputs, outfile)
            return templates.TemplateResponse("analyze.html", {
                "request": request,
                "summary": summary,
                "show_conversation": document_analyzed,
                "question_responses": question_responses
            })

        # If no loader is set, raise an exception
        # NOTE(review): HTTPException subclasses Exception, so the broad
        # `except` below converts this intended 400 into a 500.
        if loader is None:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        docs = loader.load()
        # NOTE(review): iam/context/output are interpolated via f-string, so any
        # user-supplied "{" or "}" becomes a template variable and will break
        # PromptTemplate -- confirm inputs are trusted. `{{text}}` survives as
        # the chain's {text} placeholder.
        prompt_template = PromptTemplate.from_template(
            f"I am an {iam}. This file is about {context}. Answer the question based on this file: {output}. Write a {summary_length} concise summary of the following text: {{text}}"
        )
        llm_chain = LLMChain(llm=llm, prompt=prompt_template)
        stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
        response = stuff_chain.invoke(docs)
        summary = format_text(response["output_text"])
        document_analyzed = True
        outputs = {"summary": summary}
        # NOTE(review): this branch writes "output.json" while the mp3 branch
        # and save_to_json() write "output_summary.json" -- likely unintended.
        with open("output.json", "w") as outfile:
            json.dump(outputs, outfile)
        return templates.TemplateResponse("analyze.html", {
            "request": request,
            "summary": summary,
            "show_conversation": document_analyzed,
            "question_responses": question_responses
        })

    except Exception as e:
        # NOTE(review): echoing the raw exception to the client may leak
        # internal details (paths, key errors).
        raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
146
+
# Route for asking questions
from langchain.text_splitter import CharacterTextSplitter  # Ensure this is imported
# NOTE(review): duplicate of the module-top import; harmless but redundant.

@app.post("/ask", response_class=HTMLResponse)
async def ask_question(request: Request, question: str = Form(...)):
    """Answer a follow-up question about the previously uploaded file.

    Re-loads the stored file, asks the LLM chunk by chunk, then runs a FAISS
    similarity search over the combined answer and shows the best match.
    The latest Q/A pair is persisted in a cookie to give the next question
    some conversational context.
    """
    global uploaded_file_path, question_responses, llm, api

    loader = None

    if uploaded_file_path:
        # Determine the file type and load accordingly
        file_extension = os.path.splitext(uploaded_file_path)[1].lower()
        if file_extension == ".pdf":
            loader = PyPDFLoader(uploaded_file_path)
        elif file_extension == ".csv":
            # NOTE(review): no encoding="utf8" here, unlike analyze_document.
            loader = UnstructuredCSVLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".xlsx":
            loader = UnstructuredExcelLoader(uploaded_file_path, mode="elements")
        elif file_extension == ".docx":
            loader = Docx2txtLoader(uploaded_file_path)
        elif file_extension == ".pptx":
            loader = UnstructuredPowerPointLoader(uploaded_file_path)
        elif file_extension == ".mp3":
            # Audio path: ask Gemini about the file directly, then run the
            # answer through the same embedding/FAISS step as documents.
            audio_file = genai.upload_file(path=uploaded_file_path)
            model = genai.GenerativeModel(model_name="gemini-1.5-flash")
            latest_conversation = request.cookies.get("latest_question_response", "")
            prompt = "Answer the question based on the speech: " + question + (f" Latest conversation: {latest_conversation}" if latest_conversation else "")
            response = model.generate_content([prompt, audio_file], safety_settings=safety_settings)
            current_response = response.text
            current_question = f"You asked: {question}"

            # Save the latest question and response to the session
            # NOTE(review): this pair is appended AGAIN after the FAISS search
            # below, so each mp3 question shows up twice in the history --
            # confirm whether the double append is intended.
            question_responses.append((current_question, current_response))

            # Perform vector embedding and search
            text = current_response  # Use the summary generated from the MP3 content
            os.environ["GOOGLE_API_KEY"] = api
            embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
            summary_embedding = embeddings.embed_query(text)  # NOTE(review): unused
            document_search = FAISS.from_texts([text], embeddings)

            # NOTE(review): the index holds exactly one text, so the search
            # can only ever return that same text back.
            if document_search:
                query_embedding = embeddings.embed_query(question)
                results = document_search.similarity_search_by_vector(query_embedding, k=1)

                if results:
                    current_response = results[0].page_content
                else:
                    current_response = "No matching document found in the database."
            else:
                current_response = "Vector database not initialized."

            # Append the question and response from FAISS search
            question_responses.append((current_question, current_response))

            # Save all results including FAISS response to output.json
            save_to_json(summary, question_responses)

            # Save the latest question and response to the session
            response = templates.TemplateResponse("analyze.html", {"request": request, "summary": summary, "show_conversation": document_analyzed, "question_responses": question_responses})
            response.set_cookie(key="latest_question_response", value=current_response)
            return response

        # If no loader is set, raise an exception
        if loader is None:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        docs = loader.load()
        text = "\n".join([doc.page_content for doc in docs])
        os.environ["GOOGLE_API_KEY"] = api

        # Split the text into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_text(text)

        # Define the Summarize Chain for the question
        latest_conversation = request.cookies.get("latest_question_response", "")
        template1 = question + """ answer the question based on the following:
"{text}"
:""" + (f" Answer the Question with no more than 3 sentences. Latest conversation: {latest_conversation}" if latest_conversation else "")

        current_response = ""
        for chunk in chunks:
            # NOTE(review): template1.format(text=chunk) substitutes {text}
            # BEFORE building the PromptTemplate, so the later
            # invoke({"text": chunk}) has nothing to fill in, and a chunk
            # containing "{" or "}" will raise -- confirm intent.
            prompt1 = PromptTemplate.from_template(template1.format(text=chunk))
            # Initialize the LLMChain with the prompt
            llm_chain1 = LLMChain(llm=llm, prompt=prompt1)
            response1 = llm_chain1.invoke({"text": chunk})
            current_response += response1["text"] + "\n"

        # Generate embeddings for the combined responses
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        summary_embedding = embeddings.embed_query(current_response)  # NOTE(review): unused
        document_search = FAISS.from_texts([current_response], embeddings)

        # Perform a search on the FAISS vector database if it's initialized
        # NOTE(review): single-entry index; the search returns the combined
        # response itself (see mp3 branch note).
        if document_search:
            query_embedding = embeddings.embed_query(question)
            results = document_search.similarity_search_by_vector(query_embedding, k=1)

            if results:
                current_response = format_text(results[0].page_content)
            else:
                current_response = "No matching document found in the database."
        else:
            current_response = "Vector database not initialized."

        # Append the question and response from FAISS search
        current_question = f"You asked: {question}"
        question_responses.append((current_question, current_response))

        # Save all results to output.json
        save_to_json(summary, question_responses)

        # Save the latest question and response to the session
        response = templates.TemplateResponse("analyze.html", {"request": request, "summary": summary, "show_conversation": document_analyzed, "question_responses": question_responses})
        response.set_cookie(key="latest_question_response", value=current_response)
        return response
    else:
        raise HTTPException(status_code=400, detail="No file has been uploaded yet.")
266
+
267
+
268
+
def save_to_json(summary, question_responses):
    """Write the current summary and Q/A history to output_summary.json."""
    payload = {
        "summary": summary,
        "question_responses": question_responses,
    }
    with open("output_summary.json", "w") as fh:
        json.dump(payload, fh)
276
+
if __name__ == "__main__":
    # Local development entry point; the Dockerfile starts uvicorn directly
    # (binding 0.0.0.0:8000) instead of going through this guard.
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8000)