acecalisto3 committed
Commit b26edd3 · verified · 1 Parent(s): 82241ae

Update app.py

Files changed (1)
  1. app.py +235 -319
app.py CHANGED
@@ -1,335 +1,251 @@
  import os
  import requests
  import uuid
- from huggingface_hub import InferenceClient, HfApi
- from pypdf import PdfReader
- from bs4 import BeautifulSoup
  import datetime
  import zipfile
- import nltk
- import nltk.data
- import nltk.downloader # Import the downloader explicitly
  import tempfile
  import shutil
- import time # Import time for optional delay
- import secrets # For generating session-specific keys (conceptual)
- from hashlib import sha256 # For checksum verification (conceptual)
-
- VERBOSE = True
-
- def log(message):
-     if VERBOSE:
-         print(f"[LOG] {datetime.datetime.now()} - {message}")
-
- # Hugging Face API Initialization
- HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- HF_TOKEN = os.environ.get('HF_TOKEN')
- if not HF_TOKEN:
-     raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
- try:
-     client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
-     api = HfApi(token=HF_TOKEN)
-     log("Initialized Hugging Face client and API.")
- except Exception as e:
-     log(f"Error initializing Hugging Face client: {e}")
-     exit(1)
-
- REPO_NAME = "acecalisto3/tmp"
- DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
-
- # Constants
- MAX_TOKENS = 8192
-
- # Utility Functions
- def generate_session_key():
-     """Generates a unique session key (conceptual for ephemeral session)."""
-     return secrets.token_hex(16)
-
- def verify_checksum(data):
-     """Generates a SHA-256 checksum of the data (conceptual for integrity)."""
-     return sha256(data.encode()).hexdigest()
-
- def get_file_id_from_google_drive_url(url):
-     if "drive.google.com" in url and "file/d/" in url:
-         parts = url.split("/file/d/")
-         if len(parts) < 2:
-             return None
-         file_id = parts[1].split("/")[0].split("?")[0]
-         return file_id
-     return None
-
- def download_google_drive_file(file_id, session_id):
-     """Downloads a Google Drive file to a temporary session-specific directory."""
-     download_url = f"https://drive.google.com/uc?id={file_id}"
-     temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-     os.makedirs(temp_session_dir, exist_ok=True)
-     try:
-         response = requests.get(download_url, stream=True)
-         response.raise_for_status()
-         content_disposition = response.headers.get('Content-Disposition')
-         if content_disposition:
-             filename = content_disposition.split("filename=")[1].strip('"')
-         else:
-             filename = f"file_{uuid.uuid4()}"
-         file_path = os.path.join(temp_session_dir, filename)
-         with open(file_path, "wb") as f:
-             for chunk in response.iter_content(chunk_size=8192):
-                 f.write(chunk)
-         return file_path, temp_session_dir
-     except Exception as e:
-         log(f"Error downloading Google Drive file {file_id}: {e}")
-         # Clean up the session directory on error
-         if os.path.exists(temp_session_dir):
-             shutil.rmtree(temp_session_dir)
-         return None, None
-
- def read_pdf(file_path):
-     try:
-         reader = PdfReader(file_path)
-         text = "\n".join(page.extract_text() for page in reader.pages)
-         return text
-     except Exception as e:
-         log(f"Error reading PDF {file_path}: {e}")
-         return ""
-
- def read_txt(txt_path):
-     try:
-         with open(txt_path, "r", encoding="utf-8") as f:
-             return f.read()
-     except Exception as e:
-         log(f"Error reading TXT file {txt_path}: {e}")
-         return ""
-
- def read_zip(zip_path, session_id):
-     """Reads content from a ZIP file within the temporary session directory."""
-     extracted_data = []
-     temp_extract_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}_extract")
-     os.makedirs(temp_extract_dir, exist_ok=True)
-     try:
-         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-             for file_info in zip_ref.infolist():
-                 if file_info.filename.endswith((".txt", ".pdf")):
-                     with zip_ref.open(file_info) as file:
-                         content = file.read()
-                         temp_file_path = os.path.join(temp_extract_dir, file_info.filename)
-                         with open(temp_file_path, "wb") as temp_file:
-                             temp_file.write(content)
-                         if file_info.filename.endswith(".txt"):
-                             extracted_data.append(read_txt(temp_file_path))
-                         elif file_info.filename.endswith(".pdf"):
-                             extracted_data.append(read_pdf(temp_file_path))
-         return "\n".join(extracted_data)
-     except Exception as e:
-         log(f"Error reading ZIP file {zip_path}: {e}")
-         return ""
-     finally:
-         shutil.rmtree(temp_extract_dir, ignore_errors=True)

- def fetch_google_doc(url):
-     if "docs.google.com/document/d/" in url:
-         # Extract document ID
-         doc_id = url.split("/d/")[1].split("/")[0]
-         # Construct export URL for plain text
-         export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
          try:
-             response = requests.get(export_url)
-             response.raise_for_status()
-             return response.text
-         except requests.exceptions.HTTPError as e:
-             log(f"Error fetching Google Doc: {e}")
-             return None
-     else:
-         return None
-
- def fetch_url(url, max_depth, session_id):
-     visited = set()
-     to_visit = [(url, 0)]
-     results = []
-     errors = []
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
-     }
-     temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-     os.makedirs(temp_session_dir, exist_ok=True)
-     try:
-         while to_visit:
-             current_url, depth = to_visit.pop(0)
-             if current_url in visited:
-                 continue
-             if depth < max_depth:
-                 visited.add(current_url)
-                 # Check if it's a Google Drive file URL
-                 if "drive.google.com/file/d/" in current_url:
-                     file_id = get_file_id_from_google_drive_url(current_url)
-                     if file_id:
-                         file_path, temp_dir = download_google_drive_file(file_id, session_id)
-                         if file_path:
-                             file_ext = os.path.splitext(file_path)[1].lower()
-                             if file_ext == ".pdf":
-                                 pdf_text = read_pdf(file_path)
-                                 results.append(pdf_text)
-                             elif file_ext == ".txt":
-                                 txt_content = read_txt(file_path)
-                                 results.append(txt_content)
-                             elif file_ext == ".zip":
-                                 zip_content = read_zip(file_path, session_id)
-                                 results.append(zip_content)
-                             else:
-                                 errors.append(f"Unsupported file type for URL: {current_url}")
-                             # Clean up the downloaded file, but keep the session dir for other files
-                             if temp_dir and os.path.exists(file_path):
-                                 os.remove(file_path)
-                         else:
-                             errors.append(f"Failed to download file from URL: {current_url}")
-                     else:
-                         errors.append(f"Invalid Google Drive URL: {current_url}")
-                 # Check if it's a Google Doc URL
-                 elif "docs.google.com/document/d/" in current_url:
-                     doc_content = fetch_google_doc(current_url)
-                     if doc_content:
-                         results.append(doc_content)
-                     else:
-                         errors.append(f"Failed to fetch Google Doc: {current_url}")
-                 else:
-                     try:
-                         response = requests.get(current_url, headers=headers, timeout=10)
-                         response.raise_for_status()
-                         soup = BeautifulSoup(response.content, 'html.parser')
-                         results.append(soup.get_text())
-                         for link in soup.find_all("a", href=True):
-                             absolute_url = requests.compat.urljoin(current_url, link.get('href'))
-                             if absolute_url.startswith("http") and absolute_url not in visited:
-                                 to_visit.append((absolute_url, depth + 1))
-                         # Optional: Introduce a delay between requests
-                         # time.sleep(1)
-                     except Exception as e:
-                         log(f"Error fetching {current_url}: {e}")
-                         errors.append(f"Error fetching {current_url}: {e}")
-     finally:
-         # Clean up the temporary session directory after processing URLs
-         shutil.rmtree(temp_session_dir, ignore_errors=True)
-     return "\n".join(results), "\n".join(errors)
-
- def process_file(file, session_id):
-     """Processes an uploaded file within the temporary session directory."""
-     temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-     os.makedirs(temp_session_dir, exist_ok=True)
-     file_path = os.path.join(temp_session_dir, file.name)
-     try:
-         # Save the uploaded file to the temporary session directory
-         with open(file_path, "wb") as f:
-             f.write(file.read())
-         if file.name.endswith(".pdf"):
-             return read_pdf(file_path)
-         elif file.name.endswith(".txt"):
-             return read_txt(file_path)
-         elif file.name.endswith(".zip"):
-             return read_zip(file_path, session_id)
-     except Exception as e:
-         log(f"Error processing file {file.name}: {e}")
-         return ""
-     finally:
-         # The temporary session directory will be cleaned up after the workflow
-         pass
-
- def chunk_text(text, max_chunk_size):
-     tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
-     sentences = tokenizer.tokenize(text)
-     chunks = []
-     current_chunk = ""
-     for sentence in sentences:
-         if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
-             chunks.append(current_chunk.strip())
-             current_chunk = ""
-         current_chunk += sentence + " "
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-     return chunks
-
- def extract_dataset(data, instructions="Extract {history}", max_tokens=MAX_TOKENS):
-     extracted = []
-     chunks = chunk_text(data, max_chunk_size=20000) # Adjusted size
-     for i, chunk in enumerate(chunks):
          try:
-             prompt = instructions.format(history=chunk)
-             response = client.text_generation(
-                 prompt=prompt,
-                 max_new_tokens=max_tokens
-             )
-             extracted.append(response.choices[0].text)
          except Exception as e:
-             log(f"Error processing chunk {i+1}: {e}")
-             extracted.append(f"Error processing chunk {i+1}: {e}")
-     return "\n".join(extracted)
-
- def combine_datasets(datasets):
-     return "\n".join(datasets)
-
- # Gradio App Interface
- import gradio as gr # Ensure you import gradio
-
- with gr.Blocks() as app:
-     session_id = gr.State(generate_session_key) # Unique session ID for each user
-     gr.Markdown(
-         "**Dataset Generator and Flash Chatbot**: Upload files, scrape data from URLs, or enter text to generate datasets and interact with a chatbot."
-     )
-     chatbot = gr.Chatbot(label="Flash Trained Chatbot")
-     command_selector = gr.Dropdown(
-         label="Select Command",
-         choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
-         value="Scrape Data"
-     )
-     data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
-     file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
-     url_input = gr.Textbox(label="URL")
-     depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
-     output_json = gr.JSON(label="Output Dataset")
-     error_output = gr.Textbox(label="Error Log", interactive=False)
-     process_button = gr.Button("Process")
-
-     def process_workflow(session_id, command, data, files, url, depth):
-         datasets = []
-         errors = []
-         temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-         os.makedirs(temp_session_dir, exist_ok=True)
          try:
-             if data:
-                 datasets.append(data)
              if files:
-                 for file in files:
-                     file_data = process_file(file, session_id)
-                     if file_data:
-                         datasets.append(file_data)
                      else:
-                         errors.append(f"Failed to process file: {file.name}")
-             if url:
-                 url_data, fetch_errors = fetch_url(url, depth, session_id)
-                 if url_data:
-                     datasets.append(url_data)
-                 else:
-                     errors.append(f"Failed to fetch data from URL: {url}")
-                 if fetch_errors:
-                     errors.append(fetch_errors)

-             if command == "Extract Dataset":
-                 extracted_data = extract_dataset("\n".join(datasets))
-                 return session_id, {"datasets": [extracted_data]}, "\n".join(errors)
-             elif command == "Combine Datasets":
-                 combined_data = combine_datasets(datasets)
-                 return session_id, {"datasets": [combined_data]}, "\n".join(errors)
-             else:
-                 return session_id, {"datasets": datasets}, "\n".join(errors)
          except Exception as e:
-             errors.append(str(e))
-             return session_id, {"datasets": []}, "\n".join(errors)
-         finally:
-             # Clean up the temporary session directory after the workflow
-             shutil.rmtree(temp_session_dir, ignore_errors=True)
-
-     process_button.click(
-         process_workflow,
-         inputs=[session_id, command_selector, data_input, file_upload, url_input, depth_slider],
-         outputs=[session_id, output_json, error_output]
-     )
-
- app.launch()
+ # app.py
+
  import os
  import requests
  import uuid
  import datetime
  import zipfile
  import tempfile
  import shutil
+ import secrets
+ import time
+ import json
+ from typing import List, Tuple, Any, Dict
+
+ # Third-party libraries
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+ from pypdf import PdfReader
+ from bs4 import BeautifulSoup
+ import nltk

+ # Local imports from the enhanced prompt library
+ from agent_prompt import PromptLibrary, SystemAuditor
+
+ # --- CONFIGURATION ---
+ class Config:
+     """Centralized configuration for the Maestro application."""
+     HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
+     HF_TOKEN = os.getenv("HF_TOKEN")
+     VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
+     MAX_NEW_TOKENS_REPORT = 4096
+     MAX_NEW_TOKENS_CHAT = 1024
+     REQUESTS_TIMEOUT = 15
+     USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+
+ # --- UTILITIES ---
+ def log(message: str) -> None:
+     if Config.VERBOSE:
+         print(f"[LOG] {datetime.datetime.now(datetime.timezone.utc).isoformat()} - {message}")
+
+ class SessionManager:
+     """A context manager for creating and cleaning up session-specific temporary directories."""
+     def __init__(self, session_id: str):
+         self.session_id = session_id
+         self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
+
+     def __enter__(self) -> str:
+         os.makedirs(self.temp_dir, exist_ok=True)
+         log(f"Session '{self.session_id}' started. Temp dir: {self.temp_dir}")
+         return self.temp_dir
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         shutil.rmtree(self.temp_dir, ignore_errors=True)
+         log(f"Session '{self.session_id}' ended. Temp dir cleaned up.")
+
+ # --- CORE APPLICATION ENGINE ---
+ class MaestroEngine:
+     """Handles all data processing and LLM interaction logic."""
+     def __init__(self):
+         self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
          try:
+             nltk.data.find("tokenizers/punkt")
+         except LookupError:
+             log("Downloading NLTK 'punkt' tokenizer...")
+             nltk.download('punkt', quiet=True)
+         log("MaestroEngine initialized.")
+
+     def _read_pdf(self, file_path: str) -> str:
+         # (Implementation for reading PDF)
          try:
+             reader = PdfReader(file_path)
+             return "\n".join(page.extract_text() or "" for page in reader.pages)
          except Exception as e:
+             log(f"Error reading PDF {os.path.basename(file_path)}: {e}")
+             return f"Error reading PDF: {e}"
+
+     def _process_zip(self, zip_path: str, temp_dir: str) -> str:
+         # (Implementation for processing ZIP)
+         extracted_texts = []
+         extract_path = os.path.join(temp_dir, "zip_extract")
+         os.makedirs(extract_path, exist_ok=True)
          try:
+             with zipfile.ZipFile(zip_path, 'r') as zf:
+                 for member in zf.infolist():
+                     if member.filename.endswith('.pdf'):
+                         zf.extract(member, extract_path)
+                         extracted_texts.append(self._read_pdf(os.path.join(extract_path, member.filename)))
+                     elif member.filename.endswith('.txt'):
+                         extracted_texts.append(zf.read(member).decode('utf-8', errors='ignore'))
+             return "\n\n".join(extracted_texts)
+         except Exception as e:
+             log(f"Error processing ZIP {os.path.basename(zip_path)}: {e}")
+             return f"Error processing ZIP: {e}"
+
+     def process_data_sources(self, session_id: str, url: str, text: str, files: List[Any]) -> Tuple[str, List[str]]:
+         """Orchestrates data ingestion from all provided sources."""
+         all_content, errors = [], []
+         with SessionManager(session_id) as temp_dir:
+             if url:
+                 try:
+                     response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
+                     response.raise_for_status()
+                     all_content.append(BeautifulSoup(response.content, 'html.parser').get_text(separator="\n", strip=True))
+                 except Exception as e:
+                     errors.append(f"URL Fetch Error: {e}")
+             if text:
+                 all_content.append(text)
              if files:
+                 for file_obj in files:
+                     file_path = os.path.join(temp_dir, os.path.basename(file_obj.name))
+                     with open(file_path, "wb") as f:
+                         shutil.copyfileobj(file_obj, f)
+                     ext = os.path.splitext(file_obj.name)[1].lower()
+                     if ext == '.pdf':
+                         all_content.append(self._read_pdf(file_path))
+                     elif ext == '.txt':
+                         all_content.append(open(file_path, 'r', encoding='utf-8').read())
+                     elif ext == '.zip':
+                         all_content.append(self._process_zip(file_path, temp_dir))
                      else:
+                         errors.append(f"Unsupported file type: {file_obj.name}")
+         return "\n\n---\n\n".join(all_content), errors

+     def _query_llm(self, prompt: str, max_tokens: int) -> str:
+         try:
+             response = self.client.text_generation(prompt, max_new_tokens=max_tokens, temperature=0.7, top_p=0.95)
+             return response.strip()
          except Exception as e:
+             log(f"LLM query failed: {e}")
+             return f"Error communicating with the model: {e}"
+
+     def run_rag_query(self, query: str, context: str) -> str:
+         prompt = f"Context:\n---\n{context}\n---\nBased only on the context provided, answer the following question:\nQuestion: {query}"
+         return self._query_llm(prompt, Config.MAX_NEW_TOKENS_CHAT)
+
+     def generate_report(self, report_type: str, context: str, objective: str) -> str:
+         if report_type == "Narrative Prose Report":
+             prompt = PromptLibrary.NARRATIVE_PROSE_REPORT.format(
+                 task_objective=objective,
+                 knowledge_base=context
+             )
+             return self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
+         elif report_type == "Technical JSON Report":
+             prompt = PromptLibrary.TECHNICAL_JSON_REPORT.format(
+                 task_objective=objective,
+                 baseline_knowledge="Previously established facts",
+                 new_information=context
+             )
+             raw_response = self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
+             # Clean up potential markdown code fences for JSON parsing
+             clean_json_str = raw_response.replace("```json", "").replace("```", "").strip()
+             try:
+                 # Validate and reformat for pretty printing
+                 return json.dumps(json.loads(clean_json_str), indent=2)
+             except json.JSONDecodeError:
+                 log(f"Failed to parse LLM response as JSON. Raw response: {raw_response}")
+                 return '{"error": "The model did not return valid JSON.", "raw_response": ' + json.dumps(raw_response) + '}'
+         return "Invalid report type selected."
+
+ # --- GRADIO APPLICATION ---
+ class GradioApp:
+     """Manages the Gradio UI and application workflow."""
+     def __init__(self, engine: MaestroEngine):
+         self.engine = engine
+         self.app = self._build_ui()
+
+     def _build_ui(self) -> gr.Blocks:
+         with gr.Blocks(theme=gr.themes.Monochrome(), title="Maestro AI Engine") as app:
+             # State management
+             session_id = gr.State(lambda: secrets.token_hex(16))
+             auditor = gr.State(lambda s_id: SystemAuditor(session_id=s_id), session_id)
+             processed_data = gr.State("")
+
+             gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
+
+             with gr.Tabs():
+                 with gr.TabItem("① Data Ingestion"):
+                     with gr.Row():
+                         with gr.Column():
+                             url_input = gr.Textbox(label="Scrape from URL")
+                             text_input = gr.Textbox(label="Paste Text", lines=10)
+                             file_upload = gr.Files(label="Upload Files (.pdf, .txt, .zip)", type="file")
+                             process_button = gr.Button("🚀 Process All Sources", variant="primary")
+                     ingestion_summary = gr.Textbox(label="Ingestion Summary", interactive=False)
+                     error_log = gr.Textbox(label="Errors", interactive=False)
+
+                 with gr.TabItem("② Reporting & Synthesis"):
+                     report_objective = gr.Textbox(label="Report Objective", placeholder="e.g., 'Synthesize findings on AI in agriculture'")
+                     report_type = gr.Dropdown(label="Select Report Type", choices=["Narrative Prose Report", "Technical JSON Report"])
+                     generate_button = gr.Button("Generate Report", variant="primary")
+                     with gr.Tabs():
+                         with gr.TabItem("Narrative Output"):
+                             report_output_md = gr.Markdown()
+                         with gr.TabItem("JSON Output"):
+                             report_output_json = gr.JSON()
+
+                 with gr.TabItem("③ Direct Chat Q&A"):
+                     chatbot = gr.Chatbot(label="Chat Interface", height=550)
+                     msg_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the processed data...")
+                     msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data], [msg_input, chatbot])
+
+             # --- Workflow Connections ---
+             process_button.click(self._ingest_workflow, [session_id, url_input, text_input, file_upload], [processed_data, ingestion_summary, error_log])
+             generate_button.click(self._reporting_workflow, [auditor, report_type, processed_data, report_objective], [report_output_md, report_output_json])
+
+         return app
+
+     def _ingest_workflow(self, s_id, url, text, files):
+         log(f"Starting ingestion for session {s_id}...")
+         data, errors = self.engine.process_data_sources(s_id, url, text, files)
+         summary = f"Processing complete. {len(data)} characters ingested. {len(errors)} errors encountered."
+         return data, summary, "\n".join(errors)
+
+     def _chat_workflow(self, message, history, context):
+         if not context:
+             history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
+             return "", history
+         response = self.engine.run_rag_query(message, context)
+         history.append((message, response))
+         return "", history
+
+     def _reporting_workflow(self, auditor_instance, r_type, context, objective):
+         if not context:
+             md_out = "### Error: No data has been ingested. Please process data in Tab 1 first."
+             return md_out, None
+
+         start_time = time.time()
+         response = self.engine.generate_report(r_type, context, objective)
+         latency = (time.time() - start_time) * 1000
+
+         log(auditor_instance.format_response_log(response, latency, 1, 0.95)) # Log the event
+
+         if r_type == "Narrative Prose Report":
+             return response, None
+         elif r_type == "Technical JSON Report":
+             # The engine already returns a JSON string or an error object
+             return None, json.loads(response)
+
+     def launch(self):
+         self.app.launch(debug=Config.VERBOSE)
+
+ # --- MAIN EXECUTION BLOCK ---
+ if __name__ == "__main__":
+     if not Config.HF_TOKEN:
+         print("FATAL: Hugging Face token (HF_TOKEN) not found in environment variables.")
+         print("Please set your token, e.g., `export HF_TOKEN='hf_...'`")
+     else:
+         log("Instantiating Maestro Engine and launching Gradio App...")
+         maestro_engine = MaestroEngine()
+         gradio_app = GradioApp(engine=maestro_engine)
+         gradio_app.launch()
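
Note: the new app.py imports PromptLibrary and SystemAuditor from a local agent_prompt module that is not part of this diff. To run the file outside the Space, a minimal stand-in is sketched below; it only mirrors the attributes and call signatures used above (the two prompt templates with their format fields, and format_response_log(response, latency, turn, confidence)), and the real module in this repository may look quite different.

# agent_prompt.py — hypothetical local stub, for exercising app.py only.
import datetime

class PromptLibrary:
    # Template fields match the .format(...) calls in MaestroEngine.generate_report.
    NARRATIVE_PROSE_REPORT = (
        "Objective: {task_objective}\n"
        "Knowledge base:\n{knowledge_base}\n"
        "Write a narrative prose report that addresses the objective."
    )
    TECHNICAL_JSON_REPORT = (
        "Objective: {task_objective}\n"
        "Baseline knowledge: {baseline_knowledge}\n"
        "New information:\n{new_information}\n"
        "Respond with a single JSON object summarising the findings."
    )

class SystemAuditor:
    def __init__(self, session_id: str):
        self.session_id = session_id

    def format_response_log(self, response: str, latency_ms: float, turn: int, confidence: float) -> str:
        # One-line audit record matching the call in GradioApp._reporting_workflow.
        timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
        return (f"[AUDIT] session={self.session_id} turn={turn} latency_ms={latency_ms:.0f} "
                f"confidence={confidence} response_chars={len(response)} at={timestamp}")

With such a stub on the Python path and HF_TOKEN exported, `python app.py` launches the Gradio UI defined above.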