Ndux

Runtime error

App Files Files Community

acecalisto3 committed on Aug 7

Commit

b26edd3

verified ·

1 Parent(s): 82241ae

Update app.py

Browse files

Files changed (1) hide show

app.py +235 -319

app.py CHANGED Viewed

@@ -1,335 +1,251 @@
 import os
 import requests
 import uuid
-from huggingface_hub import InferenceClient, HfApi
-from pypdf import PdfReader
-from bs4 import BeautifulSoup
 import datetime
 import zipfile
-import nltk
-import nltk.data
-import nltk.downloader  # Import the downloader explicitly
 import tempfile
 import shutil
-import time  # Import time for optional delay
-import secrets  # For generating session-specific keys (conceptual)
-from hashlib import sha256  # For checksum verification (conceptual)
-VERBOSE = True
-def log(message):
-    if VERBOSE:
-        print(f"[LOG] {datetime.datetime.now()} - {message}")
-# Hugging Face API Initialization
-HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-HF_TOKEN = os.environ.get('HF_TOKEN')
-if not HF_TOKEN:
-    raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
-try:
-    client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
-    api = HfApi(token=HF_TOKEN)
-    log("Initialized Hugging Face client and API.")
-except Exception as e:
-    log(f"Error initializing Hugging Face client: {e}")
-    exit(1)
-REPO_NAME = "acecalisto3/tmp"
-DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
-# Constants
-MAX_TOKENS = 8192
-# Utility Functions
-def generate_session_key():
-    """Generates a unique session key (conceptual for ephemeral session)."""
-    return secrets.token_hex(16)
-def verify_checksum(data):
-    """Generates a SHA-256 checksum of the data (conceptual for integrity)."""
-    return sha256(data.encode()).hexdigest()
-def get_file_id_from_google_drive_url(url):
-    if "drive.google.com" in url and "file/d/" in url:
-        parts = url.split("/file/d/")
-        if len(parts) < 2:
-            return None
-        file_id = parts[1].split("/")[0].split("?")[0]
-        return file_id
-    return None
-def download_google_drive_file(file_id, session_id):
-    """Downloads a Google Drive file to a temporary session-specific directory."""
-    download_url = f"https://drive.google.com/uc?id={file_id}"
-    temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-    os.makedirs(temp_session_dir, exist_ok=True)
-    try:
-        response = requests.get(download_url, stream=True)
-        response.raise_for_status()
-        content_disposition = response.headers.get('Content-Disposition')
-        if content_disposition:
-            filename = content_disposition.split("filename=")[1].strip('"')
-        else:
-            filename = f"file_{uuid.uuid4()}"
-        file_path = os.path.join(temp_session_dir, filename)
-        with open(file_path, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-        return file_path, temp_session_dir
-    except Exception as e:
-        log(f"Error downloading Google Drive file {file_id}: {e}")
-        # Clean up the session directory on error
-        if os.path.exists(temp_session_dir):
-            shutil.rmtree(temp_session_dir)
-        return None, None
-def read_pdf(file_path):
-    try:
-        reader = PdfReader(file_path)
-        text = "\n".join(page.extract_text() for page in reader.pages)
-        return text
-    except Exception as e:
-        log(f"Error reading PDF {file_path}: {e}")
-        return ""
-def read_txt(txt_path):
-    try:
-        with open(txt_path, "r", encoding="utf-8") as f:
-            return f.read()
-    except Exception as e:
-        log(f"Error reading TXT file {txt_path}: {e}")
-        return ""
-def read_zip(zip_path, session_id):
-    """Reads content from a ZIP file within the temporary session directory."""
-    extracted_data = []
-    temp_extract_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}_extract")
-    os.makedirs(temp_extract_dir, exist_ok=True)
-    try:
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            for file_info in zip_ref.infolist():
-                if file_info.filename.endswith((".txt", ".pdf")):
-                    with zip_ref.open(file_info) as file:
-                        content = file.read()
-                        temp_file_path = os.path.join(temp_extract_dir, file_info.filename)
-                        with open(temp_file_path, "wb") as temp_file:
-                            temp_file.write(content)
-                        if file_info.filename.endswith(".txt"):
-                            extracted_data.append(read_txt(temp_file_path))
-                        elif file_info.filename.endswith(".pdf"):
-                            extracted_data.append(read_pdf(temp_file_path))
-        return "\n".join(extracted_data)
-    except Exception as e:
-        log(f"Error reading ZIP file {zip_path}: {e}")
-        return ""
-    finally:
-        shutil.rmtree(temp_extract_dir, ignore_errors=True)
-def fetch_google_doc(url):
-    if "docs.google.com/document/d/" in url:
-        # Extract document ID
-        doc_id = url.split("/d/")[1].split("/")[0]
-        # Construct export URL for plain text
-        export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
         try:
-            response = requests.get(export_url)
-            response.raise_for_status()
-            return response.text
-        except requests.exceptions.HTTPError as e:
-            log(f"Error fetching Google Doc: {e}")
-            return None
-    else:
-        return None
-def fetch_url(url, max_depth, session_id):
-    visited = set()
-    to_visit = [(url, 0)]
-    results = []
-    errors = []
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
-    }
-    temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-    os.makedirs(temp_session_dir, exist_ok=True)
-    try:
-        while to_visit:
-            current_url, depth = to_visit.pop(0)
-            if current_url in visited:
-                continue
-            if depth < max_depth:
-                visited.add(current_url)
-                # Check if it's a Google Drive file URL
-                if "drive.google.com/file/d/" in current_url:
-                    file_id = get_file_id_from_google_drive_url(current_url)
-                    if file_id:
-                        file_path, temp_dir = download_google_drive_file(file_id, session_id)
-                        if file_path:
-                            file_ext = os.path.splitext(file_path)[1].lower()
-                            if file_ext == ".pdf":
-                                pdf_text = read_pdf(file_path)
-                                results.append(pdf_text)
-                            elif file_ext == ".txt":
-                                txt_content = read_txt(file_path)
-                                results.append(txt_content)
-                            elif file_ext == ".zip":
-                                zip_content = read_zip(file_path, session_id)
-                                results.append(zip_content)
-                            else:
-                                errors.append(f"Unsupported file type for URL: {current_url}")
-                            # Clean up the downloaded file, but keep the session dir for other files
-                            if temp_dir and os.path.exists(file_path):
-                                os.remove(file_path)
-                        else:
-                            errors.append(f"Failed to download file from URL: {current_url}")
-                    else:
-                        errors.append(f"Invalid Google Drive URL: {current_url}")
-                # Check if it's a Google Doc URL
-                elif "docs.google.com/document/d/" in current_url:
-                    doc_content = fetch_google_doc(current_url)
-                    if doc_content:
-                        results.append(doc_content)
-                    else:
-                        errors.append(f"Failed to fetch Google Doc: {current_url}")
-                else:
-                    try:
-                        response = requests.get(current_url, headers=headers, timeout=10)
-                        response.raise_for_status()
-                        soup = BeautifulSoup(response.content, 'html.parser')
-                        results.append(soup.get_text())
-                        for link in soup.find_all("a", href=True):
-                            absolute_url = requests.compat.urljoin(current_url, link.get('href'))
-                            if absolute_url.startswith("http") and absolute_url not in visited:
-                                to_visit.append((absolute_url, depth + 1))
-                        # Optional: Introduce a delay between requests
-                        # time.sleep(1)
-                    except Exception as e:
-                        log(f"Error fetching {current_url}: {e}")
-                        errors.append(f"Error fetching {current_url}: {e}")
-    finally:
-        # Clean up the temporary session directory after processing URLs
-        shutil.rmtree(temp_session_dir, ignore_errors=True)
-    return "\n".join(results), "\n".join(errors)
-def process_file(file, session_id):
-    """Processes an uploaded file within the temporary session directory."""
-    temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-    os.makedirs(temp_session_dir, exist_ok=True)
-    file_path = os.path.join(temp_session_dir, file.name)
-    try:
-        # Save the uploaded file to the temporary session directory
-        with open(file_path, "wb") as f:
-            f.write(file.read())
-        if file.name.endswith(".pdf"):
-            return read_pdf(file_path)
-        elif file.name.endswith(".txt"):
-            return read_txt(file_path)
-        elif file.name.endswith(".zip"):
-            return read_zip(file_path, session_id)
-    except Exception as e:
-        log(f"Error processing file {file.name}: {e}")
-        return ""
-    finally:
-        # The temporary session directory will be cleaned up after the workflow
-        pass
-def chunk_text(text, max_chunk_size):
-    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
-    sentences = tokenizer.tokenize(text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
-            chunks.append(current_chunk.strip())
-            current_chunk = ""
-        current_chunk += sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-def extract_dataset(data, instructions="Extract {history}", max_tokens=MAX_TOKENS):
-    extracted = []
-    chunks = chunk_text(data, max_chunk_size=20000)  # Adjusted size
-    for i, chunk in enumerate(chunks):
         try:
-            prompt = instructions.format(history=chunk)
-            response = client.text_generation(
-                prompt=prompt,
-                max_new_tokens=max_tokens
-            )
-            extracted.append(response.choices[0].text)
         except Exception as e:
-            log(f"Error processing chunk {i+1}: {e}")
-            extracted.append(f"Error processing chunk {i+1}: {e}")
-    return "\n".join(extracted)
-def combine_datasets(datasets):
-    return "\n".join(datasets)
-# Gradio App Interface
-import gradio as gr  # Ensure you import gradio
-with gr.Blocks() as app:
-    session_id = gr.State(generate_session_key) # Unique session ID for each user
-    gr.Markdown(
-        "**Dataset Generator and Flash Chatbot**: Upload files, scrape data from URLs, or enter text to generate datasets and interact with a chatbot."
-    )
-    chatbot = gr.Chatbot(label="Flash Trained Chatbot")
-    command_selector = gr.Dropdown(
-        label="Select Command",
-        choices=["Scrape Data", "Extract Dataset", "Combine Datasets", "Train Chatbot"],
-        value="Scrape Data"
-    )
-    data_input = gr.Textbox(label="Input Text", placeholder="Enter text here.")
-    file_upload = gr.Files(label="Upload Files", file_types=[".pdf", ".txt", ".zip"])
-    url_input = gr.Textbox(label="URL")
-    depth_slider = gr.Slider(label="Crawl Depth", minimum=1, maximum=10, value=1)
-    output_json = gr.JSON(label="Output Dataset")
-    error_output = gr.Textbox(label="Error Log", interactive=False)
-    process_button = gr.Button("Process")
-    def process_workflow(session_id, command, data, files, url, depth):
-        datasets = []
-        errors = []
-        temp_session_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
-        os.makedirs(temp_session_dir, exist_ok=True)
         try:
-            if data:
-                datasets.append(data)
             if files:
-                for file in files:
-                    file_data = process_file(file, session_id)
-                    if file_data:
-                        datasets.append(file_data)
                     else:
-                        errors.append(f"Failed to process file: {file.name}")
-            if url:
-                url_data, fetch_errors = fetch_url(url, depth, session_id)
-                if url_data:
-                    datasets.append(url_data)
-                else:
-                    errors.append(f"Failed to fetch data from URL: {url}")
-                if fetch_errors:
-                    errors.append(fetch_errors)
-            if command == "Extract Dataset":
-                extracted_data = extract_dataset("\n".join(datasets))
-                return session_id, {"datasets": [extracted_data]}, "\n".join(errors)
-            elif command == "Combine Datasets":
-                combined_data = combine_datasets(datasets)
-                return session_id, {"datasets": [combined_data]}, "\n".join(errors)
-            else:
-                return session_id, {"datasets": datasets}, "\n".join(errors)
         except Exception as e:
-            errors.append(str(e))
-            return session_id, {"datasets": []}, "\n".join(errors)
-        finally:
-            # Clean up the temporary session directory after the workflow
-            shutil.rmtree(temp_session_dir, ignore_errors=True)
-    process_button.click(
-        process_workflow,
-        inputs=[session_id, command_selector, data_input, file_upload, url_input, depth_slider],
-        outputs=[session_id, output_json, error_output]
-    )
-    app.launch()

+# app.py
 import os
 import requests
 import uuid
 import datetime
 import zipfile
 import tempfile
 import shutil
+import secrets
+import time
+import json
+from typing import List, Tuple, Any, Dict
+# Third-party libraries
+import gradio as gr
+from huggingface_hub import InferenceClient
+from pypdf import PdfReader
+from bs4 import BeautifulSoup
+import nltk
+# Local imports from the enhanced prompt library
+from agent_prompt import PromptLibrary, SystemAuditor
+# --- CONFIGURATION ---
+class Config:
+    """Centralized configuration for the Maestro application."""
+    HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
+    HF_TOKEN = os.getenv("HF_TOKEN")
+    VERBOSE = os.getenv("VERBOSE", "True").lower() == "true"
+    MAX_NEW_TOKENS_REPORT = 4096
+    MAX_NEW_TOKENS_CHAT = 1024
+    REQUESTS_TIMEOUT = 15
+    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+# --- UTILITIES ---
+def log(message: str) -> None:
+    if Config.VERBOSE:
+        print(f"[LOG] {datetime.datetime.now(datetime.timezone.utc).isoformat()} - {message}")
+class SessionManager:
+    """A context manager for creating and cleaning up session-specific temporary directories."""
+    def __init__(self, session_id: str):
+        self.session_id = session_id
+        self.temp_dir = os.path.join(tempfile.gettempdir(), f"session_{session_id}")
+    def __enter__(self) -> str:
+        os.makedirs(self.temp_dir, exist_ok=True)
+        log(f"Session '{self.session_id}' started. Temp dir: {self.temp_dir}")
+        return self.temp_dir
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        log(f"Session '{self.session_id}' ended. Temp dir cleaned up.")
+# --- CORE APPLICATION ENGINE ---
+class MaestroEngine:
+    """Handles all data processing and LLM interaction logic."""
+    def __init__(self):
+        self.client = InferenceClient(model=Config.HF_MODEL, token=Config.HF_TOKEN)
         try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            log("Downloading NLTK 'punkt' tokenizer...")
+            nltk.download('punkt', quiet=True)
+        log("MaestroEngine initialized.")
+    def _read_pdf(self, file_path: str) -> str:
+        # (Implementation for reading PDF)
         try:
+            reader = PdfReader(file_path)
+            return "\n".join(page.extract_text() or "" for page in reader.pages)
         except Exception as e:
+            log(f"Error reading PDF {os.path.basename(file_path)}: {e}")
+            return f"Error reading PDF: {e}"
+    def _process_zip(self, zip_path: str, temp_dir: str) -> str:
+        # (Implementation for processing ZIP)
+        extracted_texts = []
+        extract_path = os.path.join(temp_dir, "zip_extract")
+        os.makedirs(extract_path, exist_ok=True)
         try:
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                for member in zf.infolist():
+                    if member.filename.endswith('.pdf'):
+                        zf.extract(member, extract_path)
+                        extracted_texts.append(self._read_pdf(os.path.join(extract_path, member.filename)))
+                    elif member.filename.endswith('.txt'):
+                        extracted_texts.append(zf.read(member).decode('utf-8', errors='ignore'))
+            return "\n\n".join(extracted_texts)
+        except Exception as e:
+            log(f"Error processing ZIP {os.path.basename(zip_path)}: {e}")
+            return f"Error processing ZIP: {e}"
+    def process_data_sources(self, session_id: str, url: str, text: str, files: List[Any]) -> Tuple[str, List[str]]:
+        """Orchestrates data ingestion from all provided sources."""
+        all_content, errors = [], []
+        with SessionManager(session_id) as temp_dir:
+            if url:
+                try:
+                    response = requests.get(url, headers={'User-Agent': Config.USER_AGENT}, timeout=Config.REQUESTS_TIMEOUT)
+                    response.raise_for_status()
+                    all_content.append(BeautifulSoup(response.content, 'html.parser').get_text(separator="\n", strip=True))
+                except Exception as e:
+                    errors.append(f"URL Fetch Error: {e}")
+            if text:
+                all_content.append(text)
             if files:
+                for file_obj in files:
+                    file_path = os.path.join(temp_dir, os.path.basename(file_obj.name))
+                    with open(file_path, "wb") as f:
+                        shutil.copyfileobj(file_obj, f)
+                    ext = os.path.splitext(file_obj.name)[1].lower()
+                    if ext == '.pdf':
+                        all_content.append(self._read_pdf(file_path))
+                    elif ext == '.txt':
+                        all_content.append(open(file_path, 'r', encoding='utf-8').read())
+                    elif ext == '.zip':
+                        all_content.append(self._process_zip(file_path, temp_dir))
                     else:
+                        errors.append(f"Unsupported file type: {file_obj.name}")
+        return "\n\n---\n\n".join(all_content), errors
+    def _query_llm(self, prompt: str, max_tokens: int) -> str:
+        try:
+            response = self.client.text_generation(prompt, max_new_tokens=max_tokens, temperature=0.7, top_p=0.95)
+            return response.strip()
         except Exception as e:
+            log(f"LLM query failed: {e}")
+            return f"Error communicating with the model: {e}"
+    def run_rag_query(self, query: str, context: str) -> str:
+        prompt = f"Context:\n---\n{context}\n---\nBased only on the context provided, answer the following question:\nQuestion: {query}"
+        return self._query_llm(prompt, Config.MAX_NEW_TOKENS_CHAT)
+    def generate_report(self, report_type: str, context: str, objective: str) -> str:
+        if report_type == "Narrative Prose Report":
+            prompt = PromptLibrary.NARRATIVE_PROSE_REPORT.format(
+                task_objective=objective,
+                knowledge_base=context
+            )
+            return self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
+        elif report_type == "Technical JSON Report":
+            prompt = PromptLibrary.TECHNICAL_JSON_REPORT.format(
+                task_objective=objective,
+                baseline_knowledge="Previously established facts",
+                new_information=context
+            )
+            raw_response = self._query_llm(prompt, Config.MAX_NEW_TOKENS_REPORT)
+            # Clean up potential markdown code fences for JSON parsing
+            clean_json_str = raw_response.replace("```json", "").replace("```", "").strip()
+            try:
+                # Validate and reformat for pretty printing
+                return json.dumps(json.loads(clean_json_str), indent=2)
+            except json.JSONDecodeError:
+                log(f"Failed to parse LLM response as JSON. Raw response: {raw_response}")
+                return '{"error": "The model did not return valid JSON.", "raw_response": ' + json.dumps(raw_response) + '}'
+        return "Invalid report type selected."
+# --- GRADIO APPLICATION ---
+class GradioApp:
+    """Manages the Gradio UI and application workflow."""
+    def __init__(self, engine: MaestroEngine):
+        self.engine = engine
+        self.app = self._build_ui()
+    def _build_ui(self) -> gr.Blocks:
+        with gr.Blocks(theme=gr.themes.Monochrome(), title="Maestro AI Engine") as app:
+            # State management
+            session_id = gr.State(lambda: secrets.token_hex(16))
+            auditor = gr.State(lambda s_id: SystemAuditor(session_id=s_id), session_id)
+            processed_data = gr.State("")
+            gr.Markdown("# 🧠 Maestro: AI Data Engine & Synthesis Platform")
+            with gr.Tabs():
+                with gr.TabItem("① Data Ingestion"):
+                    with gr.Row():
+                        with gr.Column():
+                            url_input = gr.Textbox(label="Scrape from URL")
+                            text_input = gr.Textbox(label="Paste Text", lines=10)
+                        file_upload = gr.Files(label="Upload Files (.pdf, .txt, .zip)", type="file")
+                    process_button = gr.Button("🚀 Process All Sources", variant="primary")
+                    ingestion_summary = gr.Textbox(label="Ingestion Summary", interactive=False)
+                    error_log = gr.Textbox(label="Errors", interactive=False)
+                with gr.TabItem("② Reporting & Synthesis"):
+                    report_objective = gr.Textbox(label="Report Objective", placeholder="e.g., 'Synthesize findings on AI in agriculture'")
+                    report_type = gr.Dropdown(label="Select Report Type", choices=["Narrative Prose Report", "Technical JSON Report"])
+                    generate_button = gr.Button("Generate Report", variant="primary")
+                    with gr.Tabs():
+                        with gr.TabItem("Narrative Output"):
+                            report_output_md = gr.Markdown()
+                        with gr.TabItem("JSON Output"):
+                            report_output_json = gr.JSON()
+                with gr.TabItem("③ Direct Chat Q&A"):
+                    chatbot = gr.Chatbot(label="Chat Interface", height=550)
+                    msg_input = gr.Textbox(label="Your Question", placeholder="Ask a question about the processed data...")
+                    msg_input.submit(self._chat_workflow, [msg_input, chatbot, processed_data], [msg_input, chatbot])
+            # --- Workflow Connections ---
+            process_button.click(self._ingest_workflow, [session_id, url_input, text_input, file_upload], [processed_data, ingestion_summary, error_log])
+            generate_button.click(self._reporting_workflow, [auditor, report_type, processed_data, report_objective], [report_output_md, report_output_json])
+        return app
+    def _ingest_workflow(self, s_id, url, text, files):
+        log(f"Starting ingestion for session {s_id}...")
+        data, errors = self.engine.process_data_sources(s_id, url, text, files)
+        summary = f"Processing complete. {len(data)} characters ingested. {len(errors)} errors encountered."
+        return data, summary, "\n".join(errors)
+    def _chat_workflow(self, message, history, context):
+        if not context:
+            history.append((message, "Error: No data has been ingested. Please process data in Tab 1 first."))
+            return "", history
+        response = self.engine.run_rag_query(message, context)
+        history.append((message, response))
+        return "", history
+    def _reporting_workflow(self, auditor_instance, r_type, context, objective):
+        if not context:
+            md_out = "### Error: No data has been ingested. Please process data in Tab 1 first."
+            return md_out, None
+        start_time = time.time()
+        response = self.engine.generate_report(r_type, context, objective)
+        latency = (time.time() - start_time) * 1000
+        log(auditor_instance.format_response_log(response, latency, 1, 0.95)) # Log the event
+        if r_type == "Narrative Prose Report":
+            return response, None
+        elif r_type == "Technical JSON Report":
+            # The engine already returns a JSON string or an error object
+            return None, json.loads(response)
+    def launch(self):
+        self.app.launch(debug=Config.VERBOSE)
+# --- MAIN EXECUTION BLOCK ---
+if __name__ == "__main__":
+    if not Config.HF_TOKEN:
+        print("FATAL: Hugging Face token (HF_TOKEN) not found in environment variables.")
+        print("Please set your token, e.g., `export HF_TOKEN='hf_...'`")
+    else:
+        log("Instantiating Maestro Engine and launching Gradio App...")
+        maestro_engine = MaestroEngine()
+        gradio_app = GradioApp(engine=maestro_engine)
+        gradio_app.launch()