acecalisto3 committed on
Commit a8f4aca · verified · 1 parent: d521627

Update app.py

Files changed (1): app.py (+194 -337)
app.py CHANGED
@@ -1,365 +1,222 @@
- import asyncio
- import logging
- from pathlib import Path
- from typing import List, Dict, Any, Optional, Tuple
  import os
  import uuid
  import json
  import datetime
  import random
- from dataclasses import dataclass
-
- # Web and API
- import gradio as gr
- import requests
- import bs4
- import lxml
- from huggingface_hub import InferenceClient, HfApi
- from pypdf import PdfReader
-
- # Configure logging
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - %(message)s'
- )
- logger = logging.getLogger(__name__)
-
- # Configuration
- @dataclass
- class Config:
-     MODEL_NAME: str = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-     REPO_NAME: str = "acecalisto3/tmp"
-     MAX_HISTORY: int = 100
-     MAX_DATA: int = 20000
-     CHUNK_SIZE: int = 8192
-     REQUEST_TIMEOUT: int = 30
-     MAX_RETRIES: int = 3
-     TEMP_DIR: str = "temp"
-
- # Initialize configuration
- config = Config()
-
- # Ensure temp directory exists
- Path(config.TEMP_DIR).mkdir(exist_ok=True)
-
- # Initialize API clients
- try:
-     token_self = os.environ['HF_TOKEN']
-     client = InferenceClient(config.MODEL_NAME)
-     api = HfApi(token=token_self)
-     save_data = f'https://huggingface.co/datasets/{config.REPO_NAME}/raw/main/'
- except KeyError:
-     logger.error("HF_TOKEN environment variable not set")
-     raise EnvironmentError("Missing HF_TOKEN environment variable")
- except Exception as e:
-     logger.error(f"Failed to initialize API clients: {str(e)}")
-     raise
-
- class WebScraper:
-     """Handles web scraping operations"""
-
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-         }
-         self.error_box = []
-
-     async def find_all(self, url: str, max_depth: int = 2) -> Tuple[bool, List[Dict]]:
-         """
-         Asynchronously scrape web content from URL and its links
-         """
-         return_list = []
-         visited_links = set()
-         links_to_visit = [(url, 0)]
-
-         try:
-             while links_to_visit and len(visited_links) < max_depth:
-                 current_url, current_depth = links_to_visit.pop(0)
-
-                 if current_url not in visited_links and current_depth < max_depth:
-                     visited_links.add(current_url)
-
-                     async with requests.Session() as session:
-                         response = await session.get(
-                             current_url,
-                             headers=self.headers,
-                             timeout=config.REQUEST_TIMEOUT
-                         )
-
-                         if response.status_code == 200:
-                             soup = bs4.BeautifulSoup(response.content, 'lxml')
-                             return_list.append({
-                                 'url': current_url,
-                                 'content': soup.text,
-                                 'depth': current_depth,
-                                 'timestamp': datetime.datetime.now().isoformat()
-                             })
-
-                             # Process links
-                             for link in soup.find_all("a", href=True):
-                                 href = link.get('href')
-                                 if href and href.startswith('http'):
-                                     links_to_visit.append((href, current_depth + 1))
-
-         except Exception as e:
-             logger.error(f"Error during web scraping: {str(e)}")
-             self.error_box.append({
-                 'url': url,
-                 'error': str(e),
-                 'timestamp': datetime.datetime.now().isoformat()
-             })
-             return False, []
-
-         return True, return_list
-
- class DocumentProcessor:
-     """Handles document processing operations"""
-
-     def __init__(self):
-         self.error_box = []
-
-     async def read_pdf_online(self, url: str) -> str:
-         """
-         Asynchronously download and process PDF from URL
-         """
-         temp_file = Path(config.TEMP_DIR) / f"temp_{uuid.uuid4()}.pdf"
-
-         try:
-             async with requests.Session() as session:
-                 response = await session.get(url, stream=True, timeout=config.REQUEST_TIMEOUT)
-
-                 if response.status_code == 200:
-                     temp_file.write_bytes(response.content)
-
-                     reader = PdfReader(str(temp_file))
-                     text = "\n".join(
-                         page.extract_text() for page in reader.pages
-                     )
-
-                     return text
-                 else:
-                     raise Exception(f"HTTP {response.status_code}")
-
-         except Exception as e:
-             logger.error(f"Error processing PDF {url}: {str(e)}")
-             self.error_box.append({
-                 'url': url,
-                 'error': str(e),
-                 'timestamp': datetime.datetime.now().isoformat()
-             })
-             return f"Error processing PDF: {str(e)}"
-
-         finally:
-             if temp_file.exists():
-                 temp_file.unlink()
-
-     async def process_files(self, files: List[str]) -> str:
-         """
-         Process uploaded files (PDF/TXT)
-         """
-         result = []
-
-         for file in files:
              try:
-                 file_path = Path(file)
-                 if file_path.suffix.lower() == '.pdf':
-                     reader = PdfReader(str(file_path))
-                     text = "\n".join(
-                         page.extract_text() for page in reader.pages
-                     )
-                 elif file_path.suffix.lower() == '.txt':
-                     text = file_path.read_text()
                  else:
-                     continue
-
-                 result.append({
-                     'filename': file_path.name,
-                     'content': text,
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
              except Exception as e:
-                 logger.error(f"Error processing file {file}: {str(e)}")
-                 self.error_box.append({
-                     'file': file,
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return json.dumps(result, indent=2)
-
- class DataProcessor:
-     """Handles data processing and compression operations"""
-
-     def __init__(self):
-         self.error_box = []
-
-     async def compress_data(self, content: str, instructions: str) -> List[str]:
-         """
-         Compress and process data in chunks
-         """
-         chunk_size = min(config.MAX_DATA, len(content))
-         num_chunks = (len(content) + chunk_size - 1) // chunk_size
-
-         compressed_data = []
-         seed = random.randint(1, 1000000000)
-
-         for i in range(num_chunks):
-             start_idx = i * chunk_size
-             end_idx = min((i + 1) * chunk_size, len(content))
-             chunk = content[start_idx:end_idx]
-
-             try:
-                 response = await self.run_gpt(
-                     chunk,
-                     instructions,
-                     seed
-                 )
-                 compressed_data.append(response)
-
-             except Exception as e:
-                 logger.error(f"Error compressing chunk {i}: {str(e)}")
-                 self.error_box.append({
-                     'chunk': i,
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return compressed_data
-
-     async def run_gpt(self, content: str, instructions: str, seed: int) -> str:
-         """
-         Run GPT model inference
-         """
-         try:
-             response = await client.text_generation(
-                 content,
-                 max_new_tokens=config.CHUNK_SIZE,
-                 temperature=0.9,
-                 top_p=0.95,
-                 repetition_penalty=1.0,
-                 do_sample=True,
-                 seed=seed,
-                 stream=True
-             )
-
-             return "".join(r.token.text for r in response)
-
-         except Exception as e:
-             logger.error(f"GPT inference error: {str(e)}")
-             raise
-
- class WebInterface:
-     """Handles Gradio web interface"""
-
-     def __init__(self):
-         self.scraper = WebScraper()
-         self.doc_processor = DocumentProcessor()
-         self.data_processor = DataProcessor()
-
-     def build_interface(self):
-         """
-         Create Gradio interface
-         """
-         with gr.Blocks() as app:
-             gr.Markdown("# Document Processing and Web Scraping Tool")
-
-             with gr.Tab("Input"):
-                 text_input = gr.Textbox(label="Instructions")
-                 url_input = gr.Textbox(label="URL")
-                 file_input = gr.File(label="Upload Files")
-
-             with gr.Tab("Options"):
-                 depth_slider = gr.Slider(1, 5, value=2, label="Scraping Depth")
-                 compress_checkbox = gr.Checkbox(label="Compress Output")
-
-             with gr.Tab("Output"):
-                 output_text = gr.Textbox(label="Results")
-                 error_output = gr.JSON(label="Errors")
-
-             submit_btn = gr.Button("Process")
-             clear_btn = gr.Button("Clear")
-
-             submit_btn.click(
-                 fn=self.process_input,
-                 inputs=[text_input, url_input, file_input, depth_slider, compress_checkbox],
-                 outputs=[output_text, error_output]
-             )
-
-             clear_btn.click(
-                 fn=self.clear_output,
-                 inputs=[],
-                 outputs=[text_input, url_input, output_text, error_output]
-             )
-
-         return app
-
-     async def process_input(
-         self,
-         instructions: str,
-         url: str,
-         files: List[str],
-         depth: int,
-         compress: bool
-     ) -> Tuple[str, Dict]:
-         """
-         Process user input and return results
-         """
-         results = []
-         errors = []
-
-         # Process URL if provided
          if url:
-             success, web_data = await self.scraper.find_all(url, depth)
-             if success:
-                 results.extend(web_data)
-             errors.extend(self.scraper.error_box)
-
-         # Process files if provided
-         if files:
-             file_data = await self.doc_processor.process_files(files)
-             results.append(file_data)
-             errors.extend(self.doc_processor.error_box)
-
-         # Compress results if requested
-         if compress and results:
              try:
-                 compressed = await self.data_processor.compress_data(
-                     json.dumps(results),
-                     instructions
-                 )
-                 results = compressed
              except Exception as e:
-                 errors.append({
-                     'operation': 'compression',
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return json.dumps(results, indent=2), {'errors': errors}
-
-     def clear_output(self):
-         """
-         Clear interface outputs
-         """
-         return ["", "", "", None]
-
- def main():
-     """
-     Main application entry point
-     """
-     try:
-         interface = WebInterface()
-         app = interface.build_interface()
-         app.launch(
-             server_name="0.0.0.0",
-             server_port=7860,
-             share=True
-         )
      except Exception as e:
-         logger.error(f"Application startup failed: {str(e)}")
-         raise
-
- if __name__ == "__main__":
-     main()
+ import gradio as gr
  import os
+ import requests
  import uuid
+ from huggingface_hub import InferenceClient, HfApi
+ from pypdf import PdfReader
+ from bs4 import BeautifulSoup
+ import lxml
  import json
  import datetime
  import random
+ import zipfile
+
+ # Enable verbose logging
+ VERBOSE = True
+ def log(message):
+     if VERBOSE:
+         print(f"[LOG] {datetime.datetime.now()} - {message}")
+
+ # Hugging Face API Initialization
+ HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ HF_TOKEN = os.environ.get('HF_TOKEN')
+
+ if not HF_TOKEN:
+     raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
+
+ client = InferenceClient(HF_MODEL)
+ api = HfApi(token=HF_TOKEN)
+
+ REPO_NAME = "acecalisto3/tmp"
+ DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
+
+ log("Initialized Hugging Face client and API.")
+
+ # Constants
+ MAX_HISTORY = 100
+ MAX_DATA = 20000
+
+ # Utility Functions
+ def read_pdf(file_path):
+     log(f"Reading PDF: {file_path}")
+     try:
+         reader = PdfReader(file_path)
+         text = "\n".join(page.extract_text() for page in reader.pages)
+         log(f"Extracted text from {len(reader.pages)} pages.")
+         return text
+     except Exception as e:
+         log(f"Error reading PDF: {e}")
+         return str(e)
+
+ def fetch_url(url, max_depth=1):
+     log(f"Fetching URL: {url} with depth: {max_depth}")
+     visited = set()
+     to_visit = [(url, 0)]
+     results = []
+
+     while to_visit:
+         current_url, depth = to_visit.pop(0)
+         if depth < max_depth and current_url not in visited:
              try:
+                 response = requests.get(current_url)
+                 if response.status_code == 200:
+                     visited.add(current_url)
+                     soup = BeautifulSoup(response.content, 'lxml')
+                     results.append(soup.text)
+                     for link in soup.find_all("a", href=True):
+                         if link["href"].startswith("http"):
+                             to_visit.append((link["href"], depth + 1))
                  else:
+                     log(f"Failed to fetch {current_url} (status code: {response.status_code}).")
              except Exception as e:
+                 log(f"Error fetching {current_url}: {e}")
+     return results
+
+ def read_txt(txt_path):
+     log(f"Reading TXT file: {txt_path}")
+     try:
+         with open(txt_path, "r") as f:
+             content = f.read()
+         return content
+     except Exception as e:
+         log(f"Error reading TXT file: {e}")
+         return str(e)
+
+ def chunk_text(text, max_chunk_size):
+     log(f"Chunking text into max size: {max_chunk_size}")
+     chunks = []
+     while len(text) > max_chunk_size:
+         split_index = text.rfind(" ", 0, max_chunk_size)
+         split_index = split_index if split_index != -1 else max_chunk_size
+         chunks.append(text[:split_index])
+         text = text[split_index:]
+     if text:
+         chunks.append(text)
+     log(f"Chunked into {len(chunks)} parts.")
+     return chunks
+
+ def run_gpt(prompt, max_tokens=512, temperature=0.9):
+     log("Running GPT task...")
+     try:
+         response = client.text_generation(prompt, max_new_tokens=max_tokens, temperature=temperature)
+         log(f"Received GPT response of length {len(response)}.")
+         return response
+     except Exception as e:
+         log(f"Error during GPT interaction: {e}")
+         return str(e)
+
+ # Data Compression Logic
+ def compress_data(data, instructions, max_tokens=8192):
+     log("Compressing data...")
+     total_length = len(data)
+     chunks = chunk_text(data, MAX_DATA)
+     results = []
+
+     for chunk in chunks:
+         result = run_gpt(
+             prompt=instructions.format(history=chunk),
+             max_tokens=max_tokens,
+             temperature=0.9
+         )
+         results.append(result)
+
+     combined_result = "\n".join(results)
+     log("Data compression complete.")
+     return combined_result
+
+ def save_memory(task, history):
+     log("Saving memory to Hugging Face...")
+     try:
+         uid = str(uuid.uuid4())
+         timestamp = datetime.datetime.now().isoformat()
+         filename = f"memory-{uid}.json"
+
+         memory = {
+             "task": task,
+             "history": history,
+             "timestamp": timestamp
+         }
+         with open(filename, "w") as f:
+             json.dump(memory, f)
+
+         api.upload_file(
+             path_or_fileobj=filename,
+             path_in_repo=f"memories/{filename}",
+             repo_id=REPO_NAME,
+             repo_type="dataset",
+             token=HF_TOKEN
+         )
+         log("Memory saved successfully.")
+         return memory
+     except Exception as e:
+         log(f"Error saving memory: {e}")
+         return None
+
+ # Summarization Logic
+ def summarize(inp, history, report_check, data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
+     log("Starting summarization...")
+     output_data = ""
+     error_box = []
+     json_box = []
+
+     try:
+         if data:
+             log("Processing input text.")
+             output_data += data
+
+         if files:
+             for file in files:
+                 if file.name.endswith(".pdf"):
+                     output_data += f"\n{read_pdf(file.name)}"
+                 elif file.name.endswith(".txt"):
+                     output_data += f"\n{read_txt(file.name)}"
+
          if url:
+             log(f"Processing URL: {url}")
+             output_data += "\n".join(fetch_url(url))
+
+         if pdf_url:
+             log(f"Processing PDF URL: {pdf_url}")
              try:
+                 response = requests.get(pdf_url)
+                 if response.status_code == 200:
+                     with open("temp.pdf", "wb") as f:
+                         f.write(response.content)
+                     output_data += read_pdf("temp.pdf")
+                     os.remove("temp.pdf")
              except Exception as e:
+                 log(f"Error fetching PDF from URL: {e}")
+                 error_box.append(f"PDF Error: {e}")
+
+         compressed = compress_data(output_data, instructions=inp, max_tokens=8192)
+         log("Summarization complete.")
+         return compressed
      except Exception as e:
+         log(f"Error during summarization: {e}")
+         error_box.append(f"Summarization Error: {e}")
+         return None
+
200
+ with gr.Blocks() as app:
201
+ gr.HTML("<center><h1>Mixtral 8x7B Summarizer</h1><p>Summarize unlimited-length data</p></center>")
202
+
203
+ chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)
204
+ prompt = gr.Textbox(label="Instructions", placeholder="Summarization instructions (optional)")
205
+ data = gr.Textbox(label="Input Data", lines=6, placeholder="Enter text or upload a file.")
206
+ files = gr.Files(label="Upload Files (.pdf, .txt)", file_types=[".pdf", ".txt"])
207
+ url = gr.Textbox(label="URL")
208
+ pdf_url = gr.Textbox(label="PDF URL")
209
+ json_out = gr.JSON(label="Output JSON")
210
+ error_box = gr.Textbox(label="Error Box", interactive=False)
211
+ button = gr.Button("Process")
212
+
213
+ def process_summarization(inp, history, data, files, url, pdf_url):
214
+ return summarize(inp, history, report_check=True, data=data, files=files, url=url, pdf_url=pdf_url)
215
+
216
+ button.click(
217
+ process_summarization,
218
+ inputs=[prompt, chatbot, data, files, url, pdf_url],
219
+ outputs=[json_out, error_box]
220
+ )
221
+
222
+ app.launch()
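
For reviewers who want to sanity-check the word-boundary chunking that the new compress_data relies on, here is a minimal standalone sketch of the splitting rule used by chunk_text. It is reproduced inline (not imported from app.py, since the new module launches the Gradio server and requires HF_TOKEN at import time), and the example input is hypothetical.

    def chunk_text(text, max_chunk_size):
        # Split at the last space before the limit so words are not cut in half.
        chunks = []
        while len(text) > max_chunk_size:
            split_index = text.rfind(" ", 0, max_chunk_size)
            split_index = split_index if split_index != -1 else max_chunk_size
            chunks.append(text[:split_index])
            text = text[split_index:]
        if text:
            chunks.append(text)
        return chunks

    # Example: every chunk stays at or under the limit.
    print([len(c) for c in chunk_text("lorem ipsum dolor " * 300, 1000)])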