acecalisto3 committed on
Commit a8f4aca · verified · 1 parent: d521627

Update app.py

Files changed (1): app.py (+194 -337)
app.py CHANGED
@@ -1,365 +1,222 @@
- import asyncio
- import logging
- from pathlib import Path
- from typing import List, Dict, Any, Optional, Tuple
  import os
  import uuid
  import json
  import datetime
  import random
- from dataclasses import dataclass
-
- # Web and API
- import gradio as gr
- import requests
- import bs4
- import lxml
- from huggingface_hub import InferenceClient, HfApi
- from pypdf import PdfReader
-
- # Configure logging
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - %(message)s'
- )
- logger = logging.getLogger(__name__)
-
- # Configuration
- @dataclass
- class Config:
-     MODEL_NAME: str = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-     REPO_NAME: str = "acecalisto3/tmp"
-     MAX_HISTORY: int = 100
-     MAX_DATA: int = 20000
-     CHUNK_SIZE: int = 8192
-     REQUEST_TIMEOUT: int = 30
-     MAX_RETRIES: int = 3
-     TEMP_DIR: str = "temp"
-
- # Initialize configuration
- config = Config()
-
- # Ensure temp directory exists
- Path(config.TEMP_DIR).mkdir(exist_ok=True)
-
- # Initialize API clients
- try:
-     token_self = os.environ['HF_TOKEN']
-     client = InferenceClient(config.MODEL_NAME)
-     api = HfApi(token=token_self)
-     save_data = f'https://huggingface.co/datasets/{config.REPO_NAME}/raw/main/'
- except KeyError:
-     logger.error("HF_TOKEN environment variable not set")
-     raise EnvironmentError("Missing HF_TOKEN environment variable")
- except Exception as e:
-     logger.error(f"Failed to initialize API clients: {str(e)}")
-     raise
-
- class WebScraper:
-     """Handles web scraping operations"""
-
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-         }
-         self.error_box = []
-
-     async def find_all(self, url: str, max_depth: int = 2) -> Tuple[bool, List[Dict]]:
-         """
-         Asynchronously scrape web content from URL and its links
-         """
-         return_list = []
-         visited_links = set()
-         links_to_visit = [(url, 0)]
-
-         try:
-             while links_to_visit and len(visited_links) < max_depth:
-                 current_url, current_depth = links_to_visit.pop(0)
-
-                 if current_url not in visited_links and current_depth < max_depth:
-                     visited_links.add(current_url)
-
-                     async with requests.Session() as session:
-                         response = await session.get(
-                             current_url,
-                             headers=self.headers,
-                             timeout=config.REQUEST_TIMEOUT
-                         )
-
-                         if response.status_code == 200:
-                             soup = bs4.BeautifulSoup(response.content, 'lxml')
-                             return_list.append({
-                                 'url': current_url,
-                                 'content': soup.text,
-                                 'depth': current_depth,
-                                 'timestamp': datetime.datetime.now().isoformat()
-                             })
-
-                             # Process links
-                             for link in soup.find_all("a", href=True):
-                                 href = link.get('href')
-                                 if href and href.startswith('http'):
-                                     links_to_visit.append((href, current_depth + 1))
-
-         except Exception as e:
-             logger.error(f"Error during web scraping: {str(e)}")
-             self.error_box.append({
-                 'url': url,
-                 'error': str(e),
-                 'timestamp': datetime.datetime.now().isoformat()
-             })
-             return False, []
-
-         return True, return_list
-
- class DocumentProcessor:
-     """Handles document processing operations"""
-
-     def __init__(self):
-         self.error_box = []
-
-     async def read_pdf_online(self, url: str) -> str:
-         """
-         Asynchronously download and process PDF from URL
-         """
-         temp_file = Path(config.TEMP_DIR) / f"temp_{uuid.uuid4()}.pdf"
-
-         try:
-             async with requests.Session() as session:
-                 response = await session.get(url, stream=True, timeout=config.REQUEST_TIMEOUT)
-
-                 if response.status_code == 200:
-                     temp_file.write_bytes(response.content)
-
-                     reader = PdfReader(str(temp_file))
-                     text = "\n".join(
-                         page.extract_text() for page in reader.pages
-                     )
-
-                     return text
-                 else:
-                     raise Exception(f"HTTP {response.status_code}")
-
-         except Exception as e:
-             logger.error(f"Error processing PDF {url}: {str(e)}")
-             self.error_box.append({
-                 'url': url,
-                 'error': str(e),
-                 'timestamp': datetime.datetime.now().isoformat()
-             })
-             return f"Error processing PDF: {str(e)}"
-
-         finally:
-             if temp_file.exists():
-                 temp_file.unlink()
-
-     async def process_files(self, files: List[str]) -> str:
-         """
-         Process uploaded files (PDF/TXT)
-         """
-         result = []
-
-         for file in files:
              try:
-                 file_path = Path(file)
-                 if file_path.suffix.lower() == '.pdf':
-                     reader = PdfReader(str(file_path))
-                     text = "\n".join(
-                         page.extract_text() for page in reader.pages
-                     )
-                 elif file_path.suffix.lower() == '.txt':
-                     text = file_path.read_text()
                  else:
-                     continue
-
-                 result.append({
-                     'filename': file_path.name,
-                     'content': text,
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
              except Exception as e:
-                 logger.error(f"Error processing file {file}: {str(e)}")
-                 self.error_box.append({
-                     'file': file,
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return json.dumps(result, indent=2)
-
- class DataProcessor:
-     """Handles data processing and compression operations"""
-
-     def __init__(self):
-         self.error_box = []
-
-     async def compress_data(self, content: str, instructions: str) -> List[str]:
-         """
-         Compress and process data in chunks
-         """
-         chunk_size = min(config.MAX_DATA, len(content))
-         num_chunks = (len(content) + chunk_size - 1) // chunk_size
-
-         compressed_data = []
-         seed = random.randint(1, 1000000000)
-
-         for i in range(num_chunks):
-             start_idx = i * chunk_size
-             end_idx = min((i + 1) * chunk_size, len(content))
-             chunk = content[start_idx:end_idx]
-
-             try:
-                 response = await self.run_gpt(
-                     chunk,
-                     instructions,
-                     seed
-                 )
-                 compressed_data.append(response)
-
-             except Exception as e:
-                 logger.error(f"Error compressing chunk {i}: {str(e)}")
-                 self.error_box.append({
-                     'chunk': i,
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return compressed_data
-
-     async def run_gpt(self, content: str, instructions: str, seed: int) -> str:
-         """
-         Run GPT model inference
-         """
-         try:
-             response = await client.text_generation(
-                 content,
-                 max_new_tokens=config.CHUNK_SIZE,
-                 temperature=0.9,
-                 top_p=0.95,
-                 repetition_penalty=1.0,
-                 do_sample=True,
-                 seed=seed,
-                 stream=True
-             )
-
-             return "".join(r.token.text for r in response)
-
-         except Exception as e:
-             logger.error(f"GPT inference error: {str(e)}")
-             raise
-
- class WebInterface:
-     """Handles Gradio web interface"""
-
-     def __init__(self):
-         self.scraper = WebScraper()
-         self.doc_processor = DocumentProcessor()
-         self.data_processor = DataProcessor()
-
-     def build_interface(self):
-         """
-         Create Gradio interface
-         """
-         with gr.Blocks() as app:
-             gr.Markdown("# Document Processing and Web Scraping Tool")
-
-             with gr.Tab("Input"):
-                 text_input = gr.Textbox(label="Instructions")
-                 url_input = gr.Textbox(label="URL")
-                 file_input = gr.File(label="Upload Files")
-
-             with gr.Tab("Options"):
-                 depth_slider = gr.Slider(1, 5, value=2, label="Scraping Depth")
-                 compress_checkbox = gr.Checkbox(label="Compress Output")
-
-             with gr.Tab("Output"):
-                 output_text = gr.Textbox(label="Results")
-                 error_output = gr.JSON(label="Errors")
-
-             submit_btn = gr.Button("Process")
-             clear_btn = gr.Button("Clear")
-
-             submit_btn.click(
-                 fn=self.process_input,
-                 inputs=[text_input, url_input, file_input, depth_slider, compress_checkbox],
-                 outputs=[output_text, error_output]
-             )
-
-             clear_btn.click(
-                 fn=self.clear_output,
-                 inputs=[],
-                 outputs=[text_input, url_input, output_text, error_output]
-             )
-
-         return app
-
-     async def process_input(
-         self,
-         instructions: str,
-         url: str,
-         files: List[str],
-         depth: int,
-         compress: bool
-     ) -> Tuple[str, Dict]:
-         """
-         Process user input and return results
-         """
-         results = []
-         errors = []
-
-         # Process URL if provided
          if url:
-             success, web_data = await self.scraper.find_all(url, depth)
-             if success:
-                 results.extend(web_data)
-             errors.extend(self.scraper.error_box)
-
-         # Process files if provided
-         if files:
-             file_data = await self.doc_processor.process_files(files)
-             results.append(file_data)
-             errors.extend(self.doc_processor.error_box)
-
-         # Compress results if requested
-         if compress and results:
              try:
-                 compressed = await self.data_processor.compress_data(
-                     json.dumps(results),
-                     instructions
-                 )
-                 results = compressed
              except Exception as e:
-                 errors.append({
-                     'operation': 'compression',
-                     'error': str(e),
-                     'timestamp': datetime.datetime.now().isoformat()
-                 })
-
-         return json.dumps(results, indent=2), {'errors': errors}
-
-     def clear_output(self):
-         """
-         Clear interface outputs
-         """
-         return ["", "", "", None]
-
- def main():
-     """
-     Main application entry point
-     """
-     try:
-         interface = WebInterface()
-         app = interface.build_interface()
-         app.launch(
-             server_name="0.0.0.0",
-             server_port=7860,
-             share=True
-         )
      except Exception as e:
-         logger.error(f"Application startup failed: {str(e)}")
-         raise
-
- if __name__ == "__main__":
-     main()
+ import gradio as gr
  import os
+ import requests
  import uuid
+ from huggingface_hub import InferenceClient, HfApi
+ from pypdf import PdfReader
+ from bs4 import BeautifulSoup
+ import lxml
  import json
  import datetime
  import random
+ import zipfile
+
+ # Enable verbose logging
+ VERBOSE = True
+ def log(message):
+     if VERBOSE:
+         print(f"[LOG] {datetime.datetime.now()} - {message}")
+
+ # Hugging Face API Initialization
+ HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ HF_TOKEN = os.environ.get('HF_TOKEN')
+
+ if not HF_TOKEN:
+     raise EnvironmentError("HF_TOKEN is not set. Please export it as an environment variable.")
+
+ client = InferenceClient(HF_MODEL)
+ api = HfApi(token=HF_TOKEN)
+
+ REPO_NAME = "acecalisto3/tmp"
+ DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
+
+ log("Initialized Hugging Face client and API.")
+
+ # Constants
+ MAX_HISTORY = 100
+ MAX_DATA = 20000
+
+ # Utility Functions
+ def read_pdf(file_path):
+     log(f"Reading PDF: {file_path}")
+     try:
+         reader = PdfReader(file_path)
+         text = "\n".join(page.extract_text() for page in reader.pages)
+         log(f"Extracted text from {len(reader.pages)} pages.")
+         return text
+     except Exception as e:
+         log(f"Error reading PDF: {e}")
+         return str(e)
+
+ def fetch_url(url, max_depth=1):
+     log(f"Fetching URL: {url} with depth: {max_depth}")
+     visited = set()
+     to_visit = [(url, 0)]
+     results = []
+
+     while to_visit:
+         current_url, depth = to_visit.pop(0)
+         if depth < max_depth and current_url not in visited:
              try:
+                 response = requests.get(current_url)
+                 if response.status_code == 200:
+                     visited.add(current_url)
+                     soup = BeautifulSoup(response.content, 'lxml')
+                     results.append(soup.text)
+                     for link in soup.find_all("a", href=True):
+                         if link["href"].startswith("http"):
+                             to_visit.append((link["href"], depth + 1))
                  else:
+                     log(f"Failed to fetch {current_url} (status code: {response.status_code}).")
              except Exception as e:
+                 log(f"Error fetching {current_url}: {e}")
+     return results
+
+ def read_txt(txt_path):
+     log(f"Reading TXT file: {txt_path}")
+     try:
+         with open(txt_path, "r") as f:
+             content = f.read()
+         return content
+     except Exception as e:
+         log(f"Error reading TXT file: {e}")
+         return str(e)
+
+ def chunk_text(text, max_chunk_size):
+     log(f"Chunking text into max size: {max_chunk_size}")
+     chunks = []
+     while len(text) > max_chunk_size:
+         split_index = text.rfind(" ", 0, max_chunk_size)
+         split_index = split_index if split_index != -1 else max_chunk_size
+         chunks.append(text[:split_index])
+         text = text[split_index:]
+     if text:
+         chunks.append(text)
+     log(f"Chunked into {len(chunks)} parts.")
+     return chunks
+
+ def run_gpt(prompt, max_tokens=512, temperature=0.9):
+     log("Running GPT task...")
+     try:
+         response = client.text_generation(prompt, max_new_tokens=max_tokens, temperature=temperature)
+         log(f"Received GPT response of length {len(response)}.")
+         return response
+     except Exception as e:
+         log(f"Error during GPT interaction: {e}")
+         return str(e)
+
+ # Data Compression Logic
+ def compress_data(data, instructions, max_tokens=8192):
+     log("Compressing data...")
+     total_length = len(data)
+     chunks = chunk_text(data, MAX_DATA)
+     results = []
+
+     for chunk in chunks:
+         result = run_gpt(
+             prompt=instructions.format(history=chunk),
+             max_tokens=max_tokens,
+             temperature=0.9
+         )
+         results.append(result)
+
+     combined_result = "\n".join(results)
+     log("Data compression complete.")
+     return combined_result
+
+ def save_memory(task, history):
+     log("Saving memory to Hugging Face...")
+     try:
+         uid = str(uuid.uuid4())
+         timestamp = datetime.datetime.now().isoformat()
+         filename = f"memory-{uid}.json"
+
+         memory = {
+             "task": task,
+             "history": history,
+             "timestamp": timestamp
+         }
+         with open(filename, "w") as f:
+             json.dump(memory, f)
+
+         api.upload_file(
+             path_or_fileobj=filename,
+             path_in_repo=f"memories/{filename}",
+             repo_id=REPO_NAME,
+             repo_type="dataset",
+             token=HF_TOKEN
+         )
+         log("Memory saved successfully.")
+         return memory
+     except Exception as e:
+         log(f"Error saving memory: {e}")
+         return None
+
+ # Summarization Logic
+ def summarize(inp, history, report_check, data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
+     log("Starting summarization...")
+     output_data = ""
+     error_box = []
+     json_box = []
+
+     try:
+         if data:
+             log("Processing input text.")
+             output_data += data
+
+         if files:
+             for file in files:
+                 if file.name.endswith(".pdf"):
+                     output_data += f"\n{read_pdf(file.name)}"
+                 elif file.name.endswith(".txt"):
+                     output_data += f"\n{read_txt(file.name)}"
+
          if url:
+             log(f"Processing URL: {url}")
+             output_data += "\n".join(fetch_url(url))
+
+         if pdf_url:
+             log(f"Processing PDF URL: {pdf_url}")
              try:
+                 response = requests.get(pdf_url)
+                 if response.status_code == 200:
+                     with open("temp.pdf", "wb") as f:
+                         f.write(response.content)
+                     output_data += read_pdf("temp.pdf")
+                     os.remove("temp.pdf")
              except Exception as e:
+                 log(f"Error fetching PDF from URL: {e}")
+                 error_box.append(f"PDF Error: {e}")
+
+         compressed = compress_data(output_data, instructions=inp, max_tokens=8192)
+         log("Summarization complete.")
+         return compressed
      except Exception as e:
+         log(f"Error during summarization: {e}")
+         error_box.append(f"Summarization Error: {e}")
+         return None
+
200
+ with gr.Blocks() as app:
201
+ gr.HTML("<center><h1>Mixtral 8x7B Summarizer</h1><p>Summarize unlimited-length data</p></center>")
202
+
203
+ chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)
204
+ prompt = gr.Textbox(label="Instructions", placeholder="Summarization instructions (optional)")
205
+ data = gr.Textbox(label="Input Data", lines=6, placeholder="Enter text or upload a file.")
206
+ files = gr.Files(label="Upload Files (.pdf, .txt)", file_types=[".pdf", ".txt"])
207
+ url = gr.Textbox(label="URL")
208
+ pdf_url = gr.Textbox(label="PDF URL")
209
+ json_out = gr.JSON(label="Output JSON")
210
+ error_box = gr.Textbox(label="Error Box", interactive=False)
211
+ button = gr.Button("Process")
212
+
213
+ def process_summarization(inp, history, data, files, url, pdf_url):
214
+ return summarize(inp, history, report_check=True, data=data, files=files, url=url, pdf_url=pdf_url)
215
+
216
+ button.click(
217
+ process_summarization,
218
+ inputs=[prompt, chatbot, data, files, url, pdf_url],
219
+ outputs=[json_out, error_box]
220
+ )
221
+
222
+ app.launch()
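
For reviewers who want to sanity-check the word-boundary chunking that the new compress_data relies on, here is a minimal standalone sketch of the splitting rule used by chunk_text. It is reproduced inline (not imported from app.py, since the new module launches the Gradio server and requires HF_TOKEN at import time), and the example input is hypothetical.

    def chunk_text(text, max_chunk_size):
        # Split at the last space before the limit so words are not cut in half.
        chunks = []
        while len(text) > max_chunk_size:
            split_index = text.rfind(" ", 0, max_chunk_size)
            split_index = split_index if split_index != -1 else max_chunk_size
            chunks.append(text[:split_index])
            text = text[split_index:]
        if text:
            chunks.append(text)
        return chunks

    # Example: every chunk stays at or under the limit.
    print([len(c) for c in chunk_text("lorem ipsum dolor " * 300, 1000)])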