Ndux

Runtime error

App Files Files Community

acecalisto3 committed on Dec 30, 2024

Commit

ad1c93f

verified ·

1 Parent(s): be90db0

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -31

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ import datetime
 import zipfile
 import nltk.data
 import nltk
 # Ensure the 'punkt' tokenizer is downloaded only if missing
 try:
@@ -43,6 +45,36 @@ DATASET_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
 MAX_TOKENS = 8192
 # Utility Functions
 def read_pdf(file_path):
     try:
         reader = PdfReader(file_path)
@@ -52,6 +84,35 @@ def read_pdf(file_path):
         log(f"Error reading PDF {file_path}: {e}")
         return ""
 def fetch_google_doc(url):
     if "docs.google.com/document/d/" in url:
         # Extract document ID
@@ -79,8 +140,31 @@ def fetch_url(url, max_depth):
             continue
         if depth < max_depth:
             visited.add(current_url)
-            # Check if it's a Google Doc
-            if "docs.google.com/document/d/" in current_url:
                 doc_content = fetch_google_doc(current_url)
                 if doc_content:
                     results.append(doc_content)
@@ -101,35 +185,6 @@ def fetch_url(url, max_depth):
                     errors.append(f"Error fetching {current_url}: {e}")
     return "\n".join(results), "\n".join(errors)
-def read_txt(txt_path):
-    try:
-        with open(txt_path, "r", encoding="utf-8") as f:
-            return f.read()
-    except Exception as e:
-        log(f"Error reading TXT file {txt_path}: {e}")
-        return ""
-def read_zip(zip_path):
-    try:
-        extracted_data = []
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            for file_info in zip_ref.infolist():
-                if file_info.filename.endswith((".txt", ".pdf")):
-                    with zip_ref.open(file_info) as file:
-                        content = file.read()
-                        if file_info.filename.endswith(".txt"):
-                            extracted_data.append(content.decode("utf-8"))
-                        elif file_info.filename.endswith(".pdf"):
-                            temp_path = f"/tmp/{uuid.uuid4()}"
-                            with open(temp_path, "wb") as temp_file:
-                                temp_file.write(content)
-                            extracted_data.append(read_pdf(temp_path))
-                            os.remove(temp_path)
-        return "\n".join(extracted_data)
-    except Exception as e:
-        log(f"Error reading ZIP file {zip_path}: {e}")
-        return ""
 def process_file(file):
     try:
         if file.name.endswith(".pdf"):

 import zipfile
 import nltk.data
 import nltk
+import tempfile
+import shutil
 # Ensure the 'punkt' tokenizer is downloaded only if missing
 try:
 MAX_TOKENS = 8192
 # Utility Functions
+def get_file_id_from_google_drive_url(url):
+    """Extract the file ID from a Google Drive ``/file/d/<id>`` URL.
+
+    Args:
+        url: URL string to inspect.
+
+    Returns:
+        The ID segment that follows ``/file/d/``, or ``None`` when the URL
+        does not mention ``drive.google.com``, has no ``/file/d/`` segment,
+        or the ID segment is empty.
+    """
+    if "drive.google.com" not in url:
+        return None
+    _, marker, tail = url.partition("/file/d/")
+    if not marker:
+        return None
+    # The ID ends at the next path separator, query string or fragment.
+    file_id = tail
+    for delimiter in "/?#":
+        file_id = file_id.partition(delimiter)[0]
+    # An empty ID is reported as None, like every other "no ID" case.
+    return file_id or None
+def download_google_drive_file(file_id):
+    """Download a Google Drive file into a newly created temporary directory.
+
+    Args:
+        file_id: Drive file ID to request from ``drive.google.com/uc``.
+
+    Returns:
+        ``(file_path, temp_dir)`` on success; the caller is responsible for
+        removing ``temp_dir``. ``(None, None)`` on any failure, in which case
+        the error is logged and no temporary directory is left behind.
+    """
+    # Local import: stdlib header parser used only here.
+    from email.message import Message
+
+    download_url = f"https://drive.google.com/uc?id={file_id}"
+    temp_dir = None
+    try:
+        # The timeout keeps a stalled connection from hanging the caller.
+        with requests.get(download_url, stream=True, timeout=30) as response:
+            response.raise_for_status()
+            filename = None
+            content_disposition = response.headers.get('Content-Disposition')
+            if content_disposition:
+                # Parse the header properly so quoted values, extra
+                # parameters and RFC 2231 ``filename*=`` forms are handled.
+                header = Message()
+                header['Content-Disposition'] = content_disposition
+                filename = header.get_filename()
+            # The name is server-supplied: keep only its last path component
+            # so it cannot escape the temporary directory.
+            filename = os.path.basename((filename or "").replace("\\", "/"))
+            if filename in ("", ".", ".."):
+                filename = f"file_{uuid.uuid4()}"
+            temp_dir = tempfile.mkdtemp()
+            file_path = os.path.join(temp_dir, filename)
+            with open(file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        return file_path, temp_dir
+    except Exception as e:
+        log(f"Error downloading Google Drive file {file_id}: {e}")
+        # Do not leak a partially written download.
+        if temp_dir is not None:
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        return None, None
 def read_pdf(file_path):
     try:
         reader = PdfReader(file_path)
         log(f"Error reading PDF {file_path}: {e}")
         return ""
+def read_txt(txt_path):
+    """Return the UTF-8 decoded contents of ``txt_path``.
+
+    Any error while opening or reading is logged and an empty string is
+    returned instead.
+    """
+    text = ""
+    try:
+        with open(txt_path, mode="r", encoding="utf-8") as source:
+            text = source.read()
+    except Exception as e:
+        log(f"Error reading TXT file {txt_path}: {e}")
+    return text
+def read_zip(zip_path):
+    """Concatenate the text of every ``.txt`` and ``.pdf`` member of a ZIP.
+
+    Args:
+        zip_path: Path of the archive to read.
+
+    Returns:
+        The members' text joined with newlines, in archive order. Returns
+        ``""`` (after logging) if the archive or any member cannot be read.
+    """
+    try:
+        extracted_data = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            for file_info in zip_ref.infolist():
+                # Match extensions case-insensitively (e.g. "REPORT.PDF").
+                member_name = file_info.filename.lower()
+                if member_name.endswith(".txt"):
+                    extracted_data.append(zip_ref.read(file_info).decode("utf-8"))
+                elif member_name.endswith(".pdf"):
+                    # read_pdf takes a path, so spill the member to disk.
+                    # Only the base name is used: member names may contain
+                    # sub-directories or "../", which must not be joined
+                    # into the temporary path. The directory is removed
+                    # automatically when the with-block exits.
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        temp_path = os.path.join(
+                            temp_dir, os.path.basename(file_info.filename)
+                        )
+                        with open(temp_path, "wb") as temp_file:
+                            temp_file.write(zip_ref.read(file_info))
+                        extracted_data.append(read_pdf(temp_path))
+        return "\n".join(extracted_data)
+    except Exception as e:
+        log(f"Error reading ZIP file {zip_path}: {e}")
+        return ""
 def fetch_google_doc(url):
     if "docs.google.com/document/d/" in url:
         # Extract document ID
             continue
         if depth < max_depth:
             visited.add(current_url)
+            # Check if it's a Google Drive file URL
+            if "drive.google.com/file/d/" in current_url:
+                file_id = get_file_id_from_google_drive_url(current_url)
+                if file_id:
+                    file_path, temp_dir = download_google_drive_file(file_id)
+                    if file_path:
+                        file_ext = os.path.splitext(file_path)[1].lower()
+                        if file_ext == ".pdf":
+                            pdf_text = read_pdf(file_path)
+                            results.append(pdf_text)
+                        elif file_ext == ".txt":
+                            txt_content = read_txt(file_path)
+                            results.append(txt_content)
+                        elif file_ext == ".zip":
+                            zip_content = read_zip(file_path)
+                            results.append(zip_content)
+                        else:
+                            errors.append(f"Unsupported file type for URL: {current_url}")
+                        shutil.rmtree(temp_dir)
+                    else:
+                        errors.append(f"Failed to download file from URL: {current_url}")
+                else:
+                    errors.append(f"Invalid Google Drive URL: {current_url}")
+            # Check if it's a Google Doc URL
+            elif "docs.google.com/document/d/" in current_url:
                 doc_content = fetch_google_doc(current_url)
                 if doc_content:
                     results.append(doc_content)
                     errors.append(f"Error fetching {current_url}: {e}")
     return "\n".join(results), "\n".join(errors)
 def process_file(file):
     try:
         if file.name.endswith(".pdf"):