Spaces:
Runtime error
Runtime error
Carlos Salgado
committed on
Commit
·
bbe64b5
1
Parent(s):
b10d0e6
update flake, fix ingest streamlit compatibility bug
Browse files
- backend/generate_metadata.py +9 -8
- flake.nix +11 -3
backend/generate_metadata.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import argparse
|
| 3 |
import json
|
| 4 |
import openai
|
|
@@ -12,13 +13,13 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
if
|
| 19 |
-
loader = UnstructuredPDFLoader(
|
| 20 |
-
elif
|
| 21 |
-
loader = TextLoader(
|
| 22 |
else:
|
| 23 |
raise NotImplementedError('Only .txt or .pdf files are supported')
|
| 24 |
|
|
@@ -29,7 +30,7 @@ def ingest(file_path):
|
|
| 29 |
"\n\n",
|
| 30 |
"\n",
|
| 31 |
" ",
|
| 32 |
-
",",
|
| 33 |
"\uff0c", # Fullwidth comma
|
| 34 |
"\u3001", # Ideographic comma
|
| 35 |
"\uff0e", # Fullwidth full stop
|
|
|
|
| 1 |
import os
|
| 2 |
+
import io
|
| 3 |
import argparse
|
| 4 |
import json
|
| 5 |
import openai
|
|
|
|
| 13 |
load_dotenv()
|
| 14 |
|
| 15 |
|
| 16 |
+
import io
|
| 17 |
+
|
| 18 |
+
def ingest(file_obj, file_ext='pdf'):
|
| 19 |
+
if file_ext == 'pdf':
|
| 20 |
+
loader = UnstructuredPDFLoader(file_obj)
|
| 21 |
+
elif file_ext == 'txt':
|
| 22 |
+
loader = TextLoader(file_obj)
|
| 23 |
else:
|
| 24 |
raise NotImplementedError('Only .txt or .pdf files are supported')
|
| 25 |
|
|
|
|
| 30 |
"\n\n",
|
| 31 |
"\n",
|
| 32 |
" ",
|
| 33 |
+
",",
|
| 34 |
"\uff0c", # Fullwidth comma
|
| 35 |
"\u3001", # Ideographic comma
|
| 36 |
"\uff0e", # Fullwidth full stop
|
flake.nix
CHANGED
|
@@ -14,6 +14,9 @@
|
|
| 14 |
devShells.${system}.default = pkgs.mkShell {
|
| 15 |
packages = [
|
| 16 |
(pkgs.python311.withPackages (python-pkgs: [
|
|
|
|
|
|
|
|
|
|
| 17 |
python-pkgs.numpy
|
| 18 |
python-pkgs.pandas
|
| 19 |
python-pkgs.scipy
|
|
@@ -23,15 +26,20 @@
|
|
| 23 |
python-pkgs.langchain
|
| 24 |
python-pkgs.langchain-text-splitters
|
| 25 |
python-pkgs.unstructured
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
python-pkgs.openai
|
| 27 |
python-pkgs.pydantic
|
| 28 |
python-pkgs.python-dotenv
|
| 29 |
python-pkgs.configargparse
|
| 30 |
python-pkgs.streamlit
|
| 31 |
-
python-pkgs.pip
|
| 32 |
python-pkgs.lark
|
| 33 |
-
python-pkgs.jupyter
|
| 34 |
-
python-pkgs.notebook
|
| 35 |
python-pkgs.sentence-transformers
|
| 36 |
pkgs.unstructured-api
|
| 37 |
]))
|
|
|
|
| 14 |
devShells.${system}.default = pkgs.mkShell {
|
| 15 |
packages = [
|
| 16 |
(pkgs.python311.withPackages (python-pkgs: [
|
| 17 |
+
python-pkgs.pip # VsCode starts
|
| 18 |
+
python-pkgs.jupyter
|
| 19 |
+
python-pkgs.notebook # VsCode ends
|
| 20 |
python-pkgs.numpy
|
| 21 |
python-pkgs.pandas
|
| 22 |
python-pkgs.scipy
|
|
|
|
| 26 |
python-pkgs.langchain
|
| 27 |
python-pkgs.langchain-text-splitters
|
| 28 |
python-pkgs.unstructured
|
| 29 |
+
python-pkgs.wrapt # unstructured[local-inference] starts
|
| 30 |
+
python-pkgs.iso-639
|
| 31 |
+
python-pkgs.emoji
|
| 32 |
+
python-pkgs.pillow-heif
|
| 33 |
+
python-pkgs.magic
|
| 34 |
+
python-pkgs.poppler-qt5
|
| 35 |
+
python-pkgs.pytesseract
|
| 36 |
+
python-pkgs.langdetect # unstructured[local-inference] ends
|
| 37 |
python-pkgs.openai
|
| 38 |
python-pkgs.pydantic
|
| 39 |
python-pkgs.python-dotenv
|
| 40 |
python-pkgs.configargparse
|
| 41 |
python-pkgs.streamlit
|
|
|
|
| 42 |
python-pkgs.lark
|
|
|
|
|
|
|
| 43 |
python-pkgs.sentence-transformers
|
| 44 |
pkgs.unstructured-api
|
| 45 |
]))
|