Spaces:
Sleeping
Sleeping
Update retrival.py
Browse files- retrival.py +8 -3
retrival.py
CHANGED
|
@@ -136,6 +136,7 @@ def load_document(data_path):
|
|
| 136 |
|
| 137 |
# Loop over tables and match text from the same document and page
|
| 138 |
|
|
|
|
| 139 |
for doc in processed_documents:
|
| 140 |
cnt=1 # count for storing number of the table
|
| 141 |
for table_metadata in doc.get("tables", {}).get("Metadata", []):
|
|
@@ -180,6 +181,7 @@ def load_document(data_path):
|
|
| 180 |
}
|
| 181 |
)
|
| 182 |
)
|
|
|
|
| 183 |
|
| 184 |
# Initialize a structure to group content by doc_id
|
| 185 |
grouped_by_doc_id = defaultdict(lambda: {
|
|
@@ -224,6 +226,7 @@ def load_document(data_path):
|
|
| 224 |
|
| 225 |
|
| 226 |
#Directory loader for loading only the text data into a specific db
|
|
|
|
| 227 |
loader = DirectoryLoader(data_path, glob="*.*")
|
| 228 |
documents = loader.load()
|
| 229 |
|
|
@@ -234,8 +237,9 @@ def load_document(data_path):
|
|
| 234 |
path=doc.metadata.get("source")
|
| 235 |
match = re.search(r'([^\\]+\.[^\\]+)$', path)
|
| 236 |
doc.metadata.update({"filename":match.group(1)})
|
| 237 |
-
|
| 238 |
-
|
|
|
|
| 239 |
#documents,processed_documents,table_document = load_document(data_path)
|
| 240 |
|
| 241 |
|
|
@@ -395,7 +399,8 @@ def generate_data_store(file_path, db_name):
|
|
| 395 |
print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
|
| 396 |
|
| 397 |
try:
|
| 398 |
-
documents,grouped_documents = load_document(file_path)
|
|
|
|
| 399 |
print("Documents loaded successfully.")
|
| 400 |
except Exception as e:
|
| 401 |
print(f"Error loading documents: {e}")
|
|
|
|
| 136 |
|
| 137 |
# Loop over tables and match text from the same document and page
|
| 138 |
|
| 139 |
+
'''
|
| 140 |
for doc in processed_documents:
|
| 141 |
cnt=1 # count for storing number of the table
|
| 142 |
for table_metadata in doc.get("tables", {}).get("Metadata", []):
|
|
|
|
| 181 |
}
|
| 182 |
)
|
| 183 |
)
|
| 184 |
+
'''
|
| 185 |
|
| 186 |
# Initialize a structure to group content by doc_id
|
| 187 |
grouped_by_doc_id = defaultdict(lambda: {
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
#Directory loader for loading only the text data into a specific db
|
| 229 |
+
'''
|
| 230 |
loader = DirectoryLoader(data_path, glob="*.*")
|
| 231 |
documents = loader.load()
|
| 232 |
|
|
|
|
| 237 |
path=doc.metadata.get("source")
|
| 238 |
match = re.search(r'([^\\]+\.[^\\]+)$', path)
|
| 239 |
doc.metadata.update({"filename":match.group(1)})
|
| 240 |
+
return documents,
|
| 241 |
+
'''
|
| 242 |
+
return grouped_documents
|
| 243 |
#documents,processed_documents,table_document = load_document(data_path)
|
| 244 |
|
| 245 |
|
|
|
|
| 399 |
print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
|
| 400 |
|
| 401 |
try:
|
| 402 |
+
#documents,grouped_documents = load_document(file_path)
|
| 403 |
+
grouped_documents = load_document(file_path)
|
| 404 |
print("Documents loaded successfully.")
|
| 405 |
except Exception as e:
|
| 406 |
print(f"Error loading documents: {e}")
|