Spaces:
Running
Running
Arjun Moorthy
committed on
Commit
·
2105147
1
Parent(s):
2720b05
Enable PDF processing in RAG system
Browse files- Oncolife/app.py +16 -1
Oncolife/app.py
CHANGED
|
@@ -129,7 +129,22 @@ class OncoLifeAssistant:
|
|
| 129 |
|
| 130 |
documents_loaded = 0
|
| 131 |
|
| 132 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
for json_file in docs_path.glob("*.json"):
|
| 134 |
try:
|
| 135 |
print(f"π Processing JSON: {json_file.name}")
|
|
|
|
| 129 |
|
| 130 |
documents_loaded = 0
|
| 131 |
|
| 132 |
+
# Process PDF files (essential medical guidelines)
|
| 133 |
+
for pdf_file in docs_path.glob("*.pdf"):
|
| 134 |
+
try:
|
| 135 |
+
print(f"π Processing PDF: {pdf_file.name}")
|
| 136 |
+
text = self._extract_pdf_text(pdf_file)
|
| 137 |
+
if text:
|
| 138 |
+
chunks = text_splitter.split_text(text)
|
| 139 |
+
self._add_chunks_to_db(chunks, pdf_file.name)
|
| 140 |
+
documents_loaded += 1
|
| 141 |
+
print(f"✅ Added {len(chunks)} chunks from {pdf_file.name}")
|
| 142 |
+
else:
|
| 143 |
+
print(f"⚠️ No text extracted from {pdf_file.name}")
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f"❌ Error processing {pdf_file.name}: {e}")
|
| 146 |
+
|
| 147 |
+
# Process JSON files (lightweight)
|
| 148 |
for json_file in docs_path.glob("*.json"):
|
| 149 |
try:
|
| 150 |
print(f"π Processing JSON: {json_file.name}")
|