Spaces:
Build error
Build error
XThomasBU
committed on
Commit
·
4308a1a
1
Parent(s):
e17a5d0
remove hard coded values
Browse files
code/modules/config/{user_config.yml → project_config.yml}
RENAMED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
retriever:
|
| 2 |
retriever_hf_paths:
|
| 3 |
RAGatouille: "XThomasBU/Colbert_Index"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
retriever:
|
| 2 |
retriever_hf_paths:
|
| 3 |
RAGatouille: "XThomasBU/Colbert_Index"
|
| 4 |
+
|
| 5 |
+
metadata:
|
| 6 |
+
metada_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
|
| 7 |
+
slide_base_link: "https://dl4ds.github.io"
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -222,8 +222,7 @@ class ChunkProcessor:
|
|
| 222 |
|
| 223 |
def chunk_docs(self, file_reader, uploaded_files, weblinks):
|
| 224 |
addl_metadata = get_metadata(
|
| 225 |
-
|
| 226 |
-
"https://dl4ds.github.io/sp2024/schedule/",
|
| 227 |
) # For any additional metadata
|
| 228 |
|
| 229 |
# remove already processed files if reparse_files is False
|
|
@@ -426,6 +425,12 @@ if __name__ == "__main__":
|
|
| 426 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 427 |
config = yaml.safe_load(f)
|
| 428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
| 430 |
uploaded_files = [
|
| 431 |
os.path.join(STORAGE_DIR, file)
|
|
@@ -434,6 +439,7 @@ if __name__ == "__main__":
|
|
| 434 |
]
|
| 435 |
|
| 436 |
data_loader = DataLoader(config, logger=logger)
|
|
|
|
| 437 |
document_chunks, document_names, documents, document_metadata = (
|
| 438 |
data_loader.get_chunks(
|
| 439 |
[
|
|
|
|
| 222 |
|
| 223 |
def chunk_docs(self, file_reader, uploaded_files, weblinks):
|
| 224 |
addl_metadata = get_metadata(
|
| 225 |
+
*self.config["metadata"]["metada_links"], self.config
|
|
|
|
| 226 |
) # For any additional metadata
|
| 227 |
|
| 228 |
# remove already processed files if reparse_files is False
|
|
|
|
| 425 |
with open("../code/modules/config/config.yml", "r") as f:
|
| 426 |
config = yaml.safe_load(f)
|
| 427 |
|
| 428 |
+
with open("../code/modules/config/project_config.yml", "r") as f:
|
| 429 |
+
project_config = yaml.safe_load(f)
|
| 430 |
+
|
| 431 |
+
# Combine project config with the main config
|
| 432 |
+
config.update(project_config)
|
| 433 |
+
|
| 434 |
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
| 435 |
uploaded_files = [
|
| 436 |
os.path.join(STORAGE_DIR, file)
|
|
|
|
| 439 |
]
|
| 440 |
|
| 441 |
data_loader = DataLoader(config, logger=logger)
|
| 442 |
+
# Just for testing
|
| 443 |
document_chunks, document_names, documents, document_metadata = (
|
| 444 |
data_loader.get_chunks(
|
| 445 |
[
|
code/modules/dataloader/helpers.py
CHANGED
|
@@ -21,7 +21,8 @@ def get_base_url(url):
|
|
| 21 |
return base_url
|
| 22 |
|
| 23 |
|
| 24 |
-
|
|
|
|
| 25 |
"""
|
| 26 |
Function to get the lecture metadata from the lectures and schedule URLs.
|
| 27 |
"""
|
|
@@ -50,7 +51,9 @@ def get_metadata(lectures_url, schedule_url):
|
|
| 50 |
slides_link_tag = description_div.find("a", title="Download slides")
|
| 51 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 52 |
slides_link = (
|
| 53 |
-
f"
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
if slides_link:
|
| 56 |
date_mapping[slides_link] = date
|
|
@@ -70,7 +73,9 @@ def get_metadata(lectures_url, schedule_url):
|
|
| 70 |
slides_link_tag = block.find("a", title="Download slides")
|
| 71 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 72 |
slides_link = (
|
| 73 |
-
f"
|
|
|
|
|
|
|
| 74 |
)
|
| 75 |
|
| 76 |
# Extract the link to the lecture recording
|
|
|
|
| 21 |
return base_url
|
| 22 |
|
| 23 |
|
| 24 |
+
### THIS FUNCTION IS NOT GENERALIZABLE; IT IS SPECIFIC TO THE COURSE WEBSITE ###
|
| 25 |
+
def get_metadata(lectures_url, schedule_url, config):
|
| 26 |
"""
|
| 27 |
Function to get the lecture metadata from the lectures and schedule URLs.
|
| 28 |
"""
|
|
|
|
| 51 |
slides_link_tag = description_div.find("a", title="Download slides")
|
| 52 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 53 |
slides_link = (
|
| 54 |
+
f"{config['metadata']['slide_base_link']}{slides_link}"
|
| 55 |
+
if slides_link
|
| 56 |
+
else None
|
| 57 |
)
|
| 58 |
if slides_link:
|
| 59 |
date_mapping[slides_link] = date
|
|
|
|
| 73 |
slides_link_tag = block.find("a", title="Download slides")
|
| 74 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 75 |
slides_link = (
|
| 76 |
+
f"{config['metadata']['slide_base_link']}{slides_link}"
|
| 77 |
+
if slides_link
|
| 78 |
+
else None
|
| 79 |
)
|
| 80 |
|
| 81 |
# Extract the link to the lecture recording
|
code/modules/vectorstore/store_manager.py
CHANGED
|
@@ -168,19 +168,21 @@ if __name__ == "__main__":
|
|
| 168 |
|
| 169 |
with open("modules/config/config.yml", "r") as f:
|
| 170 |
config = yaml.safe_load(f)
|
| 171 |
-
with open("modules/config/
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
| 173 |
print(config)
|
| 174 |
-
print(user_config)
|
| 175 |
print(f"Trying to create database with config: {config}")
|
| 176 |
vector_db = VectorStoreManager(config)
|
| 177 |
if config["vectorstore"]["load_from_HF"]:
|
| 178 |
if (
|
| 179 |
config["vectorstore"]["db_option"]
|
| 180 |
-
in
|
| 181 |
):
|
| 182 |
vector_db.load_from_HF(
|
| 183 |
-
HF_PATH=
|
| 184 |
config["vectorstore"]["db_option"]
|
| 185 |
]
|
| 186 |
)
|
|
|
|
| 168 |
|
| 169 |
with open("modules/config/config.yml", "r") as f:
|
| 170 |
config = yaml.safe_load(f)
|
| 171 |
+
with open("modules/config/project_config.yml", "r") as f:
|
| 172 |
+
project_config = yaml.safe_load(f)
|
| 173 |
+
|
| 174 |
+
# combine the two configs
|
| 175 |
+
config.update(project_config)
|
| 176 |
print(config)
|
|
|
|
| 177 |
print(f"Trying to create database with config: {config}")
|
| 178 |
vector_db = VectorStoreManager(config)
|
| 179 |
if config["vectorstore"]["load_from_HF"]:
|
| 180 |
if (
|
| 181 |
config["vectorstore"]["db_option"]
|
| 182 |
+
in config["retriever"]["retriever_hf_paths"]
|
| 183 |
):
|
| 184 |
vector_db.load_from_HF(
|
| 185 |
+
HF_PATH=config["retriever"]["retriever_hf_paths"][
|
| 186 |
config["vectorstore"]["db_option"]
|
| 187 |
]
|
| 188 |
)
|
docs/setup.md
CHANGED
|
@@ -124,4 +124,4 @@ CHAINLIT_URL=<your_chainlit_url>
|
|
| 124 |
# Configuration
|
| 125 |
|
| 126 |
The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
|
| 127 |
-
The configuration file `code/modules/
|
|
|
|
| 124 |
# Configuration
|
| 125 |
|
| 126 |
The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
|
| 127 |
+
The configuration file `code/modules/config/project_config.yml` contains project-specific parameters.
|