Spaces:
Running
on
Zero
Running
on
Zero
| from calendar import month_name | |
| from retriever import BuildRetriever, db_dir | |
| import json | |
| import os | |
| import re | |
def get_collection(compute_mode):
    """
    Fetch the contents of the vectorstore behind the dense retriever.

    A retriever is built for the given compute_mode in "dense" search mode and
    the result of its vectorstore's get() call is returned.

    Usage Examples:
        # Count the child documents
        collection = get_collection("remote")
        len(collection["ids"])
        # Count the parent documents (distinct doc_ids)
        len(set([m["doc_id"] for m in collection["metadatas"]]))
    """
    return BuildRetriever(compute_mode, "dense").vectorstore.get()
def get_sources():
    """
    List the source file recorded for every entry in the BM25 corpus,
    e.g. 'R-help/2024-April.txt'.

    One item is returned per line of the corpus file, so repeated sources are kept.
    """
    # The corpus is stored as JSON Lines under the database directory
    corpus_path = os.path.join(db_dir, "bm25", "corpus.jsonl")
    found = []
    with open(corpus_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Every line is a standalone JSON object
            record = json.loads(raw_line.strip())
            found.append(record["metadata"]["source"])
    return found
def get_start_end_months(sources):
    """
    Given filenames like 'R-help/2024-January.txt', return the earliest and
    latest month in 'Month YYYY' format.

    Args:
        sources: Iterable of source filenames; duplicates are allowed.

    Returns:
        A (start, end) tuple of strings such as ('December 2023', 'April 2024'),
        or (None, None) when no filename yields a recognizable year and month.
    """
    pattern = re.compile(r"R-help/(\d{4})-([A-Za-z]+)\.txt")
    # Build the name -> number lookup once rather than rebuilding and scanning
    # a list for every source. month_name[0] is an empty placeholder, so skip it.
    month_numbers = {name: num for num, name in enumerate(month_name) if name}
    months = []
    # Duplicates contribute nothing to a min/max search
    for src in set(sources):
        m = pattern.match(src)
        if not m:
            continue
        month_str = m.group(2)
        month_num = month_numbers.get(month_str)
        if month_num is None:
            # Not a month name known to the calendar module
            continue
        months.append((int(m.group(1)), month_num, month_str))
    if not months:
        return None, None
    # Tuples compare by year first, then month number, i.e. chronologically
    start = min(months)
    end = max(months)
    return f"{start[2]} {start[0]}", f"{end[2]} {end[0]}"