Spaces:
Runtime error
Runtime error
| import json | |
| import logging | |
def process_json_files(start, end, base_path="texts"):
    """
    Loads numbered JSON book files and returns their title and text.

    Files are looked up as ``<base_path>/<NN>.json`` where ``NN`` is the book
    ID zero-padded to two digits. Problems with an individual file are logged
    as warnings and that book is skipped; they never abort the whole run.

    Args:
        start: The starting book ID (inclusive).
        end: The ending book ID (inclusive).
        base_path: Directory containing the JSON files. Defaults to "texts",
            the location that was previously hard-coded.

    Returns:
        A dictionary where keys are book IDs and values are dictionaries
        containing 'title' (default "No title") and 'text' (default []).
        Missing, unreadable, empty or non-object files are left out.
    """
    results = {}
    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"

        # Keep the try body limited to the operations that can actually fail.
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            logging.warning("File %s not found.", file_name)
            continue
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # Invalid UTF-8 surfaces as UnicodeDecodeError, not JSONDecodeError.
            logging.warning("File %s could not be read as JSON: %s", file_name, e)
            continue

        # Empty documents ({} / [] / null / "") carry nothing to index.
        if not data:
            continue

        # Valid JSON whose top level is not an object has no .get(); skip it
        # with a warning instead of crashing on AttributeError.
        if not isinstance(data, dict):
            logging.warning(
                "File %s does not contain a JSON object (found %s).",
                file_name,
                type(data).__name__,
            )
            continue

        results[i] = {
            "title": data.get("title", "No title"),
            "text": data.get("text", []),
        }
    return results
def flatten_text_with_line_breaks(text):
    """
    Collapses an arbitrarily nested list into one flat list of strings.

    Nested lists are expanded depth-first in their original order. Strings are
    kept verbatim (so any line breaks inside them survive); every other value
    is converted with ``str()``.
    """
    leaves = []
    for element in text:
        if isinstance(element, list):
            # Recurse into sub-lists and splice their leaves in place.
            leaves += flatten_text_with_line_breaks(element)
            continue
        leaves.append(element if isinstance(element, str) else str(element))
    return leaves
def calculate_tanach_statistics(tanach_data):
    """
    Calculates statistics for the Tanach corpus.

    NOTE(review): the implementation is elided in this copy of the file (only
    the placeholder comment below is present), so which statistics are
    computed and what is returned cannot be documented from here. Presumably
    `tanach_data` is the mapping produced by `process_json_files` -- confirm
    against the full source.
    """
    # ... (rest of the function remains the same)
def build_word_index(tanach_data):
    """
    Builds an index that spreads the corpus evenly over the seconds of a day.

    Every leaf string obtained by flattening a chapter counts as one entry.
    Entries are mapped linearly onto the seconds 0..86399 so that the first
    entry lands on second 0 and the last entry lands exactly on the last
    second of the day (86399).

    Args:
        tanach_data: Mapping of book ID to a dictionary with a "text" list of
            chapters (each chapter a possibly nested list of strings).

    Returns:
        A dictionary keyed by second of the day. Each value holds "book_id",
        "chapter_id" (0-based chapter position) and "verse_id" (1-based
        position inside the flattened chapter). If there are more entries
        than seconds, entries sharing a second overwrite one another and the
        latest one wins. An empty corpus yields an empty dictionary.
    """
    total_seconds = 24 * 60 * 60  # Total seconds in a day
    last_second = total_seconds - 1

    # Flatten every chapter exactly once and remember where each entry lives.
    entries = []
    for book_id, book in tanach_data.items():
        for chapter_index, chapter in enumerate(book["text"]):
            flattened_chapter = flatten_text_with_line_breaks(chapter)
            for verse_index in range(len(flattened_chapter)):
                entries.append((book_id, chapter_index, verse_index + 1))

    total_words = len(entries)
    if total_words == 0:
        return {}

    # Scale over (total_words - 1) intervals so the final entry hits
    # last_second exactly; max() guards the single-entry case.
    intervals = max(total_words - 1, 1)

    word_index = {}
    for position, (book_id, chapter_index, verse_id) in enumerate(entries):
        # Integer arithmetic avoids float rounding drift on the key.
        target_second = position * last_second // intervals
        word_index[target_second] = {
            "book_id": book_id,
            "chapter_id": chapter_index,
            "verse_id": verse_id,
        }
    return word_index