Spaces:
Sleeping
Sleeping
| import xml.etree.ElementTree as ET | |
| import glob | |
| import os | |
| import logging | |
def _parse_book(file_name):
    """Parse one book XML file and return ``(title, chapters)``.

    ``chapters`` is a list with one entry per ``<c>`` element found anywhere
    under the root; each entry is a list with one string per direct ``<v>``
    child, built by joining the text of the verse's direct ``<w>`` children
    with single spaces.

    ``title`` is the text of the first ``names/name`` element, or the file's
    base name when that element is missing or has no text.

    Raises:
        FileNotFoundError: if ``file_name`` does not exist.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(file_name).getroot()

    chapters = []
    for chapter in root.findall('.//c'):
        verses = []
        for verse in chapter.findall('./v'):
            # itertext() also collects text nested inside child elements of
            # <w>, so markup inside a word does not drop characters.
            words = ["".join(word.itertext()) for word in verse.findall('./w')]
            verses.append(" ".join(words).strip())
        chapters.append(verses)

    name_node = root.find('.//names/name')
    if name_node is not None and name_node.text:
        title = name_node.text
    else:
        # Missing or empty <name>: fall back to the file name so the title is
        # never None.
        title = os.path.basename(file_name)
    return title, chapters


def process_json_files(start: int, end: int) -> dict:
    """Load the books numbered ``start``..``end`` (inclusive) from XML files.

    Despite its name, this function reads XML: for every number ``i`` in the
    range it looks for files matching ``texts/tanach/{i:02}*.xml`` (relative
    to the current working directory) and parses them.

    Args:
        start: First book number to load.
        end: Last book number to load (inclusive).

    Returns:
        A dict mapping each book number to ``{"title": str, "text": list}``,
        where ``text`` is a list of chapters and each chapter is a list of
        verse strings. A book with no matching file, or whose files all fail
        to load, maps to ``{"title": "No title", "text": []}``.

    If several files match one number they are processed in sorted path
    order and the last one that parses successfully wins; a file that fails
    to parse never replaces a result that was already loaded for that number.
    Problems are reported through ``logging.warning`` rather than raised.
    """
    base_path = "texts/tanach"
    results = {}

    for i in range(start, end + 1):
        file_pattern = f"{base_path}/{i:02}*.xml"
        # glob order depends on the file system; sort so that the outcome is
        # reproducible when more than one file shares the numeric prefix.
        matching_files = sorted(glob.glob(file_pattern))

        if not matching_files:
            logging.warning(f"No file matching pattern '{file_pattern}' found.")
            results[i] = {"title": "No title", "text": []}
            continue

        for file_name in matching_files:
            try:
                book_title, book_texts = _parse_book(file_name)
            except FileNotFoundError:
                logging.warning(f"File {file_name} not found.")
            except ET.ParseError as e:
                logging.warning(f"File {file_name} could not be read as XML: {e}")
            except KeyError as e:
                # Handler kept from the original for compatibility; the
                # parsing above is not expected to raise KeyError.
                logging.warning(f"Expected key 'text' is missing in {file_name}: {e}")
            else:
                results[i] = {
                    "title": book_title,
                    "text": book_texts,
                }
                continue

            # Failure path: record the placeholder only if nothing has been
            # loaded for this number yet, so a broken duplicate cannot wipe a
            # good parse.
            results.setdefault(i, {"title": "No title", "text": []})

    return results