import logging
import pickle
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd

from common.constants import UNKNOWN
from components.embedding_extraction import EmbeddingExtractor

logger = logging.getLogger(__name__)
@dataclass
class DatasetRow:
    """
    Stores the data of a single dataset row.

    Field names double as column names when the dataset is converted
    to a pandas DataFrame, so they are kept as-is.
    """

    Index: int
    Text: str
    DocName: str
    Title: str
    DocNumber: str
    LevelParagraph: str = field(default=UNKNOWN)
    Pargaraph: str = field(default=UNKNOWN)
    Duplicate: str = field(default=UNKNOWN)
    PartLevel1: str = field(default=UNKNOWN)
    PartLevel2: str = field(default=UNKNOWN)
    Appendix: str = field(default=UNKNOWN)
    LevelParagraphAppendix: str = field(default=UNKNOWN)
    PargaraphAppendix: str = field(default=UNKNOWN)
    DuplicateAppendix: str = field(default=UNKNOWN)
    PartLevel1Appendix: str = field(default=UNKNOWN)
    Table: str = field(default=UNKNOWN)
class DocumentsDataset:
    """
    Stores the dataset: a list of rows plus the text vectors.

    Initially the dataset holds no vectors; call vectorize_with to run
    the vectorization (see the usage sketch at the end of the module).
    """

    def __init__(self, rows: list[DatasetRow]):
        self.rows = rows
        self.vectors: np.ndarray | None = None
    def vectorize_with(
        self,
        vectorizer: EmbeddingExtractor,
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> None:
        """
        Vectorize the texts in the dataset.
        """
        logger.info('Starting dataset vectorization')
        total = len(self.rows)
        texts = [row.Text for row in self.rows]
        self.vectors = vectorizer.vectorize(texts, progress_callback)
        logger.info(f'Completed vectorization of {total} rows')
    def to_pandas(self) -> pd.DataFrame:
        """
        Convert the dataset into a pandas DataFrame.

        Returns:
            pd.DataFrame: one row per DatasetRow, plus an 'Embedding'
            column (NaN if the dataset has not been vectorized yet).
        """
        df = pd.DataFrame([asdict(row) for row in self.rows])
        if self.vectors is not None:
            df['Embedding'] = self.vectors.tolist()
        else:
            df['Embedding'] = np.nan
        return df
    def to_pickle(self, path: Path) -> None:
        """
        Save the dataset to a pickle file as a pandas DataFrame.
        """
        logger.info(f'Saving dataset to {path}')
        with open(path, 'wb') as f:
            pickle.dump(self.to_pandas(), f)
        logger.info('Dataset saved successfully')
    @classmethod
    def from_pickle(cls, path: Path) -> 'DocumentsDataset':
        """
        Load a dataset from a pickle file.

        to_pickle stores a pandas DataFrame, so the frame is converted
        back into DatasetRow objects and the stored embeddings, if any,
        are restored as the vector matrix.
        """
        logger.info(f'Loading dataset from {path}')
        try:
            with open(path, 'rb') as f:
                df = pickle.load(f)
            embeddings = df.pop('Embedding')
            dataset = cls([DatasetRow(**record) for record in df.to_dict('records')])
            if embeddings.notna().all():
                dataset.vectors = np.array(embeddings.tolist())
            logger.info(f'Loaded dataset with {len(dataset.rows)} rows')
            return dataset
        except Exception as e:
            logger.error(f'Failed to load dataset: {e}')
            raise
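

# Minimal usage sketch (illustration only): builds a tiny dataset, vectorizes it,
# and round-trips it through pickle. The argument-free EmbeddingExtractor() call
# and the field values below are assumptions made for this example, not part of
# the module; adapt them to the real extractor constructor and real data.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    rows = [
        DatasetRow(Index=0, Text='First paragraph text', DocName='doc.docx',
                   Title='Example document', DocNumber='1'),
        DatasetRow(Index=1, Text='Second paragraph text', DocName='doc.docx',
                   Title='Example document', DocNumber='1'),
    ]
    dataset = DocumentsDataset(rows)

    def report_progress(done: int, total: int) -> None:
        # Matches the Callable[[int, int], None] signature expected by vectorize_with.
        logger.info(f'Vectorized {done}/{total} rows')

    # Assumption: EmbeddingExtractor can be constructed without arguments here.
    dataset.vectorize_with(EmbeddingExtractor(), progress_callback=report_progress)

    dataset.to_pickle(Path('dataset.pkl'))
    restored = DocumentsDataset.from_pickle(Path('dataset.pkl'))
    print(restored.to_pandas().head())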