Upload feature extractor
Browse files- feature_extraction_moment.py +91 -0
- preprocessor_config.json +6 -0
feature_extraction_moment.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# What this FeatureExtractor does
|
| 2 |
+
# - converts time series data given as a DataFrame, numpy array, or torch tensor into a torch tensor
|
| 3 |
+
# - input validation
|
| 4 |
+
|
| 5 |
+
from typing import List, Optional, Union
|
| 6 |
+
|
| 7 |
+
from pandas import DataFrame
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import tensorflow as tf
|
| 11 |
+
import jax.numpy as jnp
|
| 12 |
+
|
| 13 |
+
from transformers import FeatureExtractionMixin
|
| 14 |
+
from transformers import TensorType
|
| 15 |
+
from transformers import BatchFeature
|
| 16 |
+
from transformers.utils import logging
|
| 17 |
+
|
| 18 |
+
# Module-level logger created through the transformers logging helper.
logger = logging.get_logger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MomentFeatureExtractor(FeatureExtractionMixin):
    """Convert raw time series into a batched 3-D array for the MOMENT model.

    Accepted inputs are a pandas ``DataFrame``, a ``numpy.ndarray``, a
    ``torch.Tensor``, or a list of any of these.  The values are converted to
    a torch tensor, promoted to three dimensions (treated by this class as
    ``(batch_size, n_channels, seq_len)``), truncated along the last axis to
    ``max_seq_len`` and finally returned in the framework selected by
    ``return_tensors``.
    """

    # TODO: Ideally MOMENT's own tokenizer would also be held here as
    # ``ts_tokenizer``, but it is built into the model itself.
    # refers: https://github.com/moment-timeseries-foundation-model/moment/blob/088b253a1138ac7e48a7efc9bf902336c9eec8d9/momentfm/models/moment.py#L105

    # NOTE(review): "input_mask" is declared here but is never produced by
    # ``__call__`` in this class -- confirm whether the model builds it itself.
    model_input_names = ["time_series_values", "input_mask"]

    # Class-level default for the longest last-axis length that is kept;
    # longer inputs are truncated to this many steps.
    max_seq_len = 512

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``FeatureExtractionMixin``."""
        super().__init__(**kwargs)

    def __call__(
        self,
        time_series: Optional[
            Union[
                DataFrame,
                np.ndarray,
                torch.Tensor,
                List[DataFrame],
                List[np.ndarray],
                List[torch.Tensor],
            ]
        ] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
    ) -> BatchFeature:
        """Build the model input from ``time_series``.

        Args:
            time_series: A single series/batch or a list of them.  ``None``
                yields a ``BatchFeature`` whose ``"time_series_values"`` entry
                is ``None``.
            return_tensors: Output framework: ``"pt"``, ``"np"``, ``"tf"`` or
                ``"jax"`` (or the matching ``TensorType`` member).
            torch_dtype: dtype used for the intermediate torch tensor, given
                as a ``torch.dtype`` or as its name (e.g. ``"float32"``).

        Returns:
            A ``BatchFeature`` with the single key ``"time_series_values"``.

        Raises:
            ValueError: If the input has 0 or more than 3 dimensions, or if
                ``return_tensors`` is not one of the supported values.
            TypeError: If an input element or ``torch_dtype`` has an
                unsupported type.
        """
        if time_series is None:
            time_series_values = None
        else:
            time_series_values = self._convert_time_series(
                time_series, return_tensors, torch_dtype
            )

        return BatchFeature(data={"time_series_values": time_series_values})

    def _convert_time_series(self, time_series, return_tensors, torch_dtype):
        """Convert the input to a 3-D array in the requested framework."""
        if isinstance(time_series, list):
            # Convert every element to a tensor, then stack them into one
            # tensor with a new leading axis.
            time_series_tensor = torch.stack(
                [self._convert_to_tensor(ts, torch_dtype) for ts in time_series]
            )
        else:
            time_series_tensor = self._convert_to_tensor(time_series, torch_dtype)

        # Bring the tensor to exactly three dimensions.
        n_dims = time_series_tensor.dim()
        if n_dims > 3:
            raise ValueError("time_series_tensor must not have more than 3 dimensions")
        if n_dims == 0:
            # Previously this surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError("time_series_tensor must have at least 1 dimension")
        if n_dims == 2:
            time_series_tensor = time_series_tensor.unsqueeze(0)
        elif n_dims == 1:
            time_series_tensor = time_series_tensor.unsqueeze(0).unsqueeze(0)

        # Report the shape as it is before any truncation.
        # NOTE(review): the last axis is logged as "Dimension of model" but is
        # the axis truncated as the sequence length below -- the label looks
        # misleading; kept unchanged so existing log output stays the same.
        batch_size, n_channels, seq_len = time_series_tensor.shape
        logger.info(
            "Batch size: %s, Number of channels: %s, Dimension of model: %s",
            batch_size,
            n_channels,
            seq_len,
        )

        # Keep at most ``max_seq_len`` steps along the last axis.
        max_seq_len = self.max_seq_len
        if seq_len > max_seq_len:
            time_series_tensor = time_series_tensor[:, :, :max_seq_len]
            logger.info("Sequence length has been truncated to %d.", max_seq_len)

        # Convert to the framework selected by ``return_tensors``.
        if return_tensors in ("pt", TensorType.PYTORCH):
            return time_series_tensor

        if return_tensors in ("np", TensorType.NUMPY):
            return self._to_numpy(time_series_tensor)
        if return_tensors in ("tf", TensorType.TENSORFLOW):
            return tf.convert_to_tensor(self._to_numpy(time_series_tensor))
        if return_tensors in ("jax", TensorType.JAX):
            return jnp.array(self._to_numpy(time_series_tensor))

        raise ValueError("Unsupported return_tensors type")

    def _convert_to_tensor(self, time_series, torch_dtype):
        """Convert one DataFrame, ndarray or tensor to a torch tensor."""
        dtype = self._resolve_dtype(torch_dtype)

        if isinstance(time_series, DataFrame):
            # Transpose so that the DataFrame's rows end up on the last axis.
            return torch.tensor(time_series.values, dtype=dtype).t()
        if isinstance(time_series, np.ndarray):
            return torch.tensor(time_series, dtype=dtype)
        if isinstance(time_series, torch.Tensor):
            return time_series if dtype is None else time_series.to(dtype)

        # The original fell through here and failed with UnboundLocalError.
        raise TypeError(
            "time_series must be a pandas DataFrame, numpy.ndarray or torch.Tensor, "
            f"got {type(time_series).__name__}"
        )

    @staticmethod
    def _resolve_dtype(torch_dtype):
        """Return ``torch_dtype`` as a ``torch.dtype`` (or ``None``), accepting names."""
        if torch_dtype is None or isinstance(torch_dtype, torch.dtype):
            return torch_dtype
        if isinstance(torch_dtype, str):
            prefix = "torch."
            name = torch_dtype[len(prefix):] if torch_dtype.startswith(prefix) else torch_dtype
            resolved = getattr(torch, name, None)
            if isinstance(resolved, torch.dtype):
                return resolved
        raise TypeError(
            f"torch_dtype must be a torch.dtype or the name of one, got {torch_dtype!r}"
        )

    @staticmethod
    def _to_numpy(tensor):
        """Return ``tensor`` as a numpy array, detaching and moving to CPU first."""
        # ``Tensor.numpy()`` rejects tensors that require grad or are not on
        # the CPU; for a plain CPU tensor this is equivalent to ``.numpy()``.
        return tensor.detach().cpu().numpy()
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoFeatureExtractor": "feature_extraction_moment.MomentFeatureExtractor"
|
| 4 |
+
},
|
| 5 |
+
"feature_extractor_type": "MomentFeatureExtractor"
|
| 6 |
+
}
|