Upload feature_extraction_moment.py
feature_extraction_moment.py +126 -19
CHANGED
@@ -2,7 +2,7 @@
 # - Convert time-series data from a DataFrame, numpy array, or torch tensor into a torch tensor
 # - input validation

-from typing import List, Optional, Union

 from pandas import DataFrame
 import numpy as np
@@ -29,63 +29,170 @@ class MomentFeatureExtractor(FeatureExtractionMixin):
         super().__init__(**kwargs)


     def __call__(
         self,
         time_series: Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]] = None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
         torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
     ) -> BatchFeature:
         if time_series is not None:
-            time_series_values = self._convert_time_series(time_series, return_tensors, torch_dtype)
         else:
             time_series_values = None

-        return BatchFeature(data={"time_series_values": time_series_values})


-    def _convert_time_series(self, time_series, return_tensors, torch_dtype):
         # Convert a DataFrame, np.ndarray, or torch.Tensor into a torch.Tensor
         if isinstance(time_series, list):
             # Convert each element of the list to a torch.Tensor and combine them into a single tensor
-
         else:
             time_series_tensor = self._convert_to_tensor(time_series, torch_dtype)
-
-
-
-
-        elif time_series_tensor.dim() == 2:
-            time_series_tensor = time_series_tensor.unsqueeze(0)
-        elif time_series_tensor.dim() == 1:
-            time_series_tensor = time_series_tensor.unsqueeze(0).unsqueeze(0)

         # Log the shape information
         batch_size, n_channels, d_model = time_series_tensor.shape
         logger.info(f"Batch size: {batch_size}, Number of channels: {n_channels}, Dimension of model: {d_model}")

-        # Cap seq_len at a maximum of 512
         if time_series_tensor.shape[2] > 512:
             time_series_tensor = time_series_tensor[:, :, :512]
             logger.info("Sequence length has been truncated to 512.")

         # Convert the data format according to the return_tensors setting
         if return_tensors == 'pt' or return_tensors == TensorType.PYTORCH:
-            return time_series_tensor
         elif return_tensors == 'np' or return_tensors == TensorType.NUMPY:
-            return time_series_tensor.numpy()
         elif return_tensors == 'tf' or return_tensors == TensorType.TENSORFLOW:
-            return tf.convert_to_tensor(time_series_tensor.numpy())
         elif return_tensors == 'jax' or return_tensors == TensorType.JAX:
-            return jnp.array(time_series_tensor.numpy())
         else:
             raise ValueError("Unsupported return_tensors type")

     def _convert_to_tensor(self, time_series, torch_dtype):
         if isinstance(time_series, DataFrame):
             time_series_tensor = torch.tensor(time_series.values, dtype=torch_dtype).t()
-        elif isinstance(time_series, np.ndarray):
             time_series_tensor = torch.tensor(time_series, dtype=torch_dtype)
         elif isinstance(time_series, torch.Tensor):
             time_series_tensor = time_series.to(torch_dtype)

         return time_series_tensor
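
For reference, the version being replaced only normalises the input shape and truncates long sequences before returning them. A minimal sketch of that behaviour with made-up data (assuming pandas, numpy and torch are installed; the DataFrame below is purely illustrative):

import torch
import numpy as np
from pandas import DataFrame

# An illustrative DataFrame with 600 time steps and 2 channels.
df = DataFrame(np.random.randn(600, 2))

# DataFrame values arrive as (seq_len, n_channels); the extractor transposes
# them to (n_channels, seq_len) ...
tensor = torch.tensor(df.values, dtype=torch.float).t()

# ... promotes 2-D input to (batch_size, n_channels, seq_len) ...
tensor = tensor.unsqueeze(0)

# ... and truncates the sequence dimension to at most 512 steps.
if tensor.shape[2] > 512:
    tensor = tensor[:, :, :512]

print(tensor.shape)  # torch.Size([1, 2, 512])

The updated version of the file, shown next, keeps this shape convention but adds padding control and returns an input_mask alongside the values.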
@@ -2,7 +2,7 @@
 # - Convert time-series data from a DataFrame, numpy array, or torch tensor into a torch tensor
 # - input validation

+from typing import List, Optional, Union, Literal, Tuple

 from pandas import DataFrame
 import numpy as np

@@ -29,63 +29,170 @@ class MomentFeatureExtractor(FeatureExtractionMixin):
         super().__init__(**kwargs)


     def __call__(
         self,
         time_series: Union[DataFrame, np.ndarray, torch.Tensor, List[DataFrame], List[np.ndarray], List[torch.Tensor]] = None,
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
         torch_dtype: Optional[Union[str, torch.dtype]] = torch.float,
+        padding: Union[bool, str] = False,  # added
+        max_length: Optional[int] = None,  # added
     ) -> BatchFeature:
+        """
+        padding (bool, str or PaddingStrategy, optional, defaults to False):
+            Activates and controls padding. Accepts the following values:
+            - True or 'longest': pad to the longest sequence in the batch (no padding if there is only a single sequence).
+            - 'max_length': pad to the length given by the max_length argument or, if max_length is not provided, to the maximum input length accepted by the model.
+            - False or 'do_not_pad' (default): no padding (i.e. the batch may contain sequences of different lengths).
+        """
         if time_series is not None:
+            time_series_values, input_mask = self._convert_time_series(time_series, return_tensors, torch_dtype, padding, max_length)
         else:
             time_series_values = None
+            input_mask = None

+        return BatchFeature(data={"time_series_values": time_series_values, "input_mask": input_mask})


+    def _convert_time_series(self, time_series, return_tensors, torch_dtype, padding, max_length):
         # Convert a DataFrame, np.ndarray, or torch.Tensor into a torch.Tensor
         if isinstance(time_series, list):
             # Convert each element of the list to a torch.Tensor and combine them into a single tensor
+            time_series_list = [self._convert_to_tensor(ts, torch_dtype) for ts in time_series]
+            # Check the number of dimensions
+            time_series_list = [self._convert_tensor_dim(ts, dim=2) for ts in time_series_list]
+            # Truncate / pad
+            time_series_tensor, input_mask = self._pad_time_series(time_series_list, padding, max_length)
         else:
             time_series_tensor = self._convert_to_tensor(time_series, torch_dtype)
+            # Check the number of dimensions
+            time_series_tensor = self._convert_tensor_dim(time_series_tensor, dim=3)
+            # Truncate / pad
+            time_series_tensor, input_mask = self._pad_time_series(time_series_tensor, padding, max_length)

         # Log the shape information
         batch_size, n_channels, d_model = time_series_tensor.shape
         logger.info(f"Batch size: {batch_size}, Number of channels: {n_channels}, Dimension of model: {d_model}")

+        # Truncate seq_len to a maximum of 512
         if time_series_tensor.shape[2] > 512:
             time_series_tensor = time_series_tensor[:, :, :512]
+            input_mask = input_mask[:, :512]
             logger.info("Sequence length has been truncated to 512.")

         # Convert the data format according to the return_tensors setting
         if return_tensors == 'pt' or return_tensors == TensorType.PYTORCH:
+            return time_series_tensor, input_mask
         elif return_tensors == 'np' or return_tensors == TensorType.NUMPY:
+            return time_series_tensor.numpy(), input_mask
         elif return_tensors == 'tf' or return_tensors == TensorType.TENSORFLOW:
+            return tf.convert_to_tensor(time_series_tensor.numpy()), input_mask
         elif return_tensors == 'jax' or return_tensors == TensorType.JAX:
+            return jnp.array(time_series_tensor.numpy()), input_mask
         else:
             raise ValueError("Unsupported return_tensors type")

     def _convert_to_tensor(self, time_series, torch_dtype):
         if isinstance(time_series, DataFrame):
             time_series_tensor = torch.tensor(time_series.values, dtype=torch_dtype).t()
+        elif isinstance(time_series, np.ndarray) or isinstance(time_series, list):
             time_series_tensor = torch.tensor(time_series, dtype=torch_dtype)
         elif isinstance(time_series, torch.Tensor):
             time_series_tensor = time_series.to(torch_dtype)

         return time_series_tensor
+
+    def _convert_tensor_dim(self, time_series, dim=3):
+        if time_series.dim() > dim:
+            raise ValueError("time_series must not have more than 3 dimensions")
+
+        while time_series.dim() < dim:
+            time_series = time_series.unsqueeze(0)
+
+        return time_series
+
+
+    def _pad_time_series(
+        self,
+        time_series_values: Union[torch.Tensor, List[torch.Tensor]],
+        padding: Union[bool, Literal['longest', 'max_length', 'do_not_pad']] = 'do_not_pad',
+        max_length: Union[int, None] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply padding to time-series data and generate the corresponding input_mask.
+
+        Args:
+            time_series_values (Union[torch.Tensor, List[torch.Tensor]]):
+                The time-series data to pad.
+                Expected to be a 3-D tensor (batch_size, n_channels, seq_len) or
+                a list of 2-D tensors (n_channels, seq_len).
+            padding (Union[bool, Literal['longest', 'max_length', 'do_not_pad']], optional):
+                The padding strategy. Defaults to 'do_not_pad'.
+                - True or 'longest': pad to the longest sequence in the batch
+                - 'max_length': pad to the specified maximum length
+                - False or 'do_not_pad': no padding (sequences are cut to the shortest length)
+            max_length (Union[int, None], optional):
+                Maximum length used with 'max_length' padding.
+                If not specified, 512 is used. Defaults to None.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]:
+                - The padded time-series data, with shape (batch_size, n_channels, padded_seq_len).
+                - The input_mask, with shape (batch_size, padded_seq_len).
+                  1 marks positions that contain data, 0 marks padded positions.
+
+        Raises:
+            ValueError: For an unsupported input shape, an invalid padding option,
+                an invalid max_length, or an inconsistent number of channels.
+        """
+        # Validate max_length
+        if max_length is not None:
+            if not isinstance(max_length, int) or max_length <= 0:
+                raise ValueError("max_length must be a positive integer.")
+
+        if isinstance(time_series_values, list):
+            if not all(isinstance(ts, torch.Tensor) and ts.dim() == 2 for ts in time_series_values):
+                raise ValueError("Every element of the list must be a 2-dimensional torch.Tensor.")
+
+            batch_size = len(time_series_values)
+            n_channels = time_series_values[0].shape[0]
+            seq_lens = [ts.shape[1] for ts in time_series_values]
+
+            # Check that the number of channels is consistent
+            if not all(ts.shape[0] == n_channels for ts in time_series_values):
+                raise ValueError("All time series must have the same number of channels.")
+
+        elif isinstance(time_series_values, torch.Tensor):
+            if time_series_values.dim() == 3:
+                batch_size, n_channels, seq_len = time_series_values.shape
+                seq_lens = [seq_len] * batch_size
+                time_series_values = [time_series_values[i] for i in range(batch_size)]
+            elif time_series_values.dim() == 2:
+                n_channels, seq_len = time_series_values.shape
+                batch_size = 1
+                seq_lens = [seq_len]
+                time_series_values = [time_series_values]
+            else:
+                raise ValueError("The tensor must be 2- or 3-dimensional.")
+        else:
+            raise ValueError("The input must be a torch.Tensor or a list of torch.Tensor.")
+
+        if padding is True or padding == 'longest':
+            target_len = max(seq_lens)
+        elif padding == 'max_length':
+            target_len = max_length if max_length is not None else 512
+        elif padding is False or padding == 'do_not_pad':
+            target_len = min(seq_lens)
+        else:
+            raise ValueError("Invalid padding option.")
+
+        # Keep everything on the same device as the input
+        device = time_series_values[0].device
+
+        padded_values = torch.zeros((batch_size, n_channels, target_len), dtype=time_series_values[0].dtype, device=device)
+        input_mask = torch.zeros((batch_size, target_len), dtype=torch.bool, device=device)
+
+        for i in range(batch_size):
+            seq = time_series_values[i]
+            length = min(seq.shape[1], target_len)
+            padded_values[i, :, :length] = seq[:, :length]
+            input_mask[i, :length] = True
+
+        return padded_values, input_mask