add MLP estimator, update Estimator generics
This commit is contained in:
@@ -80,10 +80,10 @@ from trainlib.domain import SequenceDomain
|
||||
from trainlib.dataset import TupleDataset, DatasetKwargs
|
||||
|
||||
|
||||
class SlidingWindowDataset[T: Tensor](TupleDataset[T]):
|
||||
class SlidingWindowDataset(TupleDataset[Tensor]):
|
||||
def __init__(
|
||||
self,
|
||||
domain: SequenceDomain[tuple[T, ...]],
|
||||
domain: SequenceDomain[tuple[Tensor, ...]],
|
||||
lookback: int,
|
||||
offset: int = 0,
|
||||
lookahead: int = 1,
|
||||
@@ -99,9 +99,9 @@ class SlidingWindowDataset[T: Tensor](TupleDataset[T]):
|
||||
|
||||
def _process_batch_data(
|
||||
self,
|
||||
batch_data: tuple[T, ...],
|
||||
batch_data: tuple[Tensor, ...],
|
||||
batch_index: int,
|
||||
) -> list[tuple[T, ...]]:
|
||||
) -> list[tuple[Tensor, ...]]:
|
||||
"""
|
||||
Backward pads first sequence over (lookback-1) length, and steps the
|
||||
remaining items forward by the lookahead.
|
||||
|
||||
@@ -64,3 +64,14 @@ class SequenceDomain[R](Domain[int, R]):
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.sequence)
|
||||
|
||||
|
||||
class TupleDomain[T](SequenceDomain[tuple[T, ...]]):
    """
    Domain for homogeneous tuples of the same type.

    This class header exists primarily as a typed alias that aligns with
    TupleDataset; it adds no behavior beyond ``SequenceDomain``.
    """

    ...
|
||||
|
||||
@@ -8,10 +8,16 @@ class SimulatorDomain[P, R](Domain[int, R]):
|
||||
Base simulator domain, generic to arbitrary callables.
|
||||
|
||||
Note: we don't store simulation results here; that's left to a downstream
|
||||
object, like a `BatchedDataset`, to cache if needed. We also don't subclass
|
||||
`SequenceDataset` because the item getter type doesn't align: we accept an
|
||||
`int` in the parameter list, but don't return the items directly from that
|
||||
collection (we transform them first).
|
||||
object, like a ``BatchedDataset``, to cache if needed. We also don't
|
||||
subclass ``SequenceDataset`` because the item getter type doesn't align: we
|
||||
accept an ``int`` in the parameter list, but don't return the items
|
||||
directly from that collection (we transform them first).
|
||||
|
||||
Note: it's interesting to consider the idea of having parameters directly
|
||||
act as URIs. There is, however, no obvious way to iterate over allowed
|
||||
parameters (without additional components, like a prior or some other
|
||||
generator), so we leave that outside the class scope and simply operate
|
||||
over a provided parameter sequence.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
149
trainlib/estimators/mlp.py
Normal file
149
trainlib/estimators/mlp.py
Normal file
@@ -0,0 +1,149 @@
|
||||
import logging
|
||||
from typing import Unpack, NotRequired
|
||||
from collections.abc import Callable, Generator
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn, Tensor
|
||||
from torch.optim import Optimizer
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from trainlib.estimator import Estimator, EstimatorKwargs
|
||||
from trainlib.utils.type import OptimizerKwargs
|
||||
from trainlib.utils.module import get_grad_norm
|
||||
from trainlib.estimators.tdnn import TDNNLayer
|
||||
|
||||
logger: logging.Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MLPKwargs(EstimatorKwargs):
    """
    Keyword arguments accepted by ``MLP`` calls (forward/loss/metrics).
    """

    inputs: Tensor  # network input batch
    labels: NotRequired[Tensor]  # targets; only needed for loss/metrics, not inference
|
||||
|
||||
|
||||
class MLP[K: MLPKwargs](Estimator[K]):
|
||||
"""
|
||||
Base MLP architecture.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_dim: int,
|
||||
output_dim: int,
|
||||
hidden_dims: list[int] | None = None,
|
||||
norm_layer: Callable[..., nn.Module] | None = None,
|
||||
activation_fn: nn.Module | None = None,
|
||||
inplace: bool = False,
|
||||
bias: bool = True,
|
||||
dropout: float = 0.0,
|
||||
verbose: bool = True,
|
||||
) -> None:
|
||||
"""
|
||||
Parameters:
|
||||
input_dim: dimensionality of the input
|
||||
output_dim: dimensionality of the output
|
||||
hidden_dims: dimensionalities of hidden layers
|
||||
"""
|
||||
|
||||
super().__init__()
|
||||
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
self.hidden_dims = hidden_dims or []
|
||||
self.norm_layer = norm_layer
|
||||
self.activation_fn = activation_fn or nn.ReLU
|
||||
|
||||
# self._layers: nn.ModuleList = nn.ModuleList()
|
||||
self._layers = []
|
||||
|
||||
layer_in_dim = input_dim
|
||||
for layer_out_dim in self.hidden_dims:
|
||||
hidden_layer = nn.Linear(layer_in_dim, layer_out_dim, bias=bias)
|
||||
self._layers.append(hidden_layer)
|
||||
|
||||
if norm_layer is not None:
|
||||
self._layers.append(norm_layer(layer_out_dim))
|
||||
|
||||
self._layers.append(self.activation_fn(inplace=inplace))
|
||||
self._layers.append(nn.Dropout(dropout, inplace=inplace))
|
||||
|
||||
layer_in_dim = layer_out_dim
|
||||
|
||||
self._layers.append(nn.Linear(layer_in_dim, self.output_dim))
|
||||
self._net = nn.Sequential(*self._layers)
|
||||
|
||||
if verbose:
|
||||
self.log_arch()
|
||||
|
||||
def _clamp_rand(self, x: Tensor) -> Tensor:
|
||||
return torch.clamp(
|
||||
x + (1.0 / 127.0) * (torch.rand_like(x) - 0.5),
|
||||
min=-1.0,
|
||||
max=1.0,
|
||||
)
|
||||
|
||||
def forward(self, **kwargs: Unpack[K]) -> tuple[Tensor, ...]:
|
||||
inputs = kwargs["inputs"]
|
||||
x = self._net(inputs)
|
||||
|
||||
return (x,)
|
||||
|
||||
def loss(self, **kwargs: Unpack[K]) -> Generator[Tensor]:
|
||||
predictions = self(**kwargs)[0]
|
||||
labels = kwargs["labels"]
|
||||
|
||||
yield F.mse_loss(predictions, labels)
|
||||
|
||||
def metrics(self, **kwargs: Unpack[K]) -> dict[str, float]:
|
||||
with torch.no_grad():
|
||||
loss = next(self.loss(**kwargs)).item()
|
||||
|
||||
predictions = self(**kwargs)[0]
|
||||
labels = kwargs["labels"]
|
||||
mae = F.l1_loss(predictions, labels).item()
|
||||
|
||||
return {
|
||||
"loss": loss,
|
||||
"mse": loss,
|
||||
"mae": mae,
|
||||
"grad_norm": get_grad_norm(self)
|
||||
}
|
||||
|
||||
def optimizers(
|
||||
self,
|
||||
**kwargs: Unpack[OptimizerKwargs],
|
||||
) -> tuple[Optimizer, ...]:
|
||||
"""
|
||||
"""
|
||||
|
||||
default_kwargs: Unpack[OptimizerKwargs] = {
|
||||
"lr": 1e-3,
|
||||
"eps": 1e-8,
|
||||
}
|
||||
opt_kwargs = {**default_kwargs, **kwargs}
|
||||
|
||||
optimizer = torch.optim.AdamW(
|
||||
self.parameters(),
|
||||
**opt_kwargs,
|
||||
)
|
||||
|
||||
return (optimizer,)
|
||||
|
||||
def epoch_step(self) -> None:
|
||||
return None
|
||||
|
||||
def epoch_write(
|
||||
self,
|
||||
writer: SummaryWriter,
|
||||
step: int | None = None,
|
||||
val: bool = False,
|
||||
**kwargs: Unpack[K],
|
||||
) -> None:
|
||||
return None
|
||||
|
||||
def log_arch(self) -> None:
|
||||
super().log_arch()
|
||||
|
||||
logger.info(f"| > {self.input_dim=}")
|
||||
logger.info(f"| > {self.hidden_dims=}")
|
||||
logger.info(f"| > {self.output_dim=}")
|
||||
@@ -458,7 +458,6 @@ class ConvGRU[K: RNNKwargs](Estimator[K]):
|
||||
"grad_norm": get_grad_norm(self)
|
||||
}
|
||||
|
||||
|
||||
def optimizers(
|
||||
self,
|
||||
**kwargs: Unpack[OptimizerKwargs],
|
||||
|
||||
@@ -258,10 +258,10 @@ class Trainer[I, K: EstimatorKwargs]:
|
||||
|
||||
return val_loss_sums
|
||||
|
||||
def train(
|
||||
def train[B](
|
||||
self,
|
||||
dataset: BatchedDataset[..., ..., I],
|
||||
batch_estimator_map: Callable[[I, Self], K],
|
||||
batch_estimator_map: Callable[[B, Self], K],
|
||||
lr: float = 1e-3,
|
||||
eps: float = 1e-8,
|
||||
max_grad_norm: float | None = None,
|
||||
@@ -280,6 +280,10 @@ class Trainer[I, K: EstimatorKwargs]:
|
||||
summary_writer: SummaryWriter | None = None,
|
||||
) -> Estimator:
|
||||
"""
|
||||
TODO: consider making the dataloader ``collate_fn`` an explicit
|
||||
parameter with a type signature that reflects ``B``, connecting the
|
||||
``batch_estimator_map`` somewhere
|
||||
|
||||
Note: this method attempts to implement a general scheme for passing
|
||||
needed items to the estimator's loss function from the dataloader. The
|
||||
abstract ``Estimator`` base only requires the model output be provided
|
||||
@@ -317,10 +321,12 @@ class Trainer[I, K: EstimatorKwargs]:
|
||||
[2, 2],
|
||||
[3, 3]] )
|
||||
|
||||
This function should map from batches (which should be *item
|
||||
shaped*, i.e., have an ``I`` skeleton, even if stacked items may be
|
||||
different on the inside) into estimator keyword arguments (type
|
||||
``K``).
|
||||
This function should map from batches - which *may* be item
|
||||
shaped, i.e., have an ``I`` skeleton, even if stacked items may be
|
||||
different on the inside - into estimator keyword arguments (type
|
||||
``K``). Collation behavior from a DataLoader (which can be
|
||||
customized) doesn't consistently yield a known type shape, however,
|
||||
so it's not appropriate to use ``I`` as the callable param type.
|
||||
|
||||
Parameters:
|
||||
dataset: dataset to train the estimator
|
||||
|
||||
Reference in New Issue
Block a user