add MLP estimator, update Estimator generics

2026-03-10 02:39:27 -07:00
parent c2e4294c8c
commit 95d7bc68ce
6 changed files with 189 additions and 18 deletions
--- a/trainlib/datasets/memory.py
+++ b/trainlib/datasets/memory.py
@@ -80,10 +80,10 @@ from trainlib.domain import SequenceDomain
 from trainlib.dataset import TupleDataset, DatasetKwargs
-class SlidingWindowDataset[T: Tensor](TupleDataset[T]):
+class SlidingWindowDataset(TupleDataset[Tensor]):
    def __init__(
        self,
-        domain: SequenceDomain[tuple[T, ...]],
+        domain: SequenceDomain[tuple[Tensor, ...]],
        lookback: int,
        offset: int = 0,
        lookahead: int = 1,
@@ -99,9 +99,9 @@ class SlidingWindowDataset[T: Tensor](TupleDataset[T]):
    def _process_batch_data(
        self,
-        batch_data: tuple[T, ...],
+        batch_data: tuple[Tensor, ...],
        batch_index: int,
-    ) -> list[tuple[T, ...]]:
+    ) -> list[tuple[Tensor, ...]]:
        """
        Backward pads first sequence over (lookback-1) length, and steps the
        remaining items forward by the lookahead.
--- a/trainlib/domain.py
+++ b/trainlib/domain.py
@@ -64,3 +64,14 @@ class SequenceDomain[R](Domain[int, R]):
    def __len__(self) -> int:
        return len(self.sequence)
 class TupleDomain[T](SequenceDomain[tuple[T, ...]]):
    """
    Domain for homogenous tuples of the same type.
    This class header exists primarily as typed alias that aligns with
    TupleDataset.
    """
    ...
--- a/trainlib/domains/functional.py
+++ b/trainlib/domains/functional.py
@@ -8,10 +8,16 @@ class SimulatorDomain[P, R](Domain[int, R]):
    Base simulator domain, generic to arbitrary callables.
    Note: we don't store simulation results here; that's left to a downstream
-    object, like a `BatchedDataset`, to cache if needed. We also don't subclass
+    object, like a ``BatchedDataset``, to cache if needed. We also don't
-    `SequenceDataset` because the item getter type doesn't align: we accept an
+    subclass ``SequenceDataset`` because the item getter type doesn't align: we
-    `int` in the parameter list, but don't return the items directly from that
+    accept an ``int`` in the parameter list, but don't return the items
-    collection (we transform them first).
+    directly from that collection (we transform them first).
    Note: it's interesting to consider the idea of having parameters directly
    act as URIs. There is, however, no obvious way to iterate over allowed
    parameters (without additional components, like a prior or some other
    generator), so we leave that outside the class scope and simply operate
    over of a provided parameter sequence.
    """
    def __init__(
--- a/trainlib/estimators/mlp.py
+++ b/trainlib/estimators/mlp.py
@@ -0,0 +1,149 @@
 import logging
 from typing import Unpack, NotRequired
 from collections.abc import Callable, Generator
 import torch
 import torch.nn.functional as F
 from torch import nn, Tensor
 from torch.optim import Optimizer
 from torch.utils.tensorboard import SummaryWriter
 from trainlib.estimator import Estimator, EstimatorKwargs
 from trainlib.utils.type import OptimizerKwargs
 from trainlib.utils.module import get_grad_norm
 from trainlib.estimators.tdnn import TDNNLayer
 logger: logging.Logger = logging.getLogger(__name__)
 class MLPKwargs(EstimatorKwargs):
    inputs: Tensor
    labels: NotRequired[Tensor]
 class MLP[K: MLPKwargs](Estimator[K]):
    """
    Base MLP architecture.
    """
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        hidden_dims: list[int] | None = None,
        norm_layer: Callable[..., nn.Module] | None = None,
        activation_fn: nn.Module | None = None,
        inplace: bool = False,
        bias: bool = True,
        dropout: float = 0.0,
        verbose: bool = True,
    ) -> None:
        """
        Parameters:
            input_dim: dimensionality of the input
            output_dim: dimensionality of the output
            hidden_dims: dimensionalities of hidden layers
        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dims = hidden_dims or []
        self.norm_layer = norm_layer
        self.activation_fn = activation_fn or nn.ReLU
        # self._layers: nn.ModuleList = nn.ModuleList()
        self._layers = []
        layer_in_dim = input_dim
        for layer_out_dim in self.hidden_dims:
            hidden_layer = nn.Linear(layer_in_dim, layer_out_dim, bias=bias)
            self._layers.append(hidden_layer)
            if norm_layer is not None:
                self._layers.append(norm_layer(layer_out_dim))
            self._layers.append(self.activation_fn(inplace=inplace))
            self._layers.append(nn.Dropout(dropout, inplace=inplace))
            layer_in_dim = layer_out_dim
        self._layers.append(nn.Linear(layer_in_dim, self.output_dim))
        self._net = nn.Sequential(*self._layers)
        if verbose:
            self.log_arch()
    def _clamp_rand(self, x: Tensor) -> Tensor:
        return torch.clamp(
            x + (1.0 / 127.0) * (torch.rand_like(x) - 0.5),
            min=-1.0,
            max=1.0,
        )
    def forward(self, **kwargs: Unpack[K]) -> tuple[Tensor, ...]:
        inputs = kwargs["inputs"]
        x = self._net(inputs)
        return (x,)
    def loss(self, **kwargs: Unpack[K]) -> Generator[Tensor]:
        predictions = self(**kwargs)[0]
        labels = kwargs["labels"]
        yield F.mse_loss(predictions, labels)
    def metrics(self, **kwargs: Unpack[K]) -> dict[str, float]:
        with torch.no_grad():
            loss = next(self.loss(**kwargs)).item()
            predictions = self(**kwargs)[0]
            labels = kwargs["labels"]
            mae = F.l1_loss(predictions, labels).item()
        return {
            "loss": loss,
            "mse": loss,
            "mae": mae,
            "grad_norm": get_grad_norm(self)
        }
    def optimizers(
        self,
        **kwargs: Unpack[OptimizerKwargs],
    ) -> tuple[Optimizer, ...]:
        """
        """
        default_kwargs: Unpack[OptimizerKwargs] = {
            "lr": 1e-3,
            "eps": 1e-8,
        }
        opt_kwargs = {**default_kwargs, **kwargs}
        optimizer = torch.optim.AdamW(
            self.parameters(),
            **opt_kwargs,
        )
        return (optimizer,)
    def epoch_step(self) -> None:
        return None
    def epoch_write(
        self,
        writer: SummaryWriter,
        step: int | None = None,
        val: bool = False,
        **kwargs: Unpack[K],
    ) -> None:
        return None
    def log_arch(self) -> None:
        super().log_arch()
        logger.info(f"| > {self.input_dim=}")
        logger.info(f"| > {self.hidden_dims=}")
        logger.info(f"| > {self.output_dim=}")
--- a/trainlib/estimators/rnn.py
+++ b/trainlib/estimators/rnn.py
@@ -458,7 +458,6 @@ class ConvGRU[K: RNNKwargs](Estimator[K]):
            "grad_norm": get_grad_norm(self)
        }
    def optimizers(
        self,
        **kwargs: Unpack[OptimizerKwargs],
--- a/trainlib/trainer.py
+++ b/trainlib/trainer.py
@@ -258,10 +258,10 @@ class Trainer[I, K: EstimatorKwargs]:
        return val_loss_sums
-    def train(
+    def train[B](
        self,
        dataset: BatchedDataset[..., ..., I],
-        batch_estimator_map: Callable[[I, Self], K],
+        batch_estimator_map: Callable[[B, Self], K],
        lr: float = 1e-3,
        eps: float = 1e-8,
        max_grad_norm: float | None = None,
@@ -280,6 +280,10 @@ class Trainer[I, K: EstimatorKwargs]:
        summary_writer: SummaryWriter | None = None,
    ) -> Estimator:
        """
        TODO: consider making the dataloader ``collate_fn`` an explicit
        parameter with a type signature that reflects ``B``, connecting the
        ``batch_estimator_map`` somewhere
        Note: this method attempts to implement a general scheme for passing
        needed items to the estimator's loss function from the dataloader. The
        abstract ``Estimator`` base only requires the model output be provided
@@ -317,10 +321,12 @@ class Trainer[I, K: EstimatorKwargs]:
                   [2, 2],
                   [3, 3]] )
-            This function should map from batches (which should be *item
+            This function should map from batches - which *may* be item
-            shaped*, i.e., have an ``I`` skeleton, even if stacked items may be
+            shaped, i.e., have an ``I`` skeleton, even if stacked items may be
-            different on the inside) into estimator keyword arguments (type
+            different on the inside - into estimator keyword arguments (type
-            ``K``).
+            ``K``). Collation behavior from a DataLoader (which can be
            customized) doesn't consistently yield a known type shape, however,
            so it's not appropriate to use ``I`` as the callable param type.
        Parameters:
            dataset: dataset to train the estimator