initial commit
This commit is contained in:
20
.gitignore
vendored
Normal file
20
.gitignore
vendored
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# generic
|
||||||
|
__pycache__/
|
||||||
|
*.egg-info/
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# package-specific
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# vendor/build files
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
doc/_autoref/
|
||||||
|
doc/_autosummary/
|
||||||
|
doc/_build/
|
||||||
|
|
||||||
|
# misc local
|
||||||
|
/Makefile
|
||||||
|
notebooks/
|
||||||
|
|
||||||
105
README.md
Normal file
105
README.md
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
# Overview
|
||||||
|
Package summary goes here, ideally with a diagram
|
||||||
|
|
||||||
|
# Install
|
||||||
|
Installation instructions
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install <package>
|
||||||
|
```
|
||||||
|
|
||||||
|
or as a CLI tool
|
||||||
|
|
||||||
|
```sh
|
||||||
|
uv tool install <package>
|
||||||
|
```
|
||||||
|
|
||||||
|
# Development
|
||||||
|
- Initialize/synchronize the project with `uv sync`, creating a virtual
|
||||||
|
environment with base package dependencies.
|
||||||
|
- Depending on needs, install the development dependencies with `uv sync
|
||||||
|
--extra dev`.
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
- To run the unit tests, make sure to first have the test dependencies
|
||||||
|
installed with `uv sync --extra test`, then run `make test`.
|
||||||
|
- For notebook testing, run `make install-kernel` to make the environment
|
||||||
|
available as a Jupyter kernel (to be selected when running notebooks).
|
||||||
|
|
||||||
|
# Documentation
|
||||||
|
- Install the documentation dependencies with `uv sync --extra doc`.
|
||||||
|
- Run `make docs-build` (optionally preceded by `make docs-clean`), and serve
|
||||||
|
locally with `docs-serve`.
|
||||||
|
|
||||||
|
# Development remarks
|
||||||
|
- Across `Trainer` / `Estimator` / `Dataset`, I've considered a
|
||||||
|
`ParamSpec`-based typing scheme to better orchestrate alignment in the
|
||||||
|
`Trainer.train()` loop, e.g., so we can statically check whether a dataset
|
||||||
|
appears to be fulfilling the argument requirements for the estimator's
|
||||||
|
`loss()` / `metrics()` methods. Something like
|
||||||
|
|
||||||
|
```py
|
||||||
|
class Estimator[**P](nn.Module):
|
||||||
|
def loss(
|
||||||
|
self,
|
||||||
|
input: Tensor,
|
||||||
|
*args: P.args,
|
||||||
|
**kwargs: P.kwargs,
|
||||||
|
) -> Generator:
|
||||||
|
...
|
||||||
|
|
||||||
|
class Trainer[**P]:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator: Estimator[P],
|
||||||
|
...
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
|
||||||
|
might be how we begin threading signatures. But ensuring dataset items can
|
||||||
|
match `P` is challenging. You can consider a "packed" object where we
|
||||||
|
obfuscate passing data through `P`-signatures:
|
||||||
|
|
||||||
|
```py
|
||||||
|
class PackedItem[**P]:
|
||||||
|
def __init__(self, *args: P.args, **kwargs: P.kwargs) -> None:
|
||||||
|
self._args = args
|
||||||
|
self._kwargs = kwargs
|
||||||
|
|
||||||
|
def apply[R](self, func: Callable[P, R]) -> R:
|
||||||
|
return func(*self._args, **self._kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class BatchedDataset[U, R, I, **P](Dataset):
|
||||||
|
@abstractmethod
|
||||||
|
def _process_item_data(
|
||||||
|
self,
|
||||||
|
item_data: I,
|
||||||
|
item_index: int,
|
||||||
|
) -> PackedItem[P]:
|
||||||
|
...
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[PackedItem[P]]:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Meaningfully shaping those signatures is what remains, but you can't really
|
||||||
|
do this, not with typical type expression flexibility. For instance, if I'm
|
||||||
|
trying to appropriately type my base `TupleDataset`:
|
||||||
|
|
||||||
|
```py
|
||||||
|
class SequenceDataset[I, **P](HomogenousDataset[int, I, I, P]):
|
||||||
|
...
|
||||||
|
|
||||||
|
class TupleDataset[I](SequenceDataset[tuple[I, ...], ??]):
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Here there's no way for me to shape a `ParamSpec` to indicate arbitrarily
|
||||||
|
many arguments of a fixed type (`I` in this case) to allow me to unpack my
|
||||||
|
item tuples into an appropriate `PackedItem`.
|
||||||
|
|
||||||
|
Until this (among other issues) becomes clearer, I'm setting up around a
|
||||||
|
simpler `TypedDict` type variable. We won't have particularly strong static
|
||||||
|
checks for item alignment inside `Trainer`, but this seems about as good as I
|
||||||
|
can get around the current infrastructure.
|
||||||
84
pyproject.toml
Normal file
84
pyproject.toml
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "trainlib"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
||||||
|
requires-python = ">=3.13"
|
||||||
|
authors = [
|
||||||
|
{ name="Sam Griesemer", email="git@olog.io" },
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
license = "MIT"
|
||||||
|
keywords = [
|
||||||
|
"machine-learning",
|
||||||
|
]
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: End Users/Desktop",
|
||||||
|
]
|
||||||
|
dependencies = [
|
||||||
|
"colorama>=0.4.6",
|
||||||
|
"matplotlib>=3.10.8",
|
||||||
|
"numpy>=2.4.1",
|
||||||
|
"tensorboard>=2.20.0",
|
||||||
|
"torch>=2.5.1",
|
||||||
|
"tqdm>=4.67.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
symconf = "trainlib.__main__:main"
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"ipykernel",
|
||||||
|
]
|
||||||
|
doc = [
|
||||||
|
"furo",
|
||||||
|
"myst-parser",
|
||||||
|
"sphinx",
|
||||||
|
"sphinx-togglebutton",
|
||||||
|
"sphinx-autodoc-typehints",
|
||||||
|
]
|
||||||
|
test = [
|
||||||
|
"pytest",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://doc.olog.io/trainlib"
|
||||||
|
Documentation = "https://doc.olog.io/trainlib"
|
||||||
|
Repository = "https://git.olog.io/olog/trainlib"
|
||||||
|
Issues = "https://git.olog.io/olog/trainlib/issues"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["trainlib*"]
|
||||||
|
|
||||||
|
# for static data files under package root
|
||||||
|
# [tool.setuptools.package-data]
|
||||||
|
# "<package>" = ["data/*.toml"]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 79
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["ANN", "E", "F", "UP", "B", "SIM", "I", "C4", "PERF"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
length-sort = true
|
||||||
|
order-by-type = false
|
||||||
|
force-sort-within-sections = false
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
"tests/**" = ["S101"]
|
||||||
|
"**/__init__.py" = ["F401"]
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
quote-style = "double"
|
||||||
|
indent-style = "space"
|
||||||
|
docstring-code-format = true
|
||||||
0
trainlib/__init__.py
Normal file
0
trainlib/__init__.py
Normal file
964
trainlib/dataset.py
Normal file
964
trainlib/dataset.py
Normal file
@@ -0,0 +1,964 @@
|
|||||||
|
"""
|
||||||
|
Marginalizing out the modality layer:
|
||||||
|
|
||||||
|
With ``domain`` being an instance variable, one possible interpretation of
|
||||||
|
the object structures here is that one could completely abstract away
|
||||||
|
the domain model, defining only item structures and processing data. You
|
||||||
|
could have a single dataset definition for a particular concrete dataset,
|
||||||
|
and so long as we're talking about the same items, it can be instantiated
|
||||||
|
using *any domain*. You wouldn't need specific subclasses for disk or
|
||||||
|
network or in-memory; you can tell it directly at runtime.
|
||||||
|
|
||||||
|
That's an eventual possibility, anyway. As it stands, however, this is
|
||||||
|
effectively impossible:
|
||||||
|
|
||||||
|
You can't easily abstract the batch -> item splitting process, i.e.,
|
||||||
|
``_process_batch_data()``. A list-based version of the dataset you're
|
||||||
|
trying to define might have an individual item tuple at every index,
|
||||||
|
whereas a disk-based version might have tuples batched across a few files.
|
||||||
|
This can't reliably be inferred, nor can it be pushed to the
|
||||||
|
``Domain``-level without needing equal levels of specialization (you'd just
|
||||||
|
end up needing the exact same structural distinctions in the ``Domain``
|
||||||
|
hierarchy). So *somewhere* you need a batch splitting implementation that
|
||||||
|
is both item structure-dependent *and* domain-dependent...the question is
|
||||||
|
how dynamic you're willing to be about where it comes from. Right now, we
|
||||||
|
require this actually be defined in the ``_process_batch_data()`` method,
|
||||||
|
meaning you'll need a specific ``Dataset`` class for each domain you want
|
||||||
|
to support (e.g., ``MNISTDisk``, ``MNISTList``, ``MNISTNetwork``, etc), or
|
||||||
|
at least for each domain where "interpreting" a batch could possibly
|
||||||
|
differ. This is a case where the interface is all that enforces a
|
||||||
|
distinction: if you've got two domains that can be counted on to yield
|
||||||
|
batches in the exact same way and can use the same processing, then you
|
||||||
|
could feasibly provide ``Domain`` objects from either at runtime and have
|
||||||
|
no issues. We're "structurally blind" to any differentiation beyond the URI
|
||||||
|
and resource types by design, so two different domain implementations with
|
||||||
|
the same type signature ``Domain[U, R]`` should be expected to work fine at
|
||||||
|
runtime (again, so long as they don't also need different batch
|
||||||
|
processing), but that's not affording us much flexibility, i.e., most of
|
||||||
|
the time we'll still be defining new dataset classes for each domain.
|
||||||
|
|
||||||
|
I initially flagged this as feasible, however, because one could imagine
|
||||||
|
accepting a batch processing method upon instantiation rather than
|
||||||
|
structurally bolting it into the ``Dataset`` definition. This would require
|
||||||
|
knowledge of the item structure ``I`` as well as the ``Domain[U, R]``, so
|
||||||
|
such a function will always have to be (I, U, R)-dependent. It nevertheless
|
||||||
|
would take out some of the pain of having to define new dataset classes;
|
||||||
|
instead, you'd just need to define the batch processing method. I see this
|
||||||
|
as a worse alternative to just defining *inside* a safe context like a new
|
||||||
|
dataset class: you know the types you have to respect, and you stick that
|
||||||
|
method exactly in a context where it's understood. Freeing this up doesn't
|
||||||
|
lighten the burden of processing logic, it just changes *when* it has to be
|
||||||
|
provided, and that's not worth much (to me) in this case given the bump in
|
||||||
|
complexity. (Taking this to the extreme: you could supply *all* of an
|
||||||
|
object's methods "dynamically" and glue them together at runtime so long as
|
||||||
|
they all played nice. But wherever you were "laying them out" beforehand is
|
||||||
|
exactly the job of a class to begin with, so you don't end up with anything
|
||||||
|
more dynamic. All we're really discussing here is pushing around
|
||||||
|
unavoidable complexity inside and outside of the "class walls," and in the
|
||||||
|
particular case of ``_process_batch_data()``, it feels much better when
|
||||||
|
it's on the inside.)
|
||||||
|
|
||||||
|
Holding:
|
||||||
|
@abstractmethod
|
||||||
|
def _get_uri_groups(self) -> Iterable[tuple[U, ...]]:
|
||||||
|
Get URI groups for each batch.
|
||||||
|
|
||||||
|
If there's more than one URI per batch (e.g., a data file and a
|
||||||
|
metadata file), zip the URIs such that we have a tuple of URIs per
|
||||||
|
batch.
|
||||||
|
|
||||||
|
Note that this effectively defines the index style over batches in the
|
||||||
|
attached domain. We get an ``int -> tuple[U, ...]`` map that turns
|
||||||
|
batch indices into URIs that can be read under the domain.
|
||||||
|
``get_batch()`` turns an integer index into its corresponding
|
||||||
|
``tuple[U, ...]``, reading the resources with ``_read_resources()`` in
|
||||||
|
the tuple, treating them as providers of batched data.
|
||||||
|
``_read_resources()`` passes through to the attached domain logic,
|
||||||
|
which, although common, need not supply an explicit iterable of batch
|
||||||
|
items: we just access items with ``__getitem__()`` and may ask for
|
||||||
|
``__len__``. So the returned URI group collection (this method) does
|
||||||
|
need to be iterable to measure the number of batches, but the batch
|
||||||
|
objects that are ultimately produced by these URI groups need not be
|
||||||
|
iterables themselves.
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _read_resources(
|
||||||
|
self,
|
||||||
|
uri_group: tuple[U, ...],
|
||||||
|
batch_index: int
|
||||||
|
) -> tuple[R, ...]:
|
||||||
|
Read batch files at the provided paths.
|
||||||
|
|
||||||
|
This method should operate on a single tuple from the list of batch
|
||||||
|
tuples returned by the ``_get_uri_groups()`` method. That is, it reads
|
||||||
|
all of the resources for a single batch and returns a tuple of the same
|
||||||
|
size with their contents.
|
||||||
|
|
||||||
|
Note: the dependence on a batch index is mostly here to make
|
||||||
|
multi-dataset composition easier later. In-dataset, you don't need to
|
||||||
|
know the batch index to simply process URIs, but across datasets you
|
||||||
|
need it to find out the origin of the batch (and process those URIs
|
||||||
|
accordingly).
|
||||||
|
|
||||||
|
return tuple(self.domain.read(uri) for uri in uri_group)
|
||||||
|
|
||||||
|
# pulling the type variable out of the inline generic b/c `ty` has trouble
|
||||||
|
# understanding bound type variables in subclasses (specifically with Self@)
|
||||||
|
T = TypeVar("T", bound=NamedTuple)
|
||||||
|
|
||||||
|
class NamedTupleDataset[I](Dataset):
|
||||||
|
def __init__(self, data_list: list[I]) -> None:
|
||||||
|
self.data_list = data_list
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return len(self.data_list)
|
||||||
|
|
||||||
|
def __getitem__(self, index: int) -> I:
|
||||||
|
return self.data_list[index]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
|
import random
|
||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from copy import copy
|
||||||
|
from bisect import bisect
|
||||||
|
from typing import Unpack, TypedDict
|
||||||
|
from functools import lru_cache
|
||||||
|
from collections import defaultdict
|
||||||
|
from collections.abc import Callable, Iterator
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
|
||||||
|
from trainlib.utils import job
|
||||||
|
from trainlib.domain import Domain, SequenceDomain
|
||||||
|
from trainlib.transform import Transform
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetKwargs[I](TypedDict, total=False):
|
||||||
|
pre_transform: Transform[I]
|
||||||
|
post_transform: Transform[I]
|
||||||
|
batch_cache_limit: int
|
||||||
|
preload: bool
|
||||||
|
num_workers: int
|
||||||
|
|
||||||
|
|
||||||
|
class BatchedDataset[U, R, I](Dataset):
|
||||||
|
"""
|
||||||
|
Generic dataset that dynamically pulls batched data from resources (e.g.,
|
||||||
|
files, remote locations, streams, etc).
|
||||||
|
|
||||||
|
The class is generic over a URI type ``U``, a resource type ``R`` (both of
|
||||||
|
which are used to concretize a domain ``Domain[U, R]``), and an item type
|
||||||
|
``I``.
|
||||||
|
|
||||||
|
Pipeline overview:
|
||||||
|
|
||||||
|
```
|
||||||
|
Domain -> [U] (get _batch_uris)
|
||||||
|
U -> R (domain access ; Rs provide batches)
|
||||||
|
R -> [I] (cache here ; _process_batch_data to use load_transform)
|
||||||
|
[I] -> I (human item obj ; _get_item)
|
||||||
|
I -> **P (final packed item ; __getitem__ to use transform)
|
||||||
|
```
|
||||||
|
|
||||||
|
Note^1: as far as positioning, this class is meant to play nice with
|
||||||
|
PyTorch DataLoaders, hence the inheritance from ``torch.Dataset``. The
|
||||||
|
value add for this over the ``torch.Dataset`` base is almost entirely in
|
||||||
|
the logic it implements to map out of *batched resources* that are holding
|
||||||
|
data, and flattening it out into typical dataset items. There are also some
|
||||||
|
QoL items when it comes to splitting and balancing samples.
|
||||||
|
|
||||||
|
Note^2: even though ``Domains`` implement iterators over their URIs, this
|
||||||
|
doesn't imply a ``BatchedDataset`` is iterable. This just means we can walk
|
||||||
|
over the resources that provide data, but we don't necessarily presuppose
|
||||||
|
an ordered walk over samples within batches. Point being:
|
||||||
|
``torch.Dataset``, not ``torch.IterableDataset``, is the appropriate
|
||||||
|
superclass, even when we're working around iterable ``Domains``.
|
||||||
|
|
||||||
|
Note^3: transforms are expected to operate on ``I``-items and produce
|
||||||
|
``I``-items. They shouldn't be the "introducers" of ``I`` types from some
|
||||||
|
other intermediate representation, nor should they map from ``I`` to
|
||||||
|
something else. Point being: the dataset definition should be able to map
|
||||||
|
resources ``R`` to ``I`` without a transform: that much should be baked
|
||||||
|
into the class definition. If you find you're expecting the transform to do
|
||||||
|
that for you, you should consider pulling in some common structure across
|
||||||
|
the allowed transforms and make it a fixed part of the class.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
domain: Domain[U, R],
|
||||||
|
pre_transform: Transform[I] | None = None,
|
||||||
|
post_transform: Transform[I] | None = None,
|
||||||
|
batch_cache_limit: int | None = None,
|
||||||
|
preload: bool = False,
|
||||||
|
num_workers: int = 1,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
pre_transform: transform to apply over items during loading (in
|
||||||
|
``_process_batch_data()``), i.e., *before* going into
|
||||||
|
persistent storage
|
||||||
|
post_transform: transform to apply just prior to returning an item
|
||||||
|
(in ``_process_item_data()``), i.e., only *after* retrieval
|
||||||
|
from persistent storage
|
||||||
|
batch_cache_limit: the max number of batches to cache at any
|
||||||
|
one time
|
||||||
|
preload: whether to load all data into memory during instantiation
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.domain = domain
|
||||||
|
self.pre_transform = pre_transform
|
||||||
|
self.post_transform = post_transform
|
||||||
|
self.batch_cache_limit = batch_cache_limit
|
||||||
|
self.num_workers = num_workers
|
||||||
|
|
||||||
|
logger.info("Fetching URIs...")
|
||||||
|
self._batch_uris: list[U] = list(domain)
|
||||||
|
|
||||||
|
self._indices: list[int] | None = None
|
||||||
|
self._dataset_len: int | None = None
|
||||||
|
self._num_batches: int = len(domain)
|
||||||
|
|
||||||
|
self.get_batch: Callable[[int], list[I]] = lru_cache(
|
||||||
|
maxsize=batch_cache_limit
|
||||||
|
)(self._get_batch)
|
||||||
|
|
||||||
|
if preload:
|
||||||
|
self.load_all(num_workers=num_workers)
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def _get_dataset_len(self) -> int:
|
||||||
|
"""
|
||||||
|
Calculate the total dataset size in units of samples (not batches).
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def _get_batch_for_item(self, item_index: int) -> tuple[int, int]:
|
||||||
|
"""
|
||||||
|
Return the index of the batch containing the item at the provided item
|
||||||
|
index, and the index of the item within that batch.
|
||||||
|
|
||||||
|
The behavior of this method can vary depending on what we know about
|
||||||
|
batch sizes, and should therefore be implemented by inheriting classes.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
batch_index: int
|
||||||
|
index_in_batch: int
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def _process_batch_data(
|
||||||
|
self,
|
||||||
|
batch_data: R,
|
||||||
|
batch_index: int,
|
||||||
|
) -> list[I]:
|
||||||
|
"""
|
||||||
|
Process raw domain resource data (e.g., parse JSON or load tensors) and
|
||||||
|
split accordingly, such that the returned batch is a collection of
|
||||||
|
``I`` items.
|
||||||
|
|
||||||
|
If an inheriting class wants to allow dynamic transforms, this is the
|
||||||
|
place to use a provided ``pre_transform``; the collection of items
|
||||||
|
produced by this method are cached as a batch, so results from such a
|
||||||
|
transform will be stored.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
batch_data: tuple of resource data
|
||||||
|
batch_index: index of batch
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def _process_item_data(
|
||||||
|
self,
|
||||||
|
item_data: I,
|
||||||
|
item_index: int,
|
||||||
|
) -> I:
|
||||||
|
"""
|
||||||
|
Process individual items and produce final tuples.
|
||||||
|
|
||||||
|
If an inheriting class wants to allow dynamic transforms, this is the
|
||||||
|
place to use a provided ``post_transform``; items are pulled from the
|
||||||
|
cache (if enabled) and processed before being returned as the final
|
||||||
|
tuple outputs (so this processing is not persistent).
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _get_item(self, item_index: int) -> I:
|
||||||
|
"""
|
||||||
|
Get the item data and zip with the item header.
|
||||||
|
|
||||||
|
Items should be the most granular representation of dataset samples
|
||||||
|
with maximum detail (i.e., yield all available information). An
|
||||||
|
iterator over these representations across all samples can be retrieved
|
||||||
|
with `.items()`.
|
||||||
|
|
||||||
|
Note that return values from `__getitem__()` are "cleaned up" versions
|
||||||
|
of this representation, with minimal info needed for training.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if item_index >= len(self):
|
||||||
|
raise IndexError
|
||||||
|
|
||||||
|
# alt indices redefine index count
|
||||||
|
item_index = self.indices[item_index]
|
||||||
|
batch_index, index_in_batch = self._get_batch_for_item(item_index)
|
||||||
|
|
||||||
|
return self.get_batch(batch_index)[index_in_batch]
|
||||||
|
|
||||||
|
def _get_batch(self, batch_index: int) -> list[I]:
|
||||||
|
"""
|
||||||
|
Return the batch data for the provided index.
|
||||||
|
|
||||||
|
Note that we require a list return type. This is where the rubber meets
|
||||||
|
the road in terms of expanding batches: the outputs here get cached, if
|
||||||
|
caching is enabled. If we were to defer batch expansion to some later
|
||||||
|
stage, that caching will be more or less worthless. For instance, if
|
||||||
|
``._process_batch_data()`` was an iterator, at best the batch
|
||||||
|
processing logic could be delayed, but then you'd either 1) cache the
|
||||||
|
iterator reference, or 2) have to further delay caching until
|
||||||
|
post-batch processing. Further, ``._process_batch_data()`` can (and
|
||||||
|
often does) depend on the entire batch, so you can't handle that
|
||||||
|
item-wise: you've got to pass all batch data in and can't act on
|
||||||
|
slices, so doing this would be irrelevant anyway.
|
||||||
|
|
||||||
|
How about iterators out from ``_read_resources()``? Then your
|
||||||
|
``_process_batch_data()`` can iterate as needed and do the work later?
|
||||||
|
The point here is that this distinction is irrelevant because you're
|
||||||
|
reading resources and processing the data here in the same method:
|
||||||
|
they're always connected, and nothing would notice if you waited
|
||||||
|
between steps. The only way this could matter is if you split the
|
||||||
|
resource reading and batch processing steps across methods, but when it
|
||||||
|
actually comes to accessing/caching the batch, you'd have to expand
|
||||||
|
any delayed reads here. There's no way around needing to see all batch
|
||||||
|
data at once here, and we don't want to make that ambiguous: ``list``
|
||||||
|
output type it is.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger.debug("Batch cache miss, reading from root...")
|
||||||
|
|
||||||
|
if batch_index >= self._num_batches:
|
||||||
|
raise IndexError
|
||||||
|
|
||||||
|
batch_uri = self._batch_uris[batch_index]
|
||||||
|
batch_data = self.domain[batch_uri]
|
||||||
|
|
||||||
|
return self._process_batch_data(batch_data, batch_index)
|
||||||
|
|
||||||
|
def load_all(self, num_workers: int | None = None) -> list[list[I]]:
|
||||||
|
"""
|
||||||
|
Preload all data batches into the cache.
|
||||||
|
|
||||||
|
Can be useful when dynamically pulling data (as it's requested) isn't
|
||||||
|
desired. Requires that `batch_cache_limit=None`, i.e., the cache won't
|
||||||
|
continually remove previous batches as they're loaded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
assert self.batch_cache_limit is None, "Preloading under cache limit"
|
||||||
|
|
||||||
|
if num_workers is None:
|
||||||
|
num_workers = self.num_workers
|
||||||
|
|
||||||
|
thread_pool = ThreadPoolExecutor(max_workers=num_workers)
|
||||||
|
|
||||||
|
futures = []
|
||||||
|
for batch_index in range(self._num_batches):
|
||||||
|
future = thread_pool.submit(
|
||||||
|
self.get_batch,
|
||||||
|
batch_index,
|
||||||
|
)
|
||||||
|
futures.append(future)
|
||||||
|
|
||||||
|
job.process_futures(futures, "Loading dataset batches", "batch")
|
||||||
|
thread_pool.shutdown(wait=True)
|
||||||
|
|
||||||
|
return [future.result() for future in futures]
|
||||||
|
|
||||||
|
def split(
|
||||||
|
self,
|
||||||
|
fracs: list[float],
|
||||||
|
dataset: "BatchedDataset | None" = None,
|
||||||
|
by_attr: str | list[str | None] | None = None,
|
||||||
|
shuffle_strata: bool = True,
|
||||||
|
) -> list["BatchedDataset"]:
|
||||||
|
"""
|
||||||
|
Split dataset into fractional pieces by data attribute.
|
||||||
|
|
||||||
|
If `by_attr` is None, recovers typical fractional splitting of dataset
|
||||||
|
items, partitioning by size. Using None anywhere will index each item
|
||||||
|
into its own bucket, i.e., by its index. For instance,
|
||||||
|
|
||||||
|
- by_attr=["color"] -> {("red", 1), ("red", 2)},
|
||||||
|
{("blue", 1), ("blue", 2)}
|
||||||
|
|
||||||
|
Splits on the attribute such that each subset contains entire strata
|
||||||
|
of the attribute. "Homogeneity within clusters"
|
||||||
|
|
||||||
|
- `by_attr=["color", None]` -> {("red", 1), ("blue", 1)},
|
||||||
|
{("red", 2), ("blue", 2)}
|
||||||
|
|
||||||
|
Stratifies by attribute and then splits "by index" within, uniformly
|
||||||
|
grabbing samples across strata to form new clusters. "Homogeneity
|
||||||
|
across clusters"
|
||||||
|
|
||||||
|
Note that the final list of Subsets returned are built from shallow
|
||||||
|
copies of the underlying dataset (i.e., `self`) to allow manual
|
||||||
|
intervention with dataset attributes (e.g., setting the splits to have
|
||||||
|
different `transform`s). This is subject to possibly unexpected
|
||||||
|
behavior if re-caching data or you need a true copy of all data in
|
||||||
|
memory, but should otherwise leave most interactions unchanged.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
shuffle_strata: shuffle the strata order before split is drawn. We
|
||||||
|
parameterize this because a dataloader-level shuffle operation
|
||||||
|
will only change the order of the indices in the resulting
|
||||||
|
splits; only a shuffle of items inside the strata can change
|
||||||
|
the actual content of the splits themselves.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if by_attr == []:
|
||||||
|
raise ValueError("Cannot parse empty value list")
|
||||||
|
|
||||||
|
assert (
|
||||||
|
math.isclose(sum(fracs), 1) and sum(fracs) <= 1
|
||||||
|
), "Fractions do not sum to 1"
|
||||||
|
|
||||||
|
if isinstance(by_attr, str) or by_attr is None:
|
||||||
|
by_attr = [by_attr]
|
||||||
|
|
||||||
|
if dataset is None:
|
||||||
|
dataset = self
|
||||||
|
# dataset = DictDataset([
|
||||||
|
# self._get_item(i) for i in range(len(self))
|
||||||
|
# ])
|
||||||
|
|
||||||
|
# group samples by specified attr
|
||||||
|
attr_dict = defaultdict(list)
|
||||||
|
attr_key, by_attr = by_attr[0], by_attr[1:]
|
||||||
|
# for i in range(len(dataset)):
|
||||||
|
for i, item in enumerate(dataset.items()):
|
||||||
|
# item = dataset[i]
|
||||||
|
if attr_key is None:
|
||||||
|
attr_val = i
|
||||||
|
elif attr_key in item:
|
||||||
|
attr_val = item[attr_key]
|
||||||
|
else:
|
||||||
|
raise IndexError(f"Attribute {attr_key} not in dataset item")
|
||||||
|
attr_dict[attr_val].append(i)
|
||||||
|
|
||||||
|
if by_attr == []:
|
||||||
|
attr_keys = list(attr_dict.keys())
|
||||||
|
|
||||||
|
# shuffle keys; randomized group-level split
|
||||||
|
if shuffle_strata:
|
||||||
|
random.shuffle(attr_keys)
|
||||||
|
|
||||||
|
# considering: defer to dataloader shuffle param; should have same
|
||||||
|
# effect shuffle values; has no impact on where the split is drawn
|
||||||
|
# for attr_vals in attr_dict.values():
|
||||||
|
# random.shuffle(attr_vals)
|
||||||
|
|
||||||
|
# fractionally split over attribute keys
|
||||||
|
offset, splits = 0, []
|
||||||
|
for frac in fracs[:-1]:
|
||||||
|
frac_indices = []
|
||||||
|
frac_size = int(frac * len(attr_keys))
|
||||||
|
for j in range(offset, offset + frac_size):
|
||||||
|
j_indices = attr_dict.pop(attr_keys[j])
|
||||||
|
frac_indices.extend(j_indices)
|
||||||
|
offset += frac_size
|
||||||
|
splits.append(frac_indices)
|
||||||
|
|
||||||
|
rem_indices = []
|
||||||
|
for r_indices in attr_dict.values():
|
||||||
|
rem_indices.extend(r_indices)
|
||||||
|
splits.append(rem_indices)
|
||||||
|
|
||||||
|
subsets = []
|
||||||
|
for split in splits:
|
||||||
|
subset = copy(dataset)
|
||||||
|
subset.indices = split
|
||||||
|
subsets.append(subset)
|
||||||
|
|
||||||
|
return subsets
|
||||||
|
else:
|
||||||
|
splits = [[] for _ in range(len(fracs))]
|
||||||
|
for index_split in attr_dict.values():
|
||||||
|
# subset = Subset(dataset, index_split)
|
||||||
|
subset = copy(dataset)
|
||||||
|
subset.indices = index_split
|
||||||
|
subset_splits = self.split(
|
||||||
|
fracs, subset, by_attr, shuffle_strata
|
||||||
|
)
|
||||||
|
|
||||||
|
# unpack stratified splits
|
||||||
|
for i, subset_split in enumerate(subset_splits):
|
||||||
|
# splits[i].extend([
|
||||||
|
# index_split[s] for s in subset_split.indices
|
||||||
|
# ])
|
||||||
|
splits[i].extend(subset_split.indices)
|
||||||
|
|
||||||
|
subsets = []
|
||||||
|
for split in splits:
|
||||||
|
subset = copy(dataset)
|
||||||
|
subset.reset_indices()
|
||||||
|
subset.indices = split
|
||||||
|
subsets.append(subset)
|
||||||
|
|
||||||
|
return subsets
|
||||||
|
|
||||||
|
# considering: defer to dataloader shuffle param; should have same
|
||||||
|
# effect shuffle each split after merging; may otherwise be homogeneous
|
||||||
|
# for split in splits:
|
||||||
|
# random.shuffle(split)
|
||||||
|
|
||||||
|
# return [Subset(copy(self), split) for split in splits]
|
||||||
|
|
||||||
|
def balance(
    self,
    dataset: "BatchedSubset[U, R, I] | None" = None,
    by_attr: str | list[str | None] | None = None,
    split_min_sizes: list[int] | None = None,
    split_max_sizes: list[int] | None = None,
    shuffle_strata: bool = True,
) -> None:
    """
    Balance the dataset in place.

    Computes a balanced index selection via ``_balance()`` and assigns it
    to ``self.indices``, masking the dataset down to that selection.
    """

    balanced_indices = self._balance(
        dataset=dataset,
        by_attr=by_attr,
        split_min_sizes=split_min_sizes,
        split_max_sizes=split_max_sizes,
        shuffle_strata=shuffle_strata,
    )
    self.indices = balanced_indices
|
||||||
|
|
||||||
|
def _balance(
    self,
    dataset: "BatchedSubset[U, R, I] | None" = None,
    by_attr: str | list[str | None] | None = None,
    split_min_sizes: list[int] | None = None,
    split_max_sizes: list[int] | None = None,
    shuffle_strata: bool = True,
) -> list[int]:
    """
    Compute a balanced list of item indices, optionally stratified.

    Items are grouped by the first attribute name in ``by_attr``; any
    remaining names are handled by recursing into each group. Groups
    smaller than the current level's minimum size are dropped, and all
    surviving groups are truncated to a common size (the smallest
    surviving group size, capped by the current level's maximum size).

    Parameters:
        dataset: dataset to balance; defaults to a subset view over self
        by_attr: attribute name(s) to group by; a ``None`` entry groups
            each item by its own index (i.e., no grouping at that level)
        split_min_sizes: per-level minimum group sizes
        split_max_sizes: per-level maximum group sizes
        shuffle_strata: when True, shuffle group order before truncation

    Returns:
        list of (relative) item indices composing the balanced selection

    Raises:
        ValueError: if ``by_attr`` is an empty list
        IndexError: if a requested attribute is missing from an item

    Note: behavior is a little odd for nested behavior; not exactly
    perfectly uniform throughout. This is a little difficult: you can't
    exactly know ahead of time the size of the subgroups across splits
    """

    if by_attr == []:
        raise ValueError("Cannot parse empty value list")

    if isinstance(by_attr, str) or by_attr is None:
        by_attr = [by_attr]

    if dataset is None:
        dataset = BatchedSubset(self, self.indices)

    if split_min_sizes == [] or split_min_sizes is None:
        split_min_sizes = [0]
    if split_max_sizes == [] or split_max_sizes is None:
        split_max_sizes = [len(dataset)]

    # group samples by specified attr
    attr_dict = defaultdict(list)
    attr_key, by_attr = by_attr[0], by_attr[1:]
    for i in range(len(dataset)):
        item = dataset[i]
        if attr_key is None:
            attr_val = i
        elif hasattr(item, attr_key):
            # fix: use hasattr() to mirror the getattr() access below;
            # ``attr_key in item`` tested *container membership* (e.g.,
            # a named tuple's values), not attribute presence
            attr_val = getattr(item, attr_key)
        else:
            raise IndexError(f"Attribute {attr_key} not in dataset item")
        attr_dict[attr_val].append(i)

    subset_splits = []
    split_min_size, split_min_sizes = (
        split_min_sizes[0],
        split_min_sizes[1:]
    )
    split_max_size, split_max_sizes = (
        split_max_sizes[0],
        split_max_sizes[1:]
    )
    for split_indices in attr_dict.values():
        if by_attr != []:
            # recurse into the group over the remaining attributes, then
            # map the recursive (group-relative) indices back into this
            # level's index space
            subset_indices = self._balance(
                BatchedSubset(dataset, split_indices),
                by_attr,
                split_min_sizes,
                split_max_sizes,
                shuffle_strata,
            )
            split_indices = [split_indices[s] for s in subset_indices]
        subset_splits.append(split_indices)

    # shuffle splits; randomized group-level split
    if shuffle_strata:
        random.shuffle(subset_splits)

    # note: split_min_size is smallest allowed, min_split_size is smallest
    # observed
    valid_splits = [
        ss for ss in subset_splits
        if len(ss) >= split_min_size
    ]

    # guard: if every group falls below the minimum size there is nothing
    # to balance (min() over an empty sequence would raise ValueError)
    if not valid_splits:
        return []

    min_split_size = min(len(split) for split in valid_splits)

    subset_indices = []
    for split_indices in valid_splits:
        # truncate each surviving group to the common (capped) size
        subset_indices.extend(
            split_indices[: min(min_split_size, split_max_size)]
        )

    return subset_indices
|
||||||
|
|
||||||
|
@property
def indices(self) -> list[int]:
    """
    Active item indices, lazily initialized to the identity mapping
    ``0..len(self)-1`` on first access.
    """

    cached = self._indices
    if cached is None:
        cached = list(range(len(self)))
        self._indices = cached
    return cached
|
||||||
|
|
||||||
|
@indices.setter
def indices(self, indices: list[int]) -> None:
    """
    Note: this logic facilitates nested re-indexing over the same base
    dataset. The underlying data remain the same, but when indices get set,
    you're effectively applying a mask over any existing indices, always
    operating *relative* to the existing mask.
    """

    # manually set new size
    # NOTE(review): the length is updated *before* the getter read below;
    # when ``_indices`` is still unset, the getter therefore builds its
    # identity list at the *new* length, so incoming indices must all be
    # < len(indices) in that case — confirm this is the intended contract
    self._dataset_len = len(indices)

    # note: this is a little tricky and compact; follow what happens when
    # _indices aren't already set
    # Each new index is translated through the *current* mask, composing
    # the masks rather than replacing them.
    self._indices = [
        self.indices[index]
        for index in indices
    ]
|
||||||
|
|
||||||
|
def reset_indices(self) -> None:
    """Drop the index mask and cached length, restoring the full dataset view."""
    self._indices = self._dataset_len = None
|
||||||
|
|
||||||
|
def items(self) -> Iterator[I]:
    """Yield every dataset item in index order."""
    yield from map(self._get_item, range(len(self)))
|
||||||
|
|
||||||
|
def __len__(self) -> int:
    """Dataset size, computed once via ``_get_dataset_len()`` and cached."""
    size = self._dataset_len
    if size is None:
        size = self._get_dataset_len()
        self._dataset_len = size
    return size
|
||||||
|
|
||||||
|
def __getitem__(self, index: int) -> I:
    # fetch the raw item data for the (mask-relative) index
    item_data = self._get_item(index)
    # translate the requested index through the active index mask before
    # handing it to the processing hook
    # NOTE(review): ``_get_item`` implementations seen elsewhere in this
    # file also map through ``self.indices`` — confirm the two mappings
    # don't double-apply
    index = self.indices[index]

    return self._process_item_data(item_data, index)
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[I]:
    """
    Iterate items in index order.

    Note: this method isn't technically needed given ``__getitem__`` is
    defined and we operate cleanly over integer indices 0..(N-1), so even
    without an explicit ``__iter__``, Python will fall back to a reliable
    iteration mechanism. We nevertheless implement the trivial logic below
    to convey intent and meet static type checks for iterables.
    """

    return map(self.__getitem__, range(len(self)))
|
||||||
|
|
||||||
|
|
||||||
|
class CompositeBatchedDataset[U, R, I](BatchedDataset[U, R, I]):
|
||||||
|
"""
|
||||||
|
Dataset class for wrapping individual datasets.
|
||||||
|
|
||||||
|
Note: because this remains a valid ``BatchedDataset``, we re-thread the
|
||||||
|
generic type variables through the set of composed datasets. That is, they
|
||||||
|
must have a common domain type ``Domain[U, R]``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
datasets: list[BatchedDataset[U, R, I]],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Parameters:
|
||||||
|
datasets: list of datasets
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.datasets = datasets
|
||||||
|
|
||||||
|
self._indices: list[int] | None = None
|
||||||
|
self._dataset_len: int | None = None
|
||||||
|
|
||||||
|
self._item_psum: list[int] = []
|
||||||
|
self._batch_psum: list[int] = []
|
||||||
|
|
||||||
|
def _compute_prefix_sum(self, arr: list[int]) -> list[int]:
|
||||||
|
if not arr:
|
||||||
|
return []
|
||||||
|
|
||||||
|
prefix_sum = [0] * (len(arr) + 1)
|
||||||
|
for i in range(len(arr)):
|
||||||
|
prefix_sum[i + 1] = prefix_sum[i] + arr[i]
|
||||||
|
|
||||||
|
return prefix_sum
|
||||||
|
|
||||||
|
def _get_dataset_for_item(self, item_index: int) -> tuple[int, int]:
|
||||||
|
dataset_index = bisect(self._item_psum, item_index) - 1
|
||||||
|
index_in_dataset = item_index - self._item_psum[dataset_index]
|
||||||
|
|
||||||
|
return dataset_index, index_in_dataset
|
||||||
|
|
||||||
|
def _get_dataset_for_batch(self, batch_index: int) -> tuple[int, int]:
|
||||||
|
dataset_index = bisect(self._batch_psum, batch_index) - 1
|
||||||
|
index_in_dataset = batch_index - self._batch_psum[dataset_index]
|
||||||
|
|
||||||
|
return dataset_index, index_in_dataset
|
||||||
|
|
||||||
|
def _get_batch_for_item(self, item_index: int) -> tuple[int, int]:
|
||||||
|
index_pair = self._get_dataset_for_item(item_index)
|
||||||
|
dataset_index, index_in_dataset = index_pair
|
||||||
|
dataset = self.datasets[dataset_index]
|
||||||
|
|
||||||
|
return dataset._get_batch_for_item(index_in_dataset)
|
||||||
|
|
||||||
|
def _get_dataset_len(self) -> int:
|
||||||
|
self.load_all()
|
||||||
|
|
||||||
|
dataset_batch_sizes = [len(dataset) for dataset in self.datasets]
|
||||||
|
dataset_sizes = [dataset._num_batches for dataset in self.datasets]
|
||||||
|
|
||||||
|
# this method will only be ran once; set this instance var
|
||||||
|
self._item_psum = self._compute_prefix_sum(dataset_batch_sizes)
|
||||||
|
self._batch_psum = self._compute_prefix_sum(dataset_sizes)
|
||||||
|
|
||||||
|
return self._item_psum[-1]
|
||||||
|
|
||||||
|
def _process_batch_data(
|
||||||
|
self,
|
||||||
|
batch_data: R,
|
||||||
|
batch_index: int,
|
||||||
|
) -> list[I]:
|
||||||
|
index_pair = self._get_dataset_for_item(batch_index)
|
||||||
|
dataset_index, index_in_dataset = index_pair
|
||||||
|
dataset = self.datasets[dataset_index]
|
||||||
|
|
||||||
|
return dataset._process_batch_data(batch_data, index_in_dataset)
|
||||||
|
|
||||||
|
def _process_item_data(self, item_data: I, item_index: int) -> I:
|
||||||
|
index_pair = self._get_dataset_for_item(item_index)
|
||||||
|
dataset_index, index_in_dataset = index_pair
|
||||||
|
dataset = self.datasets[dataset_index]
|
||||||
|
|
||||||
|
return dataset._process_item_data(item_data, index_in_dataset)
|
||||||
|
|
||||||
|
def _get_item(self, item_index: int) -> I:
|
||||||
|
item_index = self.indices[item_index]
|
||||||
|
|
||||||
|
index_pair = self._get_dataset_for_item(item_index)
|
||||||
|
dataset_index, index_in_dataset = index_pair
|
||||||
|
dataset = self.datasets[dataset_index]
|
||||||
|
|
||||||
|
return dataset._get_item(index_in_dataset)
|
||||||
|
|
||||||
|
def _get_batch(self, batch_index: int) -> list[I]:
|
||||||
|
index_pair = self._get_dataset_for_item(batch_index)
|
||||||
|
dataset_index, index_in_dataset = index_pair
|
||||||
|
dataset = self.datasets[dataset_index]
|
||||||
|
|
||||||
|
return dataset._get_batch(index_in_dataset)
|
||||||
|
|
||||||
|
def load_all(
|
||||||
|
self,
|
||||||
|
num_workers: int | None = None
|
||||||
|
) -> list[list[I]]:
|
||||||
|
batches = []
|
||||||
|
for dataset in self.datasets:
|
||||||
|
batches.extend(dataset.load_all(num_workers))
|
||||||
|
return batches
|
||||||
|
|
||||||
|
@property
|
||||||
|
def pre_transform(self) -> list[Transform[I] | None]:
|
||||||
|
return [dataset.pre_transform for dataset in self.datasets]
|
||||||
|
|
||||||
|
@pre_transform.setter
|
||||||
|
def pre_transform(self, pre_transform: Transform[I]) -> None:
|
||||||
|
for dataset in self.datasets:
|
||||||
|
dataset.pre_transform = pre_transform
|
||||||
|
|
||||||
|
@property
|
||||||
|
def post_transform(self) -> list[Transform[I] | None]:
|
||||||
|
return [dataset.post_transform for dataset in self.datasets]
|
||||||
|
|
||||||
|
@post_transform.setter
|
||||||
|
def post_transform(self, post_transform: Transform[I]) -> None:
|
||||||
|
for dataset in self.datasets:
|
||||||
|
dataset.post_transform = post_transform
|
||||||
|
|
||||||
|
|
||||||
|
class BatchedSubset[U, R, I](BatchedDataset[U, R, I]):
    """
    Index-masked view over an existing ``BatchedDataset``.
    """

    def __init__(
        self,
        dataset: BatchedDataset[U, R, I],
        indices: list[int],
    ) -> None:
        # underlying dataset being viewed
        self.dataset = dataset
        # NOTE(review): the parent ``__init__`` is not called; this
        # assignment presumably goes through the inherited ``indices``
        # property setter, which reads ``self._indices`` — confirm the
        # base class provides a default (e.g., class-level ``_indices``)
        self.indices = indices

    def _get_item(self, item_index: int) -> I:
        """
        Subset indices are "reset" in its context. Simply passes through
        to the wrapped dataset after translating through this view's mask.
        """

        return self.dataset._get_item(self.indices[item_index])

    def _get_dataset_len(self) -> int:
        # the view's size is just the number of masked-in indices
        return len(self.indices)
|
||||||
|
|
||||||
|
|
||||||
|
class HomogenousDataset[U, R, I](BatchedDataset[U, R, I]):
|
||||||
|
"""
|
||||||
|
Batched dataset where batches are equally sized.
|
||||||
|
|
||||||
|
Subclass from this base when you can count on the reference data being
|
||||||
|
prepared with fixed size batches (up to the last batch), e.g., that which
|
||||||
|
has been prepared with a `Packer`. This can greatly improve measurement
|
||||||
|
time of the dataset size by preventing the need for reading all batch files
|
||||||
|
upfront, and reduces the cost of identifying item batches from O(log n) to
|
||||||
|
O(1).
|
||||||
|
|
||||||
|
Methods left for inheriting classes:
|
||||||
|
|
||||||
|
- ``_process_item_data()``: item processing
|
||||||
|
- ``_process_batch_data()``: batch processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
domain: Domain[U, R],
|
||||||
|
**kwargs: Unpack[DatasetKwargs],
|
||||||
|
) -> None:
|
||||||
|
super().__init__(domain, **kwargs)
|
||||||
|
|
||||||
|
# determine batch size across dataset, along w/ possible partial final
|
||||||
|
# batch
|
||||||
|
bsize = rem = len(self.get_batch(self._num_batches - 1))
|
||||||
|
if self._num_batches > 1:
|
||||||
|
bsize, rem = len(self.get_batch(self._num_batches - 2)), bsize
|
||||||
|
|
||||||
|
self._batch_size: int = bsize
|
||||||
|
self._batch_rem: int = rem
|
||||||
|
|
||||||
|
def _get_dataset_len(self) -> int:
|
||||||
|
return self._batch_size * (self._num_batches - 1) + self._batch_rem
|
||||||
|
|
||||||
|
def _get_batch_for_item(self, item_index: int) -> tuple[int, int]:
|
||||||
|
return item_index // self._batch_size, item_index % self._batch_size
|
||||||
|
|
||||||
|
|
||||||
|
class HeterogenousDataset[U, R, I](BatchedDataset[U, R, I]):
|
||||||
|
"""
|
||||||
|
Batched dataset where batches have arbitrary size.
|
||||||
|
|
||||||
|
Methods left for inheriting classes:
|
||||||
|
|
||||||
|
- ``_process_item_data()``: item processing
|
||||||
|
- ``_process_batch_data()``: batch processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
domain: Domain[U, R],
|
||||||
|
**kwargs: Unpack[DatasetKwargs],
|
||||||
|
) -> None:
|
||||||
|
super().__init__(domain, **kwargs)
|
||||||
|
|
||||||
|
self._batch_size_psum: list[int] = []
|
||||||
|
|
||||||
|
def _compute_prefix_sum(self, arr: list[int]) -> list[int]:
|
||||||
|
if not arr:
|
||||||
|
return []
|
||||||
|
|
||||||
|
prefix_sum = [0] * (len(arr) + 1)
|
||||||
|
for i in range(len(arr)):
|
||||||
|
prefix_sum[i + 1] = prefix_sum[i] + arr[i]
|
||||||
|
|
||||||
|
return prefix_sum
|
||||||
|
|
||||||
|
def _get_dataset_len(self) -> int:
|
||||||
|
# type error below: no idea why this is flagged
|
||||||
|
batches = self.load_all()
|
||||||
|
batch_sizes = [len(batch) for batch in batches]
|
||||||
|
|
||||||
|
# this method will only be ran once; set this instance var
|
||||||
|
self._batch_size_psum = self._compute_prefix_sum(batch_sizes)
|
||||||
|
|
||||||
|
return self._batch_size_psum[-1]
|
||||||
|
|
||||||
|
def _get_batch_for_item(self, item_index: int) -> tuple[int, int]:
|
||||||
|
batch_index = bisect(self._batch_size_psum, item_index) - 1
|
||||||
|
index_in_batch = item_index - self._batch_size_psum[batch_index]
|
||||||
|
|
||||||
|
return batch_index, index_in_batch
|
||||||
|
|
||||||
|
|
||||||
|
class SequenceDataset[I](HomogenousDataset[int, I, I]):
|
||||||
|
"""
|
||||||
|
Trivial dataset skeleton for sequence domains.
|
||||||
|
|
||||||
|
``I``-typed sequence items map directly to dataset items. To produce a
|
||||||
|
fully concrete dataset, one still needs to define ``_process_item_data()``
|
||||||
|
to map from ``I``-items to tuples.
|
||||||
|
"""
|
||||||
|
|
||||||
|
domain: SequenceDomain[I]
|
||||||
|
|
||||||
|
def _process_batch_data(
|
||||||
|
self,
|
||||||
|
batch_data: I,
|
||||||
|
batch_index: int,
|
||||||
|
) -> list[I]:
|
||||||
|
if self.pre_transform is not None:
|
||||||
|
batch_data = self.pre_transform(batch_data)
|
||||||
|
|
||||||
|
return [batch_data]
|
||||||
|
|
||||||
|
|
||||||
|
class TupleDataset[T](SequenceDataset[tuple[T, ...]]):
|
||||||
|
"""
|
||||||
|
Trivial sequence-of-tuples dataset.
|
||||||
|
|
||||||
|
This is the most straightforward line to a concrete dataset from a
|
||||||
|
``BatchedDataset`` base class. That is: the underlying domain is a sequence
|
||||||
|
whose items are mapped to single-item batches and are already tuples.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _process_item_data(
|
||||||
|
self,
|
||||||
|
item_data: tuple[T, ...],
|
||||||
|
item_index: int,
|
||||||
|
) -> tuple[T, ...]:
|
||||||
|
if self.post_transform is not None:
|
||||||
|
item_data = self.post_transform(item_data)
|
||||||
|
|
||||||
|
return item_data
|
||||||
0
trainlib/datasets/__init__.py
Normal file
0
trainlib/datasets/__init__.py
Normal file
179
trainlib/datasets/disk.py
Normal file
179
trainlib/datasets/disk.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Any, NamedTuple
|
||||||
|
from pathlib import Path
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
from mema.dataset import HomogenousDataset
|
||||||
|
from mema.domains.disk import DiskDomain
|
||||||
|
|
||||||
|
|
||||||
|
class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
    """
    Dataset whose resources are files on disk (URIs are ``Path``s, raw
    resources are ``bytes``).

    The class-level declaration below is to satisfy the type checker, which

    1. Can't recognize an appropriately re-typed constructor arg like

        def __init__(
            self,
            domain: DiskDomain,
            ...
        ): ...

    This *does* match the parent generic for the U=Path, R=bytes context

        def __init__(
            self,
            domain: Domain[U, R],
            ...
        ): ...

    but the type checker doesn't see this.

    2. "Lifted" type variables out of generics can't be used as upper bounds,
    at least not without throwing type checker warnings (thanks to PEP695).
    So I'm not allowed to have

    ```
    class BatchedDataset[U, R, D: Domain[U, R]]:
        ...
    ```

    which could bring appropriately dynamic typing for ``Domain``s, but is
    not a sufficiently concrete upper bound.

    So: we settle for a class-level type declaration, which despite not being
    technically appropriately scoped, isn't harming anything and satisfies
    ``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``).
    """

    # narrow the inherited ``domain`` attribute for type checkers
    domain: DiskDomain
|
||||||
|
|
||||||
|
|
||||||
|
class PackedDataset(DiskDataset):
    """
    Packed dataset.

    Currently out of commission - not compatible with latest dataset
    definitions. Will require a zipped disk domain

    Requires a specific dataset storage structure on the root data path:

    <data-path>/data/*-i<batch-num>-b<batch-size>
    <data-path>/meta/*-i<batch-num>-b<batch-size>

    That is, all data are compacted into core data (`data/`) and metadata
    (`meta/`) subdirectories. Compatible out-of-the-box with datasets written
    with a `Packer`.
    """

    def _get_uri_groups(self) -> list[tuple[Path, ...]]:
        # pair each data file with its metadata counterpart
        data_root = Path(self.domain.root, "data")
        meta_root = Path(self.domain.root, "meta")

        data_file_paths = data_root.iterdir()
        meta_file_paths = meta_root.iterdir()

        # NOTE(review): ``iterdir()`` order is not guaranteed sorted;
        # pairing data/meta by raw iteration order assumes matching
        # filenames enumerate identically — confirm (or sort both)
        return list(zip(data_file_paths, meta_file_paths, strict=True))

    def _process_batch_data(
        self,
        batch_data: tuple[bytes, ...],
        batch_index: int,
    ) -> list[tuple[bytes, ...]]:
        # a "batch" here is a (data blob, meta blob) pair
        data_bytes, meta_bytes = batch_data

        # metadata is unpacked first since data unpacking may need it
        meta_batch = self._unpack_meta(meta_bytes)
        data_batch = self._unpack_data(data_bytes, meta_batch)

        # zip up batch partial batch items into a single batch iterable
        # composed of item tuples
        batch_items = [
            (*ba, *bm)  # pyre-ignore[60]
            for ba, bm in zip(data_batch, meta_batch, strict=True)
        ]

        # apply transform to batch items if provided
        # NOTE(review): other datasets in this file use
        # ``pre_transform``/``post_transform``; ``load_transform`` appears
        # only here — confirm it still exists on the current base class
        if self.load_transform:
            batch_items = list(map(self.load_transform, batch_items))

        return batch_items

    @abstractmethod
    def _unpack_data(
        self,
        batch_data_bytes: bytes,
        batch_meta: list[tuple[Any, ...]],
    ) -> list[tuple[Any, ...]]:
        """
        Load and unpack batch data.

        This method should be the inverse of an affiliated
        `Packer`'s `pack_data_bytes()`:

        <data list> -> pack -> to bytes -> blob
        <data list> <- unpack <- from bytes <- blob

        Returns:
            iterable of (partial) item tuples w/ data content
        """

        raise NotImplementedError

    @abstractmethod
    def _unpack_meta(self, batch_meta_bytes: bytes) -> list[tuple[Any, ...]]:
        """
        Load and unpack batch metadata.

        This method should be the inverse of an affiliated
        `Packer`'s `pack_meta_bytes()`:

        <data list> -> pack -> to bytes -> blob
        <data list> <- unpack <- from bytes <- blob

        Returns:
            iterable of (partial) item tuples w/ meta content
        """

        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ZippedDataset(DiskDataset):
    """
    Dataset with samples stored in ZIP files.

    This dataset base is primarily used as the type of input dataset for a
    `Packer` object. This is compatible with most raw dataset structures,
    reading down arbitrarily packaged ZIP files and "re-batching" during
    access.
    """

    # NOTE(review): items produced below carry two fields (bytes, name)
    # while this header declares one — confirm whether it should be
    # ("bytes", "name")
    item_header: tuple[str, ...] = ("bytes",)

    def _get_uri_groups(self) -> list[tuple[str, ...]]:
        """
        Collect the ZIP files under the domain root, one URI group per file.
        """

        # fix: read from the DiskDomain root (consistent with sibling
        # ``PackedDataset``); ``self.data_path`` is not defined anywhere
        # on this hierarchy
        zip_file_paths: list[str] = [
            str(path)
            for path in Path(self.domain.root).iterdir()
            if path.suffix == ".zip"
        ]

        # will just zip a single list yielding 1-tuples (to match type sig)
        return list(zip(zip_file_paths))

    def _process_batch_data(
        self,
        batch_data: tuple[bytes, ...],
        batch_index: int,
    ) -> list[tuple[bytes, ...]]:
        """
        Unpack one ZIP blob into per-member ``(bytes, name)`` item tuples.

        NOTE(review): ``self.extensions`` is not defined on this class —
        presumably supplied by a subclass; confirm.
        """

        items = []
        batch_zip_file = ZipFile(BytesIO(batch_data[0]))

        for zname in batch_zip_file.namelist():
            # skip ZIP members whose extension isn't whitelisted
            if Path(zname).suffix not in self.extensions:
                continue

            with batch_zip_file.open(zname, "r") as zfile:
                items.append((zfile.read(), zname))

        return items
|
||||||
|
|
||||||
210
trainlib/datasets/memory.py
Normal file
210
trainlib/datasets/memory.py
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
"""
|
||||||
|
Leaving the following here in case we return for some specifics in the future.
|
||||||
|
At an earlier stage, we had a few specifically typed datasets/domains for
|
||||||
|
in-memory structures, but these are almost entirely redundant with the generic
|
||||||
|
``SequenceDomain`` definition: the general retrieval and iteration behaviors
|
||||||
|
there are fairly universal and can be type-tailored without new class
|
||||||
|
definitions (unless you really want a new hierarchy).
|
||||||
|
|
||||||
|
The following were design notes on a dict/named tuple list-based dataset.
|
||||||
|
|
||||||
|
Dataset from list of (dict) records.
|
||||||
|
|
||||||
|
This is designed such that a "batch" is just a single record in the base
|
||||||
|
list. Each URI group is a singleton index tuple, which grabs its single
|
||||||
|
corresponding record in the domain when read.
|
||||||
|
|
||||||
|
One could alternatively have the entire record list be a single batch, and
|
||||||
|
have a single URI group with N null references. The null reference would
|
||||||
|
need to be the sole URI for the domain definition, and ``.read(null)``
|
||||||
|
would always return the entire list. Everything will then be handled as
|
||||||
|
expected: in ``get_item()``,
|
||||||
|
|
||||||
|
- ``._get_batch_for_item(item_index)`` maps to the singleton batch
|
||||||
|
- ``.get_batch(batch_index)[index_in_batch]`` simply indexes directly in
|
||||||
|
the record list
|
||||||
|
|
||||||
|
This is pretty unnatural, since now a batch as returned by
|
||||||
|
``.read_resources()`` will be N references to the entire record list. It
|
||||||
|
nevertheless sidesteps full list reconstruction and allows the propagation
|
||||||
|
of direct indexing, so it has that in its corner.
|
||||||
|
|
||||||
|
Another viable approach (least preferred): have a single batch, but where
|
||||||
|
URIs map to individual row indices. In some respects, this is the most
|
||||||
|
intuitive interpretation of "batch" and how we'd map to items, but it's the
|
||||||
|
*least efficient* because the batch logic of ``BatchedDataset`` will
|
||||||
|
*reconstruct the entire record list*. In ``.read_resources()``, we have
|
||||||
|
|
||||||
|
```
|
||||||
|
return tuple(self.domain.read(uri) for uri in uri_group)
|
||||||
|
```
|
||||||
|
|
||||||
|
So, despite being a natural interpretation, it pulls apart the domain "too
|
||||||
|
well" and rebuilds it in-house. This makes sense when resources are
|
||||||
|
external (e.g., files on disk, data over network), but when already
|
||||||
|
part of the Python process, that model is wasteful.
|
||||||
|
|
||||||
|
Note: inheriting datasets will need to implement an appropriate
|
||||||
|
``_item_header``. This will be trivial for consistent datasets, since it'll
|
||||||
|
just be the keys of any of the dict records.
|
||||||
|
|
||||||
|
Left to define:
|
||||||
|
|
||||||
|
+ ``_process_item_data()``: map T to final tuple
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class RecordDataset[T: NamedTuple](HomogenousDataset[int, T, T]):
|
||||||
|
|
||||||
|
domain: TupleListDomain[T]
|
||||||
|
|
||||||
|
def _process_batch_data(
|
||||||
|
self,
|
||||||
|
batch_data: T,
|
||||||
|
batch_index: int,
|
||||||
|
) -> list[T]:
|
||||||
|
Produce collection of item tuples.
|
||||||
|
|
||||||
|
Note: no interaction with ``item_tuple`` is needed given batches are
|
||||||
|
already singular ``item_tuple`` shaped items.
|
||||||
|
|
||||||
|
return [batch_data]
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Unpack
|
||||||
|
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import Tensor
|
||||||
|
|
||||||
|
from trainlib.domain import SequenceDomain
|
||||||
|
from trainlib.dataset import TupleDataset, DatasetKwargs
|
||||||
|
|
||||||
|
|
||||||
|
class SlidingWindowDataset[T: Tensor](TupleDataset[T]):
    """
    Dataset of sliding windows over channelwise sequence tensors.

    Each batch is a tuple of ``(C_i, T)``-shaped tensors: the first
    ``num_windows`` entries are windowed into model inputs, the remaining
    entries become lookahead-aligned label sequences.
    """

    def __init__(
        self,
        domain: SequenceDomain[tuple[T, ...]],
        lookback: int,
        offset: int = 0,
        lookahead: int = 1,
        num_windows: int = 1,
        **kwargs: Unpack[DatasetKwargs],
    ) -> None:
        """
        Parameters:
            domain: sequence domain yielding tuples of (C_i, T) tensors
            lookback: window length; strictly positive
            offset: forward shift of the first window (see
                ``_process_batch_data()``); effectively clamped to
                ``lookback - 1``
            lookahead: label offset from each window's first index
            num_windows: number of leading tuple entries to window; the
                remaining entries are treated as label sequences
            kwargs: forwarded to the ``TupleDataset`` base
        """

        # window parameters are stored before base initialization —
        # presumably so they're available if the base constructor already
        # triggers batch processing (TODO confirm)
        self.lookback = lookback
        self.offset = offset
        self.lookahead = lookahead
        self.num_windows = num_windows

        super().__init__(domain, **kwargs)

    def _process_batch_data(
        self,
        batch_data: tuple[T, ...],
        batch_index: int,
    ) -> list[tuple[T, ...]]:
        """
        Backward pads first sequence over (lookback-1) length, and steps the
        remaining items forward by the lookahead.

        Batch data:

            (Tensor[C1, T], ..., Tensor[CN, T])

        + lookback determines window size; pad left to create lookback size
          with the first element at the right:

              |-lookback-|
              [0 ...  0 T0]

          `lookback` is strictly positive, unbounded.

        + offset shifts the first such window forward. `0 <= offset < L`;
          think of it as the number of additional non-padded items we
          slide into the window from the right. At its largest value of `L-1`,
          we'll have L-sized windows with `T0` as the leftmost element.

              offset=2
              [0 ... T0 T1 T2]
              |---lookback---|

          In effect, the index of the rightmost item of the first window
          will be equal to the value of `offset`. There are `T - offset`
          total possible windows over a given sequence (regardless of
          lookback).

        + lookahead determines the offset of the "label" slices from the
          first index, regardless of any value of `offset`.

              lookahead=3
              [0 ... 0 T0] T1 T2 [T3]
              |-lookback-|

              0 [0 .. T0 T1] T2 T3 [T4]
                |-lookback-|

          There are `T - lookahead` allowed slices, assuming the lookahead
          exceeds the offset.

        To get windows starting with the first index at the left: we first
        set our window size (call it L), determined by `lookback`. Then the
        rightmost index we want will be `L-1`, which determines our `offset`
        setting.

            lookahead=L, offset=L-1
            [ T_0 ... T_{L-1} ]

        To get a one-step lookahead in front of that rightmost item, the
        `lookahead` can be set to the index of the first label we want:

            lookback=L, offset=L-1 lookahead=L
            [ T_0 ... T_{L-1} ] [ T_L ]
        """

        if self.pre_transform is not None:
            batch_data = self.pre_transform(batch_data)

        lb = self.lookback
        # clamp the offset so the first window always covers index 0
        off = min(self.offset, lb-1)
        la = self.lookahead

        # windowed input tensors, one entry per leading batch tensor
        ws = []
        for t in batch_data[:self.num_windows]:
            # for window sized `lb`, we pad with `lb-1` zeros. We then take off
            # the amount of our offset, which in the extreme cases does no
            # padding.
            xip = F.pad(t, ((lb-1) - off, 0))

            # extract sliding windows over the padded tensor
            # unfold(-1, lb, 1) slides over the last dim, 1 step at a time, for
            # `lb`-sized windows. We turn (C_i, pad+T) shape into
            # (C_i, T-offset, lb), giving `T-offset` total `lb` windows.
            wi = xip.unfold(-1, lb, 1)

            # (C_i, T-offset, lb) -> (T-offset, C_i, lb)
            wi = wi.permute(1, 0, 2)

            # if lookahead exceeds offset, there are some windows for which we
            # won't be able to assign a lookahead label. Cut those off here
            if la - off > 0:
                wi = wi[:-(la-off)]

            ws.append(wi)

        # label tensors, one entry per trailing batch tensor
        ys = []
        for t in batch_data[self.num_windows:]:
            # tensors (C_i, T) shaped, align with lookahead, giving a
            # (C_i, T-lookahead)
            y = t[:, la:]

            # (C_i, T-lookahead) -> (T-lookahead, C_i)
            y = y.permute(1, 0)

            # cut off any elements if offset exceeds lookahead
            if off - la > 0:
                y = y[:-(off-la)]

            ys.append(y)

        # one item tuple per window position: (*windows, *labels)
        return list(zip(*ws, *ys, strict=True))
|
||||||
|
|
||||||
85
trainlib/domain.py
Normal file
85
trainlib/domain.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""
|
||||||
|
Defines a knowledge domain. Wraps a Dataset / Simulator / Knowledge
|
||||||
|
|
||||||
|
Downstream exploration might include
|
||||||
|
|
||||||
|
- Calibrating Simulator / Knowledge with a Dataset
|
||||||
|
- Amending Dataset with Simulator / Knowledge
|
||||||
|
- Positioning Knowledge within Simulator context
|
||||||
|
* Where to replace Simulator subsystem with Knowledge?
|
||||||
|
|
||||||
|
Other variations:
|
||||||
|
|
||||||
|
- Multi-fidelity simulators
|
||||||
|
- Multi-scale models
|
||||||
|
- Multi-system
|
||||||
|
- Incomplete knowledge / divergence among sources
|
||||||
|
|
||||||
|
Questions:
|
||||||
|
|
||||||
|
- Should Simulator / Knowledge be unified as one (e.g., "Expert")
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Mapping, Iterator, Sequence
|
||||||
|
|
||||||
|
|
||||||
|
class Domain[U, R](Mapping[U, R]):
    """
    Domain base class, generic to a URI type ``U`` and resource type ``R``.

    Domains are just Mappings where the iterator behavior is specifically typed
    to range over keys (URIs). Defining a specific class here gives us a base
    for a nominal hierarchy, but functionally it is just the Mapping core
    (sized iterables with accessors). Subclasses must override
    ``__getitem__``, ``__iter__``, and ``__len__``.
    """

    def __call__(self, uri: U) -> R:
        """
        Get the resource for a given URI (call-based alias for indexing).
        """

        return self[uri]

    def __getitem__(self, uri: U) -> R:
        """
        Get the resource for a given URI.
        """

        raise NotImplementedError

    def __iter__(self) -> Iterator[U]:
        """
        Provide an iterator over domain URIs.
        """

        raise NotImplementedError

    def __len__(self) -> int:
        """
        Measure the size of the domain (number of URIs).
        """

        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class SequenceDomain[R](Domain[int, R]):
|
||||||
|
"""
|
||||||
|
Trivial domain implementation for wrapping sequences that can be seen as
|
||||||
|
Mappings with 0-indexed keys.
|
||||||
|
|
||||||
|
Why define this? Domains provide iterators over their *keys*, sequences
|
||||||
|
often iterate over *values*.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, sequence: Sequence[R]) -> None:
|
||||||
|
self.sequence = sequence
|
||||||
|
|
||||||
|
def __getitem__(self, uri: int) -> R:
|
||||||
|
return self.sequence[uri]
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[int]:
|
||||||
|
return iter(range(len(self.sequence)))
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return len(self.sequence)
|
||||||
0
trainlib/domains/__init__.py
Normal file
0
trainlib/domains/__init__.py
Normal file
37
trainlib/domains/disk.py
Normal file
37
trainlib/domains/disk.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
from mema.domain import Domain
|
||||||
|
|
||||||
|
|
||||||
|
class DiskDomain(Domain[Path, bytes]):
    """
    Domain over the files directly inside a root directory.

    URIs are ``Path`` objects; resources are the raw file bytes.
    """

    def __init__(
        self,
        root: Path,
        extensions: list[str] | None = None,
    ) -> None:
        """
        Parameters:
            root: directory whose (non-recursive) files make up the domain
            extensions: list of file extensions to filter for when determining
                data file paths to read. Entries may be given with or without
                a leading dot (both ``"csv"`` and ``".csv"`` work). This is a
                whitelist, except when left as ``None`` (which is default), in
                which case all extensions are allowed.
        """

        self.root = root

        # Normalize entries to suffix form (leading dot): `Path.suffix`
        # includes the dot, so un-normalized entries like "csv" would
        # otherwise silently match nothing.
        self.extensions = (
            None
            if extensions is None
            else [e if e.startswith(".") else f".{e}" for e in extensions]
        )

    def __getitem__(self, uri: Path) -> bytes:
        return uri.read_bytes()

    def __iter__(self) -> Iterator[Path]:
        return (
            path for path in self.root.iterdir()
            if path.is_file() and (
                self.extensions is None
                or path.suffix in self.extensions
            )
        )

    def __len__(self) -> int:
        # no index is maintained; count by scanning the root directory
        return sum(1 for _ in iter(self))
|
||||||
58
trainlib/domains/functional.py
Normal file
58
trainlib/domains/functional.py
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
from collections.abc import Callable, Iterator, Sequence
|
||||||
|
|
||||||
|
from trainlib.domain import Domain
|
||||||
|
|
||||||
|
|
||||||
|
class SimulatorDomain[P, R](Domain[int, R]):
|
||||||
|
"""
|
||||||
|
Base simulator domain, generic to arbitrary callables.
|
||||||
|
|
||||||
|
Note: we don't store simulation results here; that's left to a downstream
|
||||||
|
object, like a `BatchedDataset`, to cache if needed. We also don't subclass
|
||||||
|
`SequenceDataset` because the item getter type doesn't align: we accept an
|
||||||
|
`int` in the parameter list, but don't return the items directly from that
|
||||||
|
collection (we transform them first).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
simulator: Callable[[P], R],
|
||||||
|
parameters: Sequence[P],
|
||||||
|
) -> None:
|
||||||
|
self.simulator = simulator
|
||||||
|
self.parameters = parameters
|
||||||
|
|
||||||
|
def __getitem__(self, uri: int) -> R:
|
||||||
|
return self.simulator(self.parameters[uri])
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[int]:
|
||||||
|
return iter(range(len(self.parameters)))
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return len(self.parameters)
|
||||||
|
|
||||||
|
|
||||||
|
class SimulatorPredictiveDomain[P, R](Domain[int, tuple[R, ...]]):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
simulator: Callable[[P], R],
|
||||||
|
parameters: Sequence[P],
|
||||||
|
predictives: Sequence[Callable[[R], R]],
|
||||||
|
) -> None:
|
||||||
|
self.simulator = simulator
|
||||||
|
self.parameters = parameters
|
||||||
|
self.predictives = predictives
|
||||||
|
|
||||||
|
def __getitem__(self, uri: int) -> tuple[R, ...]:
|
||||||
|
sample = self.simulator(self.parameters[uri])
|
||||||
|
|
||||||
|
return (
|
||||||
|
sample,
|
||||||
|
*(p(sample) for p in self.predictives)
|
||||||
|
)
|
||||||
|
|
||||||
|
def __iter__(self) -> Iterator[int]:
|
||||||
|
return iter(range(len(self.parameters)))
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return len(self.parameters)
|
||||||
195
trainlib/estimator.py
Normal file
195
trainlib/estimator.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
"""
|
||||||
|
Development note
|
||||||
|
|
||||||
|
I'd rather lay out bare args and kwargs in the estimator methods, but the
|
||||||
|
allowed variance in subclasses makes this difficult. To ease the pain of typing
|
||||||
|
coordination with datasets, `loss()` / `forward()` / `metrics()` specify their
|
||||||
|
arguments through a TypedDict. This makes those signatures more portable, and
|
||||||
|
can bound type variables in other places.
|
||||||
|
|
||||||
|
In theory, even the single tensor input base currently in place could be
|
||||||
|
relaxed given the allowed variability in dataset / dataloader outputs. The
|
||||||
|
default collate function in the PyTorch `DataLoader` source leaves types like
|
||||||
|
strings and bytes unchanged, so not all kinds of data are batched into a
|
||||||
|
`Sequence`, let alone a tensor. Nevertheless, estimators are `nn.Module`
|
||||||
|
derivatives, so it's at minimum a safe assumption we'll need at least one
|
||||||
|
tensor (or *should* have one).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Unpack, TypedDict
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
from torch import nn, Tensor
|
||||||
|
from torch.optim import Optimizer
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
from trainlib.util.type import OptimizerKwargs
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EstimatorKwargs(TypedDict):
    """
    Base keyword-argument schema shared by ``Estimator`` methods
    (``forward`` / ``loss`` / ``metrics``).
    """

    # batch of input tensors passed through the estimator methods
    inputs: Tensor
|
||||||
|
|
||||||
|
|
||||||
|
class Estimator[Kw: EstimatorKwargs](nn.Module):
    """
    Estimator base class.

    All methods that raise ``NotImplementedError`` are directly invoked in the
    ``Trainer.train(...)`` loop.

    Note the flexibility afforded to the signatures of `forward()`, `loss()`,
    and `metrics()` methods, which should generally have identical sets of
    arguments in inheriting classes. The base class is generic to a type `Kw`,
    which should be a `TypedDict` for inheriting classes (despite not being
    enforceable as an upper bound) that reflects the keyword argument
    structure for these methods.

    For instance, in a sequence prediction model with labels and masking, you
    might have something like:

    .. code-block:: python

        class PredictorKwargs(TypedDict, total=False):
            labels: list[Tensor]
            lengths: Tensor
            mask: Tensor

        class SequencePredictor(Estimator[PredictorKwargs]):
            def forward(
                self,
                input: Tensor,
                **kwargs: Unpack[PredictorKwargs],
            ) -> tuple[Tensor, ...]:
                ...

            def loss(
                self,
                input: Tensor,
                **kwargs: Unpack[PredictorKwargs],
            ) -> Generator[Tensor]:
                ...

            def metrics(
                self,
                input: Tensor,
                **kwargs: Unpack[PredictorKwargs],
            ) -> dict[str, float]:
                ...

    While `loss` and `metrics` should leverage the full set of keyword
    arguments, `forward` may not (e.g., in the example above, it shouldn't use
    `labels`).

    Subclasses of `SequencePredictor` should then be generic over subtypes of
    `PredictorKwargs`.
    """

    def forward(
        self,
        **kwargs: Unpack[Kw],
    ) -> tuple[Tensor, ...]:
        """
        Compute model outputs for the given input batch.
        """

        raise NotImplementedError

    def loss(
        self,
        **kwargs: Unpack[Kw],
    ) -> Generator[Tensor]:
        """
        Compute model loss for the given input.

        Note that the loss is implemented as a generator to support
        multi-objective estimator setups. That is, losses can be yielded in
        sequence, allowing for a training loop to propagate model parameter
        updates before the next loss function is calculated. For instance, in a
        GAN-like setup, one might first emit the D-loss, update D parameters,
        then compute the G-loss (*depending* on the updated D parameters). Such
        a scheme is not otherwise possible without bringing the intermediate
        parameter update "in house" (breaking a separation of duties with the
        train loop).
        """

        raise NotImplementedError

    def metrics(
        self,
        **kwargs: Unpack[Kw],
    ) -> dict[str, float]:
        """
        Compute metrics for the given input.
        """

        raise NotImplementedError

    def optimizers(
        self,
        **kwargs: Unpack[OptimizerKwargs],
    ) -> tuple[Optimizer, ...]:
        """
        Get optimizers for the estimator to use in training loops.

        Example providing a singular Adam-based optimizer:

        .. code-block:: python

            def optimizers(...):
                optimizer = torch.optim.AdamW(
                    self.parameters(),
                    lr=1e-3,
                    eps=1e-8,
                )
                return (optimizer,)
        """

        raise NotImplementedError

    def epoch_step(self) -> None:
        """
        Step epoch-dependent model state.

        This method should not include optimization of primary model
        parameters; that should be left to external optimizers. Instead, this
        method should step forward things like internal hyperparameter
        schedules in accordance with the expected call rate (e.g., every
        epoch).
        """

        raise NotImplementedError

    def epoch_write(
        self,
        writer: SummaryWriter,
        step: int | None = None,
        val: bool = False,
        **kwargs: Unpack[Kw],
    ) -> None:
        """
        Write epoch-dependent Tensorboard items.

        If implemented, this should supplement that which is provided in
        ``metrics()``. Tensors provided via ``kwargs`` should include raw
        training/validation data; examples include writing raw embeddings,
        canonical visualizations of the samples (e.g., previewing images), etc.

        Parameters:
            writer: tensorboard writer instance
            step: current step in the optimization loop
            val: whether input is a validation sample
            kwargs: batch of tensors from the current epoch
        """

        raise NotImplementedError

    def log_arch(self) -> None:
        """
        Log Estimator architecture details (class name and trainable
        parameter count).
        """

        logger.info(f"> Estimator :: {self.__class__.__name__}")

        # count only trainable (requires_grad) parameters
        num_params = sum(
            p.numel() for p in self.parameters() if p.requires_grad
        )
        logger.info(f"| > # model parameters: {num_params}")
|
||||||
0
trainlib/estimators/__init__.py
Normal file
0
trainlib/estimators/__init__.py
Normal file
491
trainlib/estimators/rnn.py
Normal file
491
trainlib/estimators/rnn.py
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
import logging
|
||||||
|
from typing import Unpack, NotRequired
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import nn, Tensor
|
||||||
|
from torch.optim import Optimizer
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
from mema.estimator import Estimator, EstimatorKwargs
|
||||||
|
from mema.util.type import OptimizerKwargs
|
||||||
|
from mema.util.module import get_grad_norm
|
||||||
|
from mema.estimators.tdnn import TDNNLayer
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class RNNKwargs(EstimatorKwargs):
    """
    Keyword-argument schema for the RNN estimators in this module.
    """

    # input batch; consumed by forward() as (B, C, T) and permuted to
    # (B, T, C) before the recurrent stack
    inputs: Tensor
    # regression targets; required by loss()/metrics(), unused by forward()
    labels: NotRequired[Tensor]
|
||||||
|
|
||||||
|
|
||||||
|
class LSTM[K: RNNKwargs](Estimator[K]):
    """
    Base RNN architecture: input projection -> stacked LSTM -> linear head,
    predicting a vector from the final time step of a sequence.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        hidden_dim: int = 64,
        num_layers: int = 4,
        bidirectional: bool = False,
        verbose: bool = True,
    ) -> None:
        """
        Parameters:
            input_dim: dimensionality of the input
            output_dim: dimensionality of the output
            hidden_dim: dimensionality of the LSTM hidden state (and of the
                input projection)
            num_layers: number of stacked LSTM layers
            bidirectional: whether to run the LSTM bidirectionally
            verbose: whether to log architecture details at construction
        """

        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.dense_in = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(
            hidden_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )

        # bidirectional LSTMs concatenate forward/backward hidden states
        lstm_out_dim = hidden_dim * (2 if bidirectional else 1)
        self.dense_z = nn.Linear(lstm_out_dim, output_dim)

        # weight initialization for LSTM layers
        def init_weights(m: nn.Module) -> None:
            if isinstance(m, nn.LSTM):
                for name, p in m.named_parameters():
                    if "weight_ih" in name:
                        nn.init.xavier_uniform_(p)
                    elif "weight_hh" in name:
                        nn.init.orthogonal_(p)
                    elif "bias" in name:
                        nn.init.zeros_(p)

        self.apply(init_weights)

        if verbose:
            self.log_arch()

    def _clamp_rand(self, x: Tensor) -> Tensor:
        # dither with small uniform noise (±1/254) and clamp to [-1, 1];
        # presumably a regularizer for the tanh-activated embedding — TODO
        # confirm intent
        return torch.clamp(
            x + (1.0 / 127.0) * (torch.rand_like(x) - 0.5),
            min=-1.0,
            max=1.0,
        )

    def forward(self, **kwargs: Unpack[K]) -> tuple[Tensor, ...]:
        """
        Return the last-step prediction and the LSTM's final hidden state.
        """

        inputs = kwargs["inputs"]

        # data shaped (B, C, T); map to (B, T, C)
        x = inputs.permute(0, 2, 1)
        x = torch.tanh(self.dense_in(x))
        x = self._clamp_rand(x)
        x, hidden = self.lstm(x)
        z = self.dense_z(x)

        # keep only the final time step's projection
        return z[:, -1, :], hidden

    def loss(self, **kwargs: Unpack[K]) -> Generator[Tensor]:
        """
        Yield the MSE between last-step predictions and ``labels``.
        """

        predictions = self(**kwargs)[0]
        labels = kwargs["labels"]

        yield F.mse_loss(predictions, labels)

    def metrics(self, **kwargs: Unpack[K]) -> dict[str, float]:
        """
        Compute loss and gradient-norm metrics for the given input.
        """

        with torch.no_grad():
            loss = next(self.loss(**kwargs)).item()

        return {
            "loss": loss,
            "grad_norm": get_grad_norm(self)
        }

    def optimizers(
        self,
        **kwargs: Unpack[OptimizerKwargs],
    ) -> tuple[Optimizer, ...]:
        """
        Provide a single AdamW optimizer over all parameters; ``kwargs``
        override the defaults (``lr=1e-3``, ``eps=1e-8``).
        """

        default_kwargs: OptimizerKwargs = {
            "lr": 1e-3,
            "eps": 1e-8,
        }
        opt_kwargs = {**default_kwargs, **kwargs}

        optimizer = torch.optim.AdamW(
            self.parameters(),
            **opt_kwargs,
        )

        return (optimizer,)

    def epoch_step(self) -> None:
        # no epoch-dependent internal state to advance
        return None

    def epoch_write(
        self,
        writer: SummaryWriter,
        step: int | None = None,
        val: bool = False,
        **kwargs: Unpack[K],
    ) -> None:
        # no supplemental tensorboard output beyond metrics()
        return None

    def log_arch(self) -> None:
        """
        Log architecture hyperparameters.
        """

        super().log_arch()

        logger.info(f"| > {self.input_dim=}")
        logger.info(f"| > {self.hidden_dim=}")
        logger.info(f"| > {self.num_layers=}")
        logger.info(f"| > {self.output_dim=}")
|
||||||
|
|
||||||
|
|
||||||
|
class MultiheadLSTMKwargs(EstimatorKwargs):
    """
    Keyword-argument schema for ``MultiheadLSTM``.
    """

    # input batch; consumed by forward() as (B, C, T)
    inputs: Tensor
    # primary regression targets; required by loss()/metrics()
    labels: NotRequired[Tensor]
    # targets for the concatenated auxiliary heads; optional in loss()
    auxiliary: NotRequired[Tensor]
|
||||||
|
|
||||||
|
|
||||||
|
class MultiheadLSTM[K: MultiheadLSTMKwargs](Estimator[K]):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_dim: int,
|
||||||
|
output_dim: int,
|
||||||
|
hidden_dim: int = 64,
|
||||||
|
num_layers: int = 4,
|
||||||
|
bidirectional: bool = False,
|
||||||
|
head_dims: list[int] | None = None,
|
||||||
|
verbose: bool = True,
|
||||||
|
) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.input_dim = input_dim
|
||||||
|
self.output_dim = output_dim
|
||||||
|
self.hidden_dim = hidden_dim
|
||||||
|
self.num_layers = num_layers
|
||||||
|
self.head_dims = head_dims if head_dims is not None else []
|
||||||
|
|
||||||
|
self.dense_in = nn.Linear(input_dim, hidden_dim)
|
||||||
|
self.lstm = nn.LSTM(
|
||||||
|
hidden_dim,
|
||||||
|
hidden_dim,
|
||||||
|
num_layers=num_layers,
|
||||||
|
batch_first=True,
|
||||||
|
bidirectional=bidirectional,
|
||||||
|
)
|
||||||
|
|
||||||
|
lstm_out_dim = hidden_dim * (2 if bidirectional else 1)
|
||||||
|
self.dense_z_out = nn.Linear(lstm_out_dim, output_dim)
|
||||||
|
self.dense_z_heads = nn.ModuleList([
|
||||||
|
nn.Linear(lstm_out_dim, head_dim)
|
||||||
|
for head_dim in self.head_dims
|
||||||
|
])
|
||||||
|
|
||||||
|
# weight initialization for LSTM layers
|
||||||
|
def init_weights(m: nn.Module) -> None:
|
||||||
|
if isinstance(m, nn.LSTM):
|
||||||
|
for name, p in m.named_parameters():
|
||||||
|
if "weight_ih" in name:
|
||||||
|
nn.init.xavier_uniform_(p)
|
||||||
|
elif "weight_hh" in name:
|
||||||
|
nn.init.orthogonal_(p)
|
||||||
|
elif "bias" in name:
|
||||||
|
nn.init.zeros_(p)
|
||||||
|
|
||||||
|
self.apply(init_weights)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
self.log_arch()
|
||||||
|
|
||||||
|
def _clamp_rand(self, x: Tensor) -> Tensor:
|
||||||
|
return torch.clamp(
|
||||||
|
x + (1.0 / 127.0) * (torch.rand_like(x) - 0.5),
|
||||||
|
min=-1.0,
|
||||||
|
max=1.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, **kwargs: Unpack[K]) -> tuple[Tensor, ...]:
|
||||||
|
inputs = kwargs["inputs"]
|
||||||
|
|
||||||
|
# data shaped (B, C, T); map to (B, T, C)
|
||||||
|
x = inputs.permute(0, 2, 1)
|
||||||
|
x = torch.tanh(self.dense_in(x))
|
||||||
|
x = self._clamp_rand(x)
|
||||||
|
x, hidden = self.lstm(x)
|
||||||
|
|
||||||
|
z = self.dense_z_out(x)
|
||||||
|
zs = torch.cat([head(x) for head in self.dense_z_heads], dim=-1)
|
||||||
|
|
||||||
|
return z[:, -1, :], zs[:, -1, :]
|
||||||
|
|
||||||
|
def loss(self, **kwargs: Unpack[K]) -> Generator[Tensor]:
|
||||||
|
pred, pred_aux = self(**kwargs)
|
||||||
|
labels = kwargs["labels"]
|
||||||
|
aux_labels = kwargs.get("auxiliary")
|
||||||
|
|
||||||
|
if aux_labels is None:
|
||||||
|
yield F.mse_loss(pred, labels)
|
||||||
|
else:
|
||||||
|
yield F.mse_loss(pred, labels) + F.mse_loss(pred_aux, aux_labels)
|
||||||
|
|
||||||
|
def metrics(self, **kwargs: Unpack[K]) -> dict[str, float]:
|
||||||
|
with torch.no_grad():
|
||||||
|
loss = next(self.loss(**kwargs)).item()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"loss": loss,
|
||||||
|
"grad_norm": get_grad_norm(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
def optimizers(
|
||||||
|
self,
|
||||||
|
**kwargs: Unpack[OptimizerKwargs],
|
||||||
|
) -> tuple[Optimizer, ...]:
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
|
||||||
|
default_kwargs: Unpack[OptimizerKwargs] = {
|
||||||
|
"lr": 1e-3,
|
||||||
|
"eps": 1e-8,
|
||||||
|
}
|
||||||
|
opt_kwargs = {**default_kwargs, **kwargs}
|
||||||
|
|
||||||
|
optimizer = torch.optim.AdamW(
|
||||||
|
self.parameters(),
|
||||||
|
**opt_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
return (optimizer,)
|
||||||
|
|
||||||
|
def epoch_step(self) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def epoch_write(
|
||||||
|
self,
|
||||||
|
writer: SummaryWriter,
|
||||||
|
step: int | None = None,
|
||||||
|
val: bool = False,
|
||||||
|
**kwargs: Unpack[K],
|
||||||
|
) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def log_arch(self) -> None:
|
||||||
|
super().log_arch()
|
||||||
|
|
||||||
|
logger.info(f"| > {self.input_dim=}")
|
||||||
|
logger.info(f"| > {self.hidden_dim=}")
|
||||||
|
logger.info(f"| > {self.num_layers=}")
|
||||||
|
logger.info(f"| > {self.output_dim=}")
|
||||||
|
|
||||||
|
|
||||||
|
class ConvRNN[K: RNNKwargs](Estimator[K]):
    """
    Base recurrent convolutional architecture.

    Computes latents, initial states, and rate estimates from features and
    lambda parameter. Structurally: an input TDNN embedding followed by
    ``num_layers`` GRU/TDNN pairs whose outputs are densely concatenated
    (each layer sees all previous layer outputs), then a final TDNN
    projection.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        temporal_dim: int,
        gru_dim: int = 64,
        conv_dim: int = 96,
        num_layers: int = 4,
        conv_kernel_sizes: list[int] | None = None,
        conv_dilations: list[int] | None = None,
        verbose: bool = True,
    ) -> None:
        """
        Parameters:
            input_dim: dimensionality of the input
            output_dim: dimensionality of the output
            temporal_dim: kernel size of the final (unpadded) TDNN
                projection; loss() assumes this collapses the time axis to
                length 1 — presumably it should match the post-stack
                sequence length (TODO confirm with callers)
            gru_dim: dimensionality of each GRU layer output
            conv_dim: dimensionality of each conv layer output
            num_layers: number of gru-conv layer pairs to use
            conv_kernel_sizes: kernel sizes for conv layers (default: 4 each)
            conv_dilations: dilation settings for conv layers (default: 1 for
                the first layer, 2 for the rest)
            verbose: whether to log architecture details at construction
        """

        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.gru_dim = gru_dim
        self.conv_dim = conv_dim
        self.num_layers = num_layers
        # accumulated receptive field across the TDNN layers (summed below)
        self.receptive_field = 0

        self.conv_kernel_sizes: list[int]
        if conv_kernel_sizes is None:
            self.conv_kernel_sizes = [4] * num_layers
        else:
            self.conv_kernel_sizes = conv_kernel_sizes

        self.conv_dilations: list[int]
        if conv_dilations is None:
            self.conv_dilations = [1] + [2] * (num_layers - 1)
        else:
            self.conv_dilations = conv_dilations

        self._gru_layers: nn.ModuleList = nn.ModuleList()
        self._conv_layers: nn.ModuleList = nn.ModuleList()

        # dense connectivity: each layer's input width grows by the output
        # width of every preceding GRU/conv layer
        layer_in_dim = gru_dim
        for i in range(self.num_layers):
            gru_layer = nn.GRU(layer_in_dim, gru_dim, batch_first=True)
            self._gru_layers.append(gru_layer)
            layer_in_dim += gru_dim

            tdnn_layer = TDNNLayer(
                layer_in_dim,
                conv_dim,
                kernel_size=self.conv_kernel_sizes[i],
                dilation=self.conv_dilations[i],
            )
            self.receptive_field += tdnn_layer.receptive_field

            self._conv_layers.append(tdnn_layer)
            layer_in_dim += conv_dim

        # pointwise (kernel_size=1) embedding of raw input channels
        self.dense_in = TDNNLayer(
            self.input_dim,
            gru_dim,
            kernel_size=1,
            pad=False
        )

        # will be (B, C, T), applies indep at each time step across channels;
        # unpadded, so a kernel spanning temporal_dim steps shortens T
        self.dense_z = TDNNLayer(
            layer_in_dim,
            self.output_dim,
            kernel_size=temporal_dim,
            pad=False,
        )

        # weight initialization for GRU layers
        def init_weights(module: nn.Module) -> None:
            if isinstance(module, nn.GRU):
                for p in module.named_parameters():
                    if p[0].startswith("weight_hh_"):
                        nn.init.orthogonal_(p[1])

        self.apply(init_weights)

        if verbose:
            self.log_arch()

    def _clamp_rand(self, x: Tensor) -> Tensor:
        # dither with small uniform noise (±1/254) and clamp to [-1, 1]
        return torch.clamp(
            x + (1.0 / 127.0) * (torch.rand_like(x) - 0.5),
            min=-1.0,
            max=1.0,
        )

    def forward(self, **kwargs: Unpack[K]) -> tuple[Tensor, ...]:
        """
        Return a 1-tuple with the (B, C, T')-shaped projection of the
        densely-concatenated GRU/conv stack.
        """

        inputs = kwargs["inputs"]

        # embedding shaped (B, C, T)
        x = self._clamp_rand(torch.tanh(self.dense_in(inputs)))

        # prepare shape (B, T, C) -- for GRU
        x = x.transpose(-2, -1)

        for gru, conv in zip(self._gru_layers, self._conv_layers, strict=True):
            # GRU output concatenated onto the running feature stack
            xg = self._clamp_rand(gru(x)[0])
            x = torch.cat([x, xg], -1)

            # conv operates channel-first; transpose in and back out
            xc = self._clamp_rand(conv(x.transpose(-2, -1)))
            xc = xc.transpose(-2, -1)
            x = torch.cat([x, xc], -1)

        x = x.transpose(-2, -1)
        # map to (B, C, T)
        z = self.dense_z(x)

        return (z,)

    def loss(self, **kwargs: Unpack[K]) -> Generator[Tensor]:
        """
        Yield the MSE between the (time-collapsed) prediction and labels.
        """

        predictions = self(**kwargs)[0]
        labels = kwargs["labels"]

        # squeeze last dim; we've mapped T -> 1
        predictions = predictions.squeeze(-1)

        yield F.mse_loss(predictions, labels, reduction="mean")

    def metrics(self, **kwargs: Unpack[K]) -> dict[str, float]:
        """
        Compute loss and gradient-norm metrics for the given input.
        """

        with torch.no_grad():
            loss = next(self.loss(**kwargs)).item()

        return {
            "loss": loss,
            "grad_norm": get_grad_norm(self)
        }

    def optimizers(
        self,
        **kwargs: Unpack[OptimizerKwargs],
    ) -> tuple[Optimizer, ...]:
        """
        Provide a single AdamW optimizer over all parameters; ``kwargs``
        override the defaults (``lr=1e-3``, ``eps=1e-8``).
        """

        default_kwargs: OptimizerKwargs = {
            "lr": 1e-3,
            "eps": 1e-8,
        }
        opt_kwargs = {**default_kwargs, **kwargs}

        optimizer = torch.optim.AdamW(
            self.parameters(),
            **opt_kwargs,
        )

        return (optimizer,)

    def epoch_step(self) -> None:
        # no epoch-dependent internal state to advance
        return None

    def epoch_write(
        self,
        writer: SummaryWriter,
        step: int | None = None,
        val: bool = False,
        **kwargs: Unpack[K],
    ) -> None:
        # no supplemental tensorboard output beyond metrics()
        return None

    def log_arch(self) -> None:
        """
        Log architecture hyperparameters.
        """

        super().log_arch()

        logger.info(f"| > {self.input_dim=}")
        logger.info(f"| > {self.gru_dim=}")
        logger.info(f"| > {self.conv_dim=}")
        logger.info(f"| > {self.num_layers=}")
        logger.info(f"| > {self.conv_kernel_sizes=}")
        logger.info(f"| > {self.conv_dilations=}")
        logger.info(f"| > {self.receptive_field=}")
        logger.info(f"| > {self.output_dim=}")
|
||||||
114
trainlib/estimators/tdnn.py
Normal file
114
trainlib/estimators/tdnn.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
import logging
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import nn, Tensor
|
||||||
|
from torch.optim import Optimizer
|
||||||
|
from torch.nn.utils.parametrizations import weight_norm
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TDNNLayer(nn.Module):
|
||||||
|
"""
|
||||||
|
Time delay neural network layer.
|
||||||
|
|
||||||
|
Built on torch Conv1D layers, with additional support for automatic
|
||||||
|
padding.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_channels: int,
|
||||||
|
output_channels: int,
|
||||||
|
kernel_size: int = 3,
|
||||||
|
dilation: int = 1,
|
||||||
|
lookahead: int = 0,
|
||||||
|
pad: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Implements a fast TDNN layer via `torch.Conv1d`.
|
||||||
|
|
||||||
|
Note that we're restricted to the kernel shapes producible by `Conv1d`
|
||||||
|
objects, implying (primarily) that the kernel must be symmetric and
|
||||||
|
have equal spacing.
|
||||||
|
|
||||||
|
For example, the symmetric but non-uniform context [-3, -2, 0, +2, +3]
|
||||||
|
cannot be represented, while [-6, -3, 0, 3, 6] can. A few other kernel
|
||||||
|
examples:
|
||||||
|
|
||||||
|
kernel_size=3; dilation=1 -> [-1, 0, 1]
|
||||||
|
kernel_size=3; dilation=3 -> [-3, 0, 3]
|
||||||
|
kernel_size=4; dilation=2 -> [-3, -1, 1, 3]
|
||||||
|
|
||||||
|
By default, the TDNN layer left pads the input to ensure the output has
|
||||||
|
the same sequence length as the original input. For example, with a
|
||||||
|
kernel size of 3 (dilation of 1) and sequence length of T=3, the
|
||||||
|
sequence will be left padded with 2 zeros:
|
||||||
|
|
||||||
|
[0, 0, 1, 1, 1] -> [x1, x2, x3]
|
||||||
|
|
||||||
|
If a lookahead is specified, some number of those left zeros will be
|
||||||
|
moved to the right. If lookahead=1, for instance, indicating 1
|
||||||
|
additional "future frame" of context, padding will look like
|
||||||
|
|
||||||
|
[0, 1, 1, 1, 0] -> [x1, x2, x3]
|
||||||
|
|
||||||
|
The output x_i now sees through time step i+1.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
input_channels: number of input channels
|
||||||
|
output_channels: number of channels produced by the temporal
|
||||||
|
convolution
|
||||||
|
kernel_size: total size of the kernel
|
||||||
|
dilation: dilation of receptive field, i.e., the size of the gaps
|
||||||
|
lookahead: number of allowed lookahead frames
|
||||||
|
pad: whether the input should be padded, producing an output with
|
||||||
|
the same sequence length as the input
|
||||||
|
"""
|
||||||
|
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.td_conv: nn.Module = weight_norm(
|
||||||
|
nn.Conv1d(
|
||||||
|
input_channels,
|
||||||
|
output_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
dilation=dilation,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.pad: bool = pad
|
||||||
|
self.lookahead = lookahead
|
||||||
|
self.receptive_field: int = (kernel_size - 1) * dilation + 1
|
||||||
|
|
||||||
|
assert (
|
||||||
|
self.lookahead < self.receptive_field
|
||||||
|
), "Lookahead cannot exceed receptive field"
|
||||||
|
|
||||||
|
def forward(self, x: Tensor) -> Tensor:
|
||||||
|
"""
|
||||||
|
Dimension definitions:
|
||||||
|
|
||||||
|
- B: batch size
|
||||||
|
- Di: input dimension at single time step (aka input channels)
|
||||||
|
- Do: output dimension at single time step (aka output channels)
|
||||||
|
- T: sequence length
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
x: input tensor, shaped [B, Di, T] (optionally w/o a batch dim)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor shaped [B, Do, T-kernel_size]
|
||||||
|
"""
|
||||||
|
|
||||||
|
# pad according to receptive field and lookahead s.t. output
|
||||||
|
# shape [B, *, T]
|
||||||
|
if self.pad:
|
||||||
|
x = F.pad(
|
||||||
|
x,
|
||||||
|
(self.receptive_field - self.lookahead - 1, self.lookahead)
|
||||||
|
)
|
||||||
|
|
||||||
|
return self.td_conv(x)
|
||||||
509
trainlib/trainer.py
Normal file
509
trainlib/trainer.py
Normal file
@@ -0,0 +1,509 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from copy import deepcopy
|
||||||
|
from typing import Any, Self
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
from torch import cuda, Tensor
|
||||||
|
from torch.nn.utils import clip_grad_norm_
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
from trainlib.dataset import BatchedDataset
|
||||||
|
from trainlib.estimator import Estimator, EstimatorKwargs
|
||||||
|
from trainlib.transform import Transform
|
||||||
|
from trainlib.util.type import (
|
||||||
|
SplitKwargs,
|
||||||
|
LoaderKwargs,
|
||||||
|
BalanceKwargs,
|
||||||
|
)
|
||||||
|
from trainlib.util.module import ModelWrapper
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Trainer[I, K: EstimatorKwargs]:
|
||||||
|
"""
|
||||||
|
Training interface for updating ``Estimators`` with ``Datasets``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    estimator: Estimator[K],
    device: str | None = None,
    chkpt_dir: str = "chkpt/",
    tblog_dir: str = "tblog/",
) -> None:
    """
    Parameters:
        estimator: `Estimator` model object
        device: device on which to carry out training; when None,
            auto-selects "cuda" if available, else "cpu"
        chkpt_dir: directory where model checkpoints are written
        tblog_dir: directory where TensorBoard event files are written
    """

    # resolve the compute device, preferring CUDA when none is given
    self.device: str
    if device is None:
        self.device = "cuda" if cuda.is_available() else "cpu"
    else:
        self.device = device

    logger.info(f"> Trainer device: {self.device}")
    if self.device.startswith("cuda"):
        if torch.cuda.is_available():
            # extra cuda details
            logger.info(f"| > {cuda.device_count()=}")
            logger.info(f"| > {cuda.current_device()=}")
            logger.info(f"| > {cuda.get_device_name()=}")
            logger.info(f"| > {cuda.get_device_capability()=}")

            # memory info (in GB)
            gb = 1024**3
            memory_allocated = cuda.memory_allocated() / gb
            memory_reserved = cuda.memory_reserved() / gb
            # NOTE(review): queries device index 0 regardless of the
            # selected device string (e.g. "cuda:1") -- confirm intended
            memory_total = cuda.get_device_properties(0).total_memory / gb

            logger.info("| > CUDA memory:")
            logger.info(f"| > {memory_total=:.2f}GB")
            logger.info(f"| > {memory_reserved=:.2f}GB")
            logger.info(f"| > {memory_allocated=:.2f}GB")
        else:
            # a CUDA device string was requested but CUDA is unusable
            logger.warning("| > CUDA device specified but not available")
    else:
        logger.info("| > Using CPU device - no additional device info")

    # move the estimator onto the training device
    self.estimator = estimator
    self.estimator.to(self.device)

    # resolve output directories up front so later joins are absolute
    self.chkpt_dir = Path(chkpt_dir).resolve()
    self.tblog_dir = Path(tblog_dir).resolve()

    # initialize step/epoch/summary/early-stopping tracking state
    self.reset()
||||||
|
|
||||||
|
def reset(self) -> None:
    """
    Set base tracking parameters.
    """

    # global sample counter and current epoch (epoch counting restarts
    # from 1 inside `train()`)
    self._step: int = 0
    self._epoch: int = 0
    # per-metric records of (value, global_step) pairs, flushed to
    # TensorBoard by `_summarize()`
    self._summary: dict[str, list[tuple[float, int]]] = defaultdict(list)

    # early-stopping state: latest and best validation losses, epochs
    # without improvement, and a snapshot of the best model weights
    self._val_loss = float("inf")
    self._best_val_loss = float("inf")
    self._stagnant_epochs = 0
    self._best_model_state_dict: dict[str, Any] = {}
||||||
|
|
||||||
|
def train(
    self,
    dataset: BatchedDataset[..., ..., I],
    batch_estimator_map: Callable[[I, Self], K],
    lr: float = 1e-3,
    eps: float = 1e-8,
    max_grad_norm: float | None = None,
    max_epochs: int = 10,
    stop_after_epochs: int = 5,
    batch_size: int = 256,
    val_frac: float = 0.1,
    train_transform: Transform | None = None,
    val_transform: Transform | None = None,
    dataset_split_kwargs: SplitKwargs | None = None,
    dataset_balance_kwargs: BalanceKwargs | None = None,
    dataloader_kwargs: LoaderKwargs | None = None,
    summarize_every: int = 1,
    chkpt_every: int = 1,
    resume_latest: bool = False,
    summary_writer: SummaryWriter | None = None,
) -> Estimator:
    """
    Note: this method attempts to implement a general scheme for passing
    needed items to the estimator's loss function from the dataloader. The
    abstract `Estimator` base only requires the model output be provided
    for any given loss calculation, but concrete estimators will often
    require additional arguments (e.g., labels or length masks, as
    is the case with sequential models). In any case, this method defers
    any further logic to the `loss` method of the underlying estimator, so
    one should take care to synchronize the sample structure with `dataset`
    to match that expected by `self.estimator.loss(...)`.

    On batch_estimator_map:

    Dataloader collate functions are responsible for mapping a collection
    of items into an item of collections, roughly speaking. If items are
    tuples of tensors,

        [
            ( [1, 1], [1, 1] ),
            ( [2, 2], [2, 2] ),
            ( [3, 3], [3, 3] ),
        ]

    the collate function maps back into the item skeleton, producing a
    single tuple of (stacked) tensors

        ( [[1, 1],
           [2, 2],
           [3, 3]],

          [[1, 1],
           [2, 2],
           [3, 3]] )

    This function should map from batches (which should be *item shaped*,
    i.e., have an `I` skeleton, even if stacked items may be different on
    the inside) into estimator keyword arguments (type `K`). The produced
    mapping must include an "inputs" entry, which is used to advance the
    global step counter.

    Parameters:
        lr: learning rate (default: 1e-3)
        eps: adam EPS (default: 1e-8)
        max_grad_norm: when set, clip gradients of each optimizer's
            parameters to this norm before stepping
        max_epochs: maximum number of training epochs
        stop_after_epochs: number of epochs with stagnant validation losses
            to allow before early stopping. If training stops earlier, the
            parameters for the best recorded validation score are loaded
            into the estimator before the method returns. If
            `stop_after_epochs >= max_epochs`, the estimator will train
            over all epochs and return as is, irrespective of validation
            scores.
        batch_size: size of batch to use when training on the provided
            dataset
        val_frac: fraction of dataset to use for validation; when <= 0,
            no validation pass is run
        summarize_every: flush summary records every this many epochs
        chkpt_every: how often model checkpoints should be saved
        resume_latest: resume training from the latest available checkpoint
            in the `chkpt_dir`
        summary_writer: optional preconfigured TensorBoard writer; a new
            one rooted at `self.tblog_dir` is created when omitted
    """

    logger.info("> Begin train loop:")
    logger.info(f"| > {lr=}")
    logger.info(f"| > {eps=}")
    logger.info(f"| > {max_epochs=}")
    logger.info(f"| > {batch_size=}")
    logger.info(f"| > {val_frac=}")
    logger.info(f"| > {chkpt_every=}")
    # NOTE(review): `resume_latest` is logged but no resume logic is
    # visible in this method -- confirm whether it is implemented
    logger.info(f"| > {resume_latest=}")
    logger.info(f"| > with device: {self.device}")
    logger.info(f"| > core count: {os.cpu_count()}")

    # NOTE(review): `dir_prefix` namespaces checkpoints by start time but
    # is not applied to the SummaryWriter path -- confirm intended
    writer: SummaryWriter
    dir_prefix = str(int(time.time()))
    if summary_writer is None:
        writer = SummaryWriter(f"{self.tblog_dir}")
    else:
        writer = summary_writer

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        batch_size,
        val_frac=val_frac,
        train_transform=train_transform,
        val_transform=val_transform,
        dataset_split_kwargs=dataset_split_kwargs,
        dataset_balance_kwargs=dataset_balance_kwargs,
        dataloader_kwargs=dataloader_kwargs,
    )

    # one optimizer per loss term; `loss()` yields terms in matching order
    optimizers = self.estimator.optimizers(lr=lr, eps=eps)

    self._step = 0
    self._epoch = 1  # start from 1 for logging convenience
    while self._epoch <= max_epochs and not self._converged(
        self._epoch, stop_after_epochs
    ):
        print(f"Training epoch {self._epoch}/{max_epochs}...")
        print(f"Stagnant epochs {self._stagnant_epochs}/{stop_after_epochs}...")

        epoch_start_time = time.time()
        # running per-optimizer loss totals for the epoch
        train_loss_sums = []
        self.estimator.train()
        with tqdm(train_loader, unit="batch") as train_epoch:
            for i, batch_data in enumerate(train_epoch):
                est_kwargs = batch_estimator_map(batch_data, self)
                inputs = est_kwargs["inputs"]

                # one-time logging
                if self._step == 0:
                    writer.add_graph(ModelWrapper(self.estimator), est_kwargs)

                # once-per-epoch logging
                if i == 0:
                    self.estimator.epoch_write(
                        writer,
                        step=self._step,
                        val=False,
                        **est_kwargs
                    )

                # `loss()` is a generator: consume one term per optimizer
                train_losses = self.estimator.loss(**est_kwargs)
                train_loss_items = []
                for o_idx, optimizer in enumerate(optimizers):
                    optimizer.zero_grad()
                    train_loss = next(train_losses)

                    # lazily grow the per-optimizer running sums
                    if len(train_loss_sums) <= o_idx:
                        train_loss_sums.append(0.0)

                    train_loss_item = train_loss.item()
                    train_loss_sums[o_idx] += train_loss_item
                    train_loss_items.append(train_loss_item)

                    train_loss.backward()

                    # clip gradients for optimizer's parameters
                    if max_grad_norm is not None:
                        opt_params = self._get_optimizer_parameters(optimizer)
                        clip_grad_norm_(opt_params, max_norm=max_grad_norm)

                    optimizer.step()

                # global step counts samples, not batches
                self._step += len(inputs)

                for train_loss_item, train_loss_sum in zip(
                    train_loss_items,
                    train_loss_sums,
                    strict=True,
                ):
                    train_epoch.set_postfix(loss=f"{train_loss_sum/(i+1):8.2f}")
                    self._add_summary_item("train_loss", train_loss_item)

                estimator_metrics = self.estimator.metrics(**est_kwargs)
                for metric_name, metric_value in estimator_metrics.items():
                    self._add_summary_item(f"train_{metric_name}", metric_value)

        self.estimator.epoch_step()

        for li, train_loss_sum in enumerate(train_loss_sums):
            self._add_summary_item(
                f"train_loss{li}_epoch", train_loss_sum / len(train_loader)
            )

        if val_frac > 0:
            val_loss_sums = []
            self.estimator.eval()
            with tqdm(val_loader, unit="batch") as val_epoch:
                for i, batch_data in enumerate(val_epoch):
                    est_kwargs = batch_estimator_map(batch_data, self)
                    inputs = est_kwargs["inputs"]

                    # once-per-epoch logging
                    if i == 0:
                        self.estimator.epoch_write(
                            writer,
                            step=self._step,
                            val=True,
                            **est_kwargs
                        )

                    val_losses = self.estimator.loss(**est_kwargs)
                    val_loss_items = []
                    for o_idx in range(len(optimizers)):
                        val_loss = next(val_losses)

                        if len(val_loss_sums) <= o_idx:
                            val_loss_sums.append(0.0)

                        val_loss_item = val_loss.item()
                        val_loss_sums[o_idx] += val_loss_item
                        val_loss_items.append(val_loss_item)

                    for val_loss_item, val_loss_sum in zip(
                        val_loss_items,
                        val_loss_sums,
                        strict=True,
                    ):
                        val_epoch.set_postfix(loss=f"{val_loss_sum/(i+1):8.2f}")
                        self._add_summary_item("val_loss", val_loss_item)

                    estimator_metrics = self.estimator.metrics(**est_kwargs)
                    for metric_name, metric_value in estimator_metrics.items():
                        self._add_summary_item(f"val_{metric_name}", metric_value)

            for li, val_loss_sum in enumerate(val_loss_sums):
                self._add_summary_item(
                    f"val_loss{li}_epoch", val_loss_sum / len(val_loader)
                )

            # convergence of multiple losses may be ambiguous
            self._val_loss = sum(val_loss_sums) / len(val_loader)

        self._add_summary_item("epoch_time_sec", time.time() - epoch_start_time)

        if self._epoch % summarize_every == 0:
            self._summarize(writer, self._epoch)

        # save checkpoint
        if self._epoch % chkpt_every == 0:
            self.save_model(
                self._epoch, self.chkpt_dir, dir_prefix
            )

        self._epoch += 1

    return self.estimator
||||||
|
|
||||||
|
def _converged(self, epoch: int, stop_after_epochs: int) -> bool:
    """
    Early-stopping check, called once per epoch before training it.

    Tracks the best validation loss seen so far (snapshotting the model
    weights when it improves) and counts stagnant epochs otherwise. Once
    the stagnant count reaches `stop_after_epochs`, the best snapshot is
    restored onto the estimator and True is returned.
    """

    # the first epoch always counts as an improvement, establishing the
    # initial snapshot
    improved = epoch == 1 or self._val_loss < self._best_val_loss

    if improved:
        self._best_val_loss = self._val_loss
        self._stagnant_epochs = 0
        self._best_model_state_dict = deepcopy(self.estimator.state_dict())
    else:
        self._stagnant_epochs += 1

    if self._stagnant_epochs < stop_after_epochs:
        return False

    # stagnated: roll back to the best recorded weights
    self.estimator.load_state_dict(self._best_model_state_dict)
    return True
||||||
|
|
||||||
|
@staticmethod
def get_dataloaders(
    dataset: BatchedDataset,
    batch_size: int,
    val_frac: float = 0.1,
    train_transform: Transform | None = None,
    val_transform: Transform | None = None,
    dataset_split_kwargs: SplitKwargs | None = None,
    dataset_balance_kwargs: BalanceKwargs | None = None,
    dataloader_kwargs: LoaderKwargs | None = None,
) -> tuple[DataLoader, DataLoader]:
    """
    Create training and validation dataloaders for the provided dataset.

    Parameters:
        dataset: source dataset; balanced in place when
            `dataset_balance_kwargs` is provided
        batch_size: requested batch size; capped at the split's size
        val_frac: fraction of the dataset held out for validation. When
            <= 0, no split is made: the train loader covers the full
            dataset and the returned val loader wraps an empty dataset.
        train_transform: post-transform applied to training items
        val_transform: post-transform applied to validation items
        dataset_split_kwargs: forwarded to `dataset.split(...)`
        dataset_balance_kwargs: forwarded to `dataset.balance(...)`
        dataloader_kwargs: overrides merged into both loaders' kwargs

    Returns:
        `(train_loader, val_loader)` tuple
    """

    if dataset_split_kwargs is None:
        dataset_split_kwargs = {}

    if dataset_balance_kwargs is not None:
        dataset.balance(**dataset_balance_kwargs)

    def _loader_kwargs(split_len: int) -> LoaderKwargs:
        # shared loader defaults; validation is also shuffled to prevent
        # homogeneous val batches
        kwargs: LoaderKwargs = {
            "batch_size": min(batch_size, split_len),
            "num_workers": 0,
            "shuffle": True,
        }
        if dataloader_kwargs is not None:
            kwargs = {**kwargs, **dataloader_kwargs}
        return kwargs

    if val_frac <= 0:
        # no validation split: train on everything. Return an *empty but
        # valid* map-style dataset for the val loader -- a bare abstract
        # `Dataset()` has no `__len__`/`__getitem__` and fails on first
        # use.
        dataset.post_transform = train_transform

        class _EmptyDataset(Dataset):
            def __len__(self) -> int:
                return 0

            def __getitem__(self, index: int) -> None:
                raise IndexError(index)

        return (
            DataLoader(dataset, **_loader_kwargs(len(dataset))),
            DataLoader(_EmptyDataset()),
        )

    train_dataset, val_dataset = dataset.split(
        [1 - val_frac, val_frac],
        **dataset_split_kwargs,
    )

    # Dataset.split() returns light Subset objects of shallow copies of the
    # underlying dataset; can change the transform attribute of both splits
    # w/o overwriting
    train_dataset.post_transform = train_transform
    val_dataset.post_transform = val_transform

    train_loader = DataLoader(
        train_dataset, **_loader_kwargs(len(train_dataset))
    )
    val_loader = DataLoader(val_dataset, **_loader_kwargs(len(val_dataset)))

    return train_loader, val_loader
||||||
|
|
||||||
|
def _summarize(self, writer: SummaryWriter, epoch: int) -> None:
    """
    Flush accumulated summary records to the TB summary writer and print
    a per-metric mean report for the epoch.
    """

    # forward every (value, step) record to TensorBoard while grouping
    # raw values per metric for the console report
    per_metric_values: dict[str, list[float]] = defaultdict(list)
    for metric, records in self._summary.items():
        for recorded_value, recorded_step in records:
            writer.add_scalar(metric, recorded_value, recorded_step)
            per_metric_values[metric].append(recorded_value)

    print(f"==== Epoch [{epoch}] summary ====")
    for metric, metric_values in per_metric_values.items():
        mean_value = torch.tensor(metric_values).mean().item()
        print(f"> ({len(metric_values)}) {metric} :: {mean_value:.2f}")

    writer.flush()

    # start a fresh accumulation window
    self._summary = defaultdict(list)
||||||
|
|
||||||
|
def _get_optimizer_parameters(
    self,
    optimizer: torch.optim.Optimizer,
) -> list[Tensor]:
    """
    Collect every parameter across the optimizer's parameter groups that
    currently has a populated gradient.
    """

    params_with_grads: list[Tensor] = []
    for group in optimizer.param_groups:
        for candidate in group["params"]:
            if candidate.grad is not None:
                params_with_grads.append(candidate)
    return params_with_grads
||||||
|
|
||||||
|
def _add_summary_item(self, name: str, value: float) -> None:
    """
    Record a (value, global_step) pair under `name`; records are flushed
    to TensorBoard later by `_summarize()`.
    """
    self._summary[name].append((value, self._step))
||||||
|
|
||||||
|
def save_model(
    self,
    epoch: int,
    chkpt_dir: str | Path,
    dir_prefix: str,
) -> None:
    """
    Save a model checkpoint to
    `<chkpt_dir>/<dir_prefix>/m_<EstimatorClass>-e_<epoch>.pth`.
    """

    # serialize the state dict into memory, then write it in one shot
    buffer = BytesIO()
    torch.save(self.estimator.state_dict(), buffer)

    target_dir = Path(chkpt_dir, dir_prefix)
    target_dir.mkdir(parents=True, exist_ok=True)

    estimator_name = type(self.estimator).__name__
    target_path = target_dir / f"m_{estimator_name}-e_{epoch}.pth"
    target_path.write_bytes(buffer.getvalue())
||||||
|
|
||||||
|
def load_model(
    self,
    epoch: int,
    chkpt_dir: str,
) -> None:
    """
    Load a model checkpoint from a given epoch.

    Note that this assumes the model was saved via `Trainer.save_model()`,
    and the estimator provided to this `Trainer` instance matches the
    architecture of the checkpoint model being loaded.
    """

    estimator_name = type(self.estimator).__name__
    checkpoint_path = Path(chkpt_dir, f"m_{estimator_name}-e_{epoch}.pth")

    # read the raw bytes and deserialize tensor data only
    state_dict = torch.load(
        BytesIO(checkpoint_path.read_bytes()),
        weights_only=True,
    )
    self.estimator.load_state_dict(state_dict)
||||||
11
trainlib/transform.py
Normal file
11
trainlib/transform.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
class Transform[I]:
|
||||||
|
"""
|
||||||
|
Dataset transform base class.
|
||||||
|
|
||||||
|
In places that directly reference a base ``Transform[I]``, a hint
|
||||||
|
``Callable[[I], I]`` would suffice. This class exists to allow nominal
|
||||||
|
checks for purpose-built transforms.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __call__(self, item: I) -> I:
|
||||||
|
raise NotImplementedError
|
||||||
0
trainlib/utils/__init__.py
Normal file
0
trainlib/utils/__init__.py
Normal file
54
trainlib/utils/job.py
Normal file
54
trainlib/utils/job.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import logging
|
||||||
|
import concurrent
|
||||||
|
from concurrent.futures import Future, as_completed
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
from colorama import Fore, Style
|
||||||
|
|
||||||
|
from mema.util.text import color_text
|
||||||
|
|
||||||
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def process_futures(
    futures: list[Future],
    desc: str | None = None,
    unit: str | None = None,
) -> None:
    """
    Await a collection of futures, tracking completion with a progress bar.

    Each future's result is consumed only to classify the outcome as a
    success, cancellation, or error; the results themselves are discarded.
    Cancellations and errors are logged, never re-raised.

    Parameters:
        futures: futures to await
        desc: progress bar description (default: "Awaiting futures")
        unit: progress bar unit label (default: "it")
    """

    if desc is None:
        desc = "Awaiting futures"

    if unit is None:
        unit = "it"

    # completion counters by outcome
    success = 0
    cancelled = 0
    errored = 0
    submitted = len(futures)
    progress_bar = tqdm(
        total=len(futures),
        desc=f"{desc} [submitted {len(futures)}]",
        unit=unit,
    )

    for future in as_completed(futures):
        try:
            future.result()
            success += 1
        except concurrent.futures.CancelledError as e:
            cancelled += 1
            logger.error(f'Future cancelled; "{e}"')
        except Exception as e:
            errored += 1
            logger.warning(f'Future failed with unknown exception "{e}"')

        # refresh the bar with colorized per-outcome counts
        suc_txt = color_text(f"{success}", Fore.GREEN)
        can_txt = color_text(f"{cancelled}", Fore.YELLOW)
        err_txt = color_text(f"{errored}", Fore.RED)
        tot_txt = color_text(f"{success+cancelled+errored}", Style.BRIGHT)
        progress_bar.set_description(
            f"{desc} [{tot_txt} / {submitted} | {suc_txt} {can_txt} {err_txt}]"
        )
        progress_bar.update(n=1)

    progress_bar.close()
||||||
25
trainlib/utils/module.py
Normal file
25
trainlib/utils/module.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
class ModelWrapper(nn.Module):
    """
    Adapter that calls a keyword-argument model via a single positional
    dict.

    Used when a caller can only pass positional inputs (e.g. graph
    tracing via `SummaryWriter.add_graph`) to a model whose forward
    expects keyword arguments.
    """

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
        self.model = model

    def forward(self, kwargs):
        # splat the single dict argument into the wrapped model's forward
        return self.model(**kwargs)
||||||
|
|
||||||
|
|
||||||
|
def get_grad_norm(model: nn.Module, p: int = 2) -> float:
    """
    Compute the p-norm over all gradients currently populated on `model`'s
    trainable parameters.

    Parameters without a gradient (or with `requires_grad=False`)
    contribute nothing; with no gradients at all the norm is 0.0.
    """

    total = 0.0
    for weight in model.parameters():
        if weight.requires_grad and weight.grad is not None:
            total += float(torch.abs(weight.grad).pow(p).sum().item())

    return total ** (1 / p)
||||||
|
|
||||||
8
trainlib/utils/text.py
Normal file
8
trainlib/utils/text.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from colorama import Style
|
||||||
|
|
||||||
|
|
||||||
|
def color_text(text: str, *colorama_args: Any) -> str:
    """
    Wrap `text` in the given colorama codes, resetting all styling after.
    """
    prefix = "".join(colorama_args)
    return f"{prefix}{text}{Style.RESET_ALL}"
||||||
|
|
||||||
52
trainlib/utils/type.py
Normal file
52
trainlib/utils/type.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
from typing import Any, TypedDict
|
||||||
|
from collections.abc import Callable, Iterable
|
||||||
|
|
||||||
|
from torch import Tensor
|
||||||
|
from torch.utils.data.sampler import Sampler
|
||||||
|
|
||||||
|
from mema.dataset import BatchedDataset
|
||||||
|
|
||||||
|
|
||||||
|
class LoaderKwargs(TypedDict, total=False):
    """
    Optional keyword arguments forwarded to `torch.utils.data.DataLoader`.
    """

    batch_size: int
    shuffle: bool
    sampler: Sampler | Iterable | None
    batch_sampler: Sampler[list] | Iterable[list] | None
    num_workers: int
    collate_fn: Callable[[list], Any]
    pin_memory: bool
    drop_last: bool
    timeout: float
    worker_init_fn: Callable[[int], None]
    multiprocessing_context: object
    generator: object
    prefetch_factor: int
    persistent_workers: bool
    pin_memory_device: str
    in_order: bool
||||||
|
|
||||||
|
|
||||||
|
class SplitKwargs(TypedDict, total=False):
    """
    Optional keyword arguments forwarded to `BatchedDataset.split()`.
    """

    dataset: BatchedDataset | None
    by_attr: str | list[str | None] | None
    shuffle_strata: bool
|
||||||
|
|
||||||
|
class BalanceKwargs(TypedDict, total=False):
    """
    Optional keyword arguments forwarded to `BatchedDataset.balance()`.
    """

    by_attr: str | list[str | None] | None
    split_min_sizes: list[int] | None
    split_max_sizes: list[int] | None
    shuffle_strata: bool
||||||
|
|
||||||
|
|
||||||
|
class OptimizerKwargs(TypedDict, total=False):
    """
    Optional keyword arguments for Adam-family torch optimizers (used with
    `torch.optim.AdamW` in estimator `optimizers()` implementations).
    """

    lr: float | Tensor
    betas: tuple[float | Tensor, float | Tensor]
    eps: float
    weight_decay: float
    amsgrad: bool
    maximize: bool
    foreach: bool | None
    capturable: bool
    differentiable: bool
    fused: bool | None
|
||||||
Reference in New Issue
Block a user