update module docs and sphinx config

2026-03-07 19:46:00 -08:00
parent e867bc0e7f
commit c2e4294c8c
11 changed files with 241 additions and 238 deletions
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 # Overview
-Package summary goes here, ideally with a diagram
+Minimal framework for ML modeling, supporting advanced dataset operations and
 streamlined training workflows.
 # Install
 The `trainlib` package can be installed from PyPI:
@@ -85,7 +86,7 @@ pip install trainlib
  class SequenceDataset[I, **P](HomogenousDataset[int, I, I, P]):
      ...
-  class TupleDataset[I](SequenceDataset[tuple[I, ...], ??]):
+  class TupleDataset[I](SequenceDataset[tuple[I, ...], "?"]):
      ...
  ```
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -3,6 +3,31 @@
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 # -- Styling: type hints ------------------------------------------------------
 # There are several possible style combinations for rendering types, none of
 # which are optimal in my view. The main switches are:
 # 
 # - Parameter type hints in the signature vs in the separate parameter list
 # - Show type hints as plaintext vs rendered HTML elements
 #
 # The `sphinx_autodoc_typehints` extension enables more context-aware
 # rendering, but it's often way too explicit (e.g., unwrapping type variables)
 # and makes things difficult to read. It does, however, allow for automatic
 # inclusion of default values, which is nice.
 #
 # I'd like type hints to be rendered in an inline code element, but that
 # doesn't happen by default in either case unless you render them in the
 # signature. This is sloppy, however, often just a jumbled mess of parameter
 # names and types. The current preferred option is to just use the native
 # `autodoc` settings for rendering type hints, leaving them out of the
 # signature (for easy heading readability). Type hints in the parameter list
 # are also as short as possible, not rendered crazily (by default this is in
 # italics; not my favorite but it's what we have). No
 # `sphinx_autodoc_typehints` needed at this point; you can toggle it if you
 # want automatic default values or different formatting for type hints.
 # -- Project information ------------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
@@ -10,22 +35,32 @@ project = "trainlib"
 copyright = "2026, Sam Griesemer"
 author = "Sam Griesemer"
 # -- General configuration ----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 extensions = [
    "sphinx.ext.autodoc",
    # enables a directive to be specified manually that gathers module/object
    # summary details in a table
    "sphinx.ext.autosummary",
    # allow viewing source in the HTML pages
    "sphinx.ext.viewcode",
    # only really applies to manual docs; docstrings still need RST-like
    "myst_parser",
    # enables Google-style docstring formats
    "sphinx.ext.napoleon",
-    # external extension that allows arg types to be inferred by type hints
+
-    "sphinx_autodoc_typehints",
+    # external extension that allows arg types to be inferred by type hints;
    # without this, type hints show up inside method signatures as plaintext,
    # but when enabled they are pulled into the parameter/description block and
    # rendered as native nested markup. What's best for a given package may
    # vary.
    # "sphinx_autodoc_typehints",
 ]
 autosummary_generate = True
 autosummary_imported_members = True
@@ -39,10 +74,37 @@ templates_path = ["_templates"]
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 # -- Options for autodoc ------------------------------------------------------
 # class signatures show up only in __init__ rather than at the class header;
 # generally cleaner, avoids redundancy
 autodoc_class_signature = "separated"
 # if `sphinx_autodoc_typehints` extension is enabled, this is redundant: type
 # hints are rendered natively and already show up in the parameter block. If
 # it's disabled, this setting will do the same job of moving the types to the
 # parameter block, but it renders them in plaintext (with links to in-package
 # type refs).
 autodoc_typehints = "description"  # "signature"
 autodoc_typehints_format = "short"
 autodoc_preserve_defaults = True
 autodoc_use_type_comments = False
 python_use_unqualified_type_names = True
 # push parameters to their own lines in the signature block
 # python_maximum_signature_line_length = 60
 # -- Options for autodoc_typehints --------------------------------------------
 # always_use_bars_union = True  # always on for Python 3.14+
 # typehints_defaults = "braces-after"  # render defaults in param block
 # typehints_use_signature = False  # False is default; enable if wanted in sig
 # always_document_param_types = True  # show types even when not in docstring
 # -- Options for HTML output --------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-html_theme = "furo"
+html_theme = "furo"  # "pydata_sphinx_theme"
 html_static_path = ["_static"]
 # html_sidebars = {
--- a/doc/index.md
+++ b/doc/index.md
@@ -1,29 +1,37 @@
 # `trainlib` package docs 
 {ref}`genindex`
 {ref}`modindex`
 {ref}`search`
 ```{eval-rst}
 .. autosummary::
   :nosignatures:
   :recursive:
   :caption: Modules
-    # list modules here for quick links
+    trainlib.dataset
    trainlib.domain
    trainlib.estimator
    trainlib.trainer
    trainlib.transform
 ```
 ```{toctree}
 :maxdepth: 3
 :caption: Autoref
 :hidden:
-_autoref/index.rst
+_autoref/trainlib.rst
 ```
 ```{toctree}
 :maxdepth: 3
 :caption: Contents
 :hidden:
 reference/documentation/index
 reference/site/index
 ```
 ```{include} ../README.md
 :heading-offset: 1
 ```
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "trainlib"
-version = "0.1.1"
+version = "0.1.2"
 description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
 requires-python = ">=3.13"
 authors = [
@@ -41,6 +41,7 @@ dev = [
 ]
 doc = [
    "furo",
    # "pydata-sphinx-theme",
    "myst-parser",
    "sphinx",
    "sphinx-togglebutton",
--- a/trainlib/dataset.py
+++ b/trainlib/dataset.py
@@ -1,129 +1,62 @@
 """
-.. admonition:: Marginalizing out the modality layer
+Domain-generic dataset base with attribute-based splitting and balancing
-    With ``domain`` being an instance variable, one possible interpretation of
+**Marginalizing out the modality layer**
    the object structures here is that one could completely abstract away
    the domain model, defining only item structures and processing data. You
    could have a single dataset definition for a particular concrete dataset,
    and so long as we're talking about the same items, it can be instantiated
    using *any domain*. You wouldn't need specific subclasses for disk or
    network or in-memory structures; you can tell it directly at runtime.
-    That's an eventually possibility, anyway. As it stands, however, this is
+With ``domain`` being an instance variable, one possible interpretation of
-    effectively impossible:
+the object structures here is that one could completely abstract away
 the domain model, defining only item structures and processing data. You
 could have a single dataset definition for a particular concrete dataset,
 and so long as we're talking about the same items, it can be instantiated
 using *any domain*. You wouldn't need specific subclasses for disk or
 network or in-memory structures; you can tell it directly at runtime.
-    You can't easily abstract the batch-to-item splitting process, i.e.,
+That's an eventual possibility, anyway. As it stands, however, this is
-    ``_process_batch_data()``. A list-based version of the dataset you're
+effectively impossible:
    trying to define might have an individual item tuple at every index,
    whereas a disk-based version might have tuples batched across a few files.
    This can't reliably be inferred, nor can it be pushed to the
    ``Domain``-level without needing equal levels of specialization (you'd just
    end up needing the exact same structural distinctions in the ``Domain``
    hierarchy). So *somewhere* you need a batch splitting implementation that
    is both item structure-dependent *and* domain-dependent...the question is
    how dynamic you're willing to be about where it comes from. Right now, we
    require this actually be defined in the ``_process_batch_data()`` method,
    meaning you'll need a specific ``Dataset`` class for each domain you want
    to support (e.g., ``MNISTDisk``, ``MNISTList``, ``MNISTNetwork``, etc), or
    at least for each domain where "interpreting" a batch could possibly
    differ. This is a case where the interface is all that enforces a
    distinction: if you've got two domains that can be counted on to yield
    batches in the exact same way and can use the same processing, then you
    could feasibly provide ``Domain`` objects from either at runtime and have
    no issues. We're "structurally blind" to any differentiation beyond the URI
    and resource types by design, so two different domain implementations with
    the same type signature ``Domain[U, R]`` should be expected to work fine at
    runtime (again, so long as they don't also need different batch
    processing), but that's not affording us much flexibility, i.e., most of
    the time we'll still be defining new dataset classes for each domain.
-    I initially flagged this as feasible, however, because one could imagine
+You can't easily abstract the batch-to-item splitting process, i.e.,
-    accepting a batch processing method upon instantiation rather than
+``_process_batch_data()``. A list-based version of the dataset you're trying to
-    structurally bolting it into the ``Dataset`` definition. This would require
+define might have an individual item tuple at every index, whereas a disk-based
-    knowledge of the item structure ``I`` as well as the ``Domain[U, R]``, so
+version might have tuples batched across a few files. This can't reliably be
-    such a function will always have to be ``(I, U, R)``-dependent. It
+inferred, nor can it be pushed to the ``Domain``-level without needing equal
-    nevertheless would take out some of the pain of having to define new
+levels of specialization (you'd just end up needing the exact same structural
-    dataset classes; instead, you'd just need to define the batch processing
+distinctions in the ``Domain`` hierarchy). So *somewhere* you need a batch
-    method. I see this as a worse alternative to just defining *inside* a safe
+splitting implementation that is both item structure-dependent *and*
-    context like a new dataset class: you know the types you have to respect,
+domain-dependent...the question is how dynamic you're willing to be about where
-    and you stick that method exactly in a context where it's understood.
+it comes from. Right now, we require this actually be defined in the
-    Freeing this up doesn't lighten the burden of processing logic, it just
+``_process_batch_data()`` method, meaning you'll need a specific ``Dataset``
-    changes *when* it has to be provided, and that's not worth much (to me) in
+class for each domain you want to support (e.g., ``MNISTDisk``, ``MNISTList``,
-    this case given the bump in complexity. (Taking this to the extreme: you
+``MNISTNetwork``, etc), or at least for each domain where "interpreting" a
-    could supply *all* of an object's methods "dynamically" and glue them
+batch could possibly differ. This is a case where the interface is all that
-    together at runtime so long as they all played nice. But wherever you were
+enforces a distinction: if you've got two domains that can be counted on to
-    "laying them out" beforehand is exactly the job of a class to begin with,
+yield batches in the exact same way and can use the same processing, then you
-    so you don't end up with anything more dynamic. All we're really discussing
+could feasibly provide ``Domain`` objects from either at runtime and have no
-    here is pushing around unavoidable complexity inside and outside of the
+issues. We're "structurally blind" to any differentiation beyond the URI and
-    "class walls," and in the particular case of ``_process_batch_data()``, it
+resource types by design, so two different domain implementations with the same
-    feels much better when it's on the inside.)
+type signature ``Domain[U, R]`` should be expected to work fine at runtime
 (again, so long as they don't also need different batch processing), but that's
 not affording us much flexibility, i.e., most of the time we'll still be
 defining new dataset classes for each domain.
-.. admonition:: Holding area
+I initially flagged this as feasible, however, because one could imagine
-
+accepting a batch processing method upon instantiation rather than structurally
-    .. code-block:: python
+bolting it into the ``Dataset`` definition. This would require knowledge of the
-
+item structure ``I`` as well as the ``Domain[U, R]``, so such a function will
-        @abstractmethod
+always have to be ``(I, U, R)``-dependent. It nevertheless would take out some
-        def _get_uri_groups(self) -> Iterable[tuple[U, ...]]:
+of the pain of having to define new dataset classes; instead, you'd just need
-            Get URI groups for each batch.
+to define the batch processing method. I see this as a worse alternative to
-
+just defining *inside* a safe context like a new dataset class: you know the
-            If there's more than one URI per batch (e.g., a data file and a
+types you have to respect, and you stick that method exactly in a context where
-            metadata file), zip the URIs such that we have a tuple of URIs per
+it's understood. Freeing this up doesn't lighten the burden of processing
-            batch.
+logic, it just changes *when* it has to be provided, and that's not worth much
-
+(to me) in this case given the bump in complexity. (Taking this to the extreme:
-            Note that this effectively defines the index style over batches in
+you could supply *all* of an object's methods "dynamically" and glue them
-            the attached domain. We get an ``int -> tuple[U, ...]`` map that
+together at runtime so long as they all played nice. But wherever you were
-            turns batch indices into URIs that can be read under the domain.
+"laying them out" beforehand is exactly the job of a class to begin with, so
-            ``get_batch()`` turns an integer index into its corresponding
+you don't end up with anything more dynamic. All we're really discussing here
-            ``tuple[U, ...]``, reading the resources with ``_read_resources()``
+is pushing around unavoidable complexity inside and outside of the "class
-            in the tuple, treating them as providers of batched data.
+walls," and in the particular case of ``_process_batch_data()``, it feels much
-            ``_read_resources()`` passes through to the attached domain logic,
+better when it's on the inside.)
            which, although common, need not supply an explicit iterable of
            batch items: we just access items with ``__getitem__()`` and may
            ask for ``__len__``. So the returned URI group collection (this
            method) does need to be iterable to measure the number of batches,
            but the batch objects that are ultimately produced by these URI
            groups need not be iterables themselves.
            raise NotImplementedError
        def _read_resources(
            self,
            uri_group: tuple[U, ...],
            batch_index: int
        ) -> tuple[R, ...]:
            Read batch files at the provided paths.
            This method should operate on a single tuple from the list of batch
            tuples returned by the ``_get_uri_groups()`` method. That is, it
            reads all of the resources for a single batch and returns a tuple
            of the same size with their contents.
            Note: the dependence on a batch index is mostly here to make
            multi-dataset composition easier later. In-dataset, you don't need
            to know the batch index to simply process URIs, but across
            datasets you need it to find out the origin of the batch (and
            process those URIs accordingly).
            return tuple(self.domain.read(uri) for uri in uri_group)
    .. code-block:: python
        # pulling the type variable out of the inline generic b/c `ty` has
        # trouble understanding bound type variables in subclasses
        # (specifically with Self@)
        T = TypeVar("T", bound=NamedTuple)
        class NamedTupleDataset[I](Dataset):
            def __init__(self, data_list: list[I]) -> None:
                self.data_list = data_list
            def __len__(self) -> int:
                return len(self.data_list)
            def __getitem__(self, index: int) -> I:
                return self.data_list[index]
 """
 import math
@@ -164,74 +97,75 @@ class BatchedDataset[U, R, I](Dataset):
    which are used to concretize a domain ``Domain[U, R]``), and an item type
    ``I``.
-    .. admonition:: Batch and item processing flow
+    **Batch and item processing flow**
-        .. code-block:: text
+    .. code-block:: text
-            Domain -> [U] :: self._batch_uris = list(domain)
+        Domain -> [U] :: self._batch_uris = list(domain)
-                Grab all URIs from Domain iterators. This is made concrete
+            Grab all URIs from Domain iterators. This is made concrete early to
-                early to allow for Dataset sizing, and we need a Sequence
+            allow for Dataset sizing, and we need a Sequence representation to
-                representation to map integer batch indices into Domains, i.e.,
+            map integer batch indices into Domains, i.e., when getting the
-                when getting the corresponding URI:
+            corresponding URI:
-                batch_uri = self._batch_uris[batch_index]
+            batch_uri = self._batch_uris[batch_index]
-                We let Domains implement iterators over their URIs, but
+            We let Domains implement iterators over their URIs, but explicitly
-                explicitly exhaust when initializing Datasets.
+            exhaust when initializing Datasets.
-            U -> R        :: batch_data = self.domain[batch_uri]
+        U -> R        :: batch_data = self.domain[batch_uri]
-                Retrieve resource from domain. Resources are viewed as batched
+            Retrieve resource from domain. Resources are viewed as batched
-                data, even if only wrapping single items (happens in trivial
+            data, even if only wrapping single items (happens in trivial
-                settings).
+            settings).
-            R -> [I]      :: self._process_batch_data(batch_data, batch_index)
+        R -> [I]      :: self._process_batch_data(batch_data, batch_index)
-                Possibly domain-specific batch processing of resource data into
+            Possibly domain-specific batch processing of resource data into
-                explicit Sequence-like structures of items, each of which is
+            explicit Sequence-like structures of items, each of which is
-                subject to the provided pre_transform. Processed batches at
+            subject to the provided pre_transform. Processed batches at this
-                this stage are cached (if enabled).
+            stage are cached (if enabled).
-            [I] -> I      :: self.get_batch(batch_index)[index_in_batch]
+        [I] -> I      :: self.get_batch(batch_index)[index_in_batch]
-                Select individual items from batches in _get_item. At this
+            Select individual items from batches in _get_item. At this stage,
-                stage, items are in intermediate states and pulled from the
+            items are in intermediate states and pulled from the cached
-                cached batches.
+            batches.
-            I -> I        :: self._process_item_data(item_data, index)
+        I -> I        :: self._process_item_data(item_data, index)
-                
+            
-                Produce final items with __getitem__, getting intermediate
+            Produce final items with __getitem__, getting intermediate items
-                items via _get_item and applying the provided post_transform.
+            via _get_item and applying the provided post_transform.
    .. note::
-        Note^1: as far as positioning, this class is meant to play nice with
+
-        PyTorch DataLoaders, hence the inheritance from ``torch.Dataset``. The
+        As far as positioning, this class is meant to play nice with PyTorch
-        value add for this over the ``torch.Dataset`` base is almost entirely
+        DataLoaders, hence the inheritance from ``torch.Dataset``. The value
-        in the logic it implements to map out of *batched resources* that are
+        add for this over the ``torch.Dataset`` base is almost entirely in the
-        holding data, and flattening it out into typical dataset items. There
+        logic it implements to map out of *batched resources* that are holding
-        are also some QoL features when it comes to splitting and balancing
+        data, and flattening it out into typical dataset items. There are also
-        samples.
+        some QoL features when it comes to splitting and balancing samples.
    .. note::
        Even though ``Domains`` implement iterators over their URIs, this
-        doesn't imply a ``BatchedDataset`` is iterable. This just means we
+        doesn't imply a ``BatchedDataset`` is iterable. This just means we can
-        can walk over the resources that provide data, but we don't
+        walk over the resources that provide data, but we don't necessarily
-        necessarily presuppose an ordered walk over samples within batches.
+        presuppose an ordered walk over samples within batches. Point being:
-        Point being: ``torch.Dataset``, not ``torch.IterableDataset``, is
+        ``torch.Dataset``, not ``torch.IterableDataset``, is the appropriate
-        the appropriate superclass, even when we're working around iterable
+        superclass, even when we're working around iterable ``Domains``.
        ``Domains``.
    .. note::
        Transforms are expected to operate on ``I``-items and produce
-        ``I``-items. They shouldn't be the "introducers" of ``I`` types
+        ``I``-items. They shouldn't be the "introducers" of ``I`` types from
-        from some other intermediate representation, nor should they map
+        some other intermediate representation, nor should they map from ``I``
-        from ``I`` to something else. Point being: the dataset definition
+        to something else. Point being: the dataset definition should be able
-        should be able to map resources ``R`` to ``I`` without a transform:
+        to map resources ``R`` to ``I`` without a transform: that much should
-        that much should be baked into the class definition. If you find
+        be baked into the class definition. If you find you're expecting the
-        you're expecting the transform to do that for you, you should
+        transform to do that for you, you should consider pulling in some
-        consider pulling in some common structure across the allowed
+        common structure across the allowed transforms and make it a fixed part
-        transforms and make it a fixed part of the class.
+        of the class.
    """
    def __init__(
@@ -397,10 +331,10 @@ class BatchedDataset[U, R, I](Dataset):
        they're always connected, and nothing would notice if you waited
        between steps. The only way this could matter is if you split the
        resource reading and batch processing steps across methods, but when it
-        actually comes to accessing/caching the batch, you'd have to expand
+        actually comes to accessing/caching the batch, you'd have to expand any
-        any delayed reads here. There's no way around needing to see all batch
+        delayed reads here. There's no way around needing to see all batch data
-        data at once here, and we don't want to make that ambiguous: ``list``
+        at once here, and we don't want to make that ambiguous: ``list`` output
-        output type it is.
+        type it is.
        Parameters:
            batch_index: index of batch
@@ -458,27 +392,31 @@ class BatchedDataset[U, R, I](Dataset):
        """
        Split dataset into fractional pieces by data attribute.
-        If `by_attr` is None, recovers typical fractional splitting of dataset
+        If ``by_attr`` is None, recovers typical fractional splitting of
-        items, partitioning by size. Using None anywhere will index each item
+        dataset items, partitioning by size. Using None anywhere will index
-        into its own bucket, i.e., by its index. For instance,
+        each item into its own bucket, i.e., by its index. For instance:
-        - by_attr=["color"] -> {("red", 1), ("red", 2)},
+        - Splits on the attribute such that each subset contains entire strata
-                               {("blue", 1), ("blue", 2)}
+          of the attribute. "Homogeneity within clusters:"
-          Splits on the attribute such that each subset contains entire strata
+          .. code-block::
          of the attribute. "Homogeneity within clusters"
-        - `by_attr=["color", None]` -> {("red", 1), ("blue", 1)},
+              by_attr=["color"] -> {("red", 1), ("red", 2)},
-                                       {("red", 2), ("blue", 2)}
+                                   {("blue", 1), ("blue", 2)}
-          Stratifies by attribute and then splits "by index" within, uniformly
+        - Stratifies by attribute and then splits "by index" within, uniformly
          grabbing samples across strata to form new clusters. "Homogeneity
          across clusters"
          .. code-block::
              by_attr=["color", None] -> {("red", 1), ("blue", 1)},
                                         {("red", 2), ("blue", 2)}
        Note that the final list of Subsets returned are built from shallow
-        copies of the underlying dataset (i.e., `self`) to allow manual
+        copies of the underlying dataset (i.e., ``self``) to allow manual
        intervention with dataset attributes (e.g., setting the splits to have
-        different `transform`s). This is subject to possibly unexpected
+        different ``transforms``). This is subject to possibly unexpected
        behavior if re-caching data or you need a true copy of all data in
        memory, but should otherwise leave most interactions unchanged.
@@ -645,6 +583,8 @@ class BatchedDataset[U, R, I](Dataset):
        shuffle_strata: bool = True,
    ) -> list[int]:
        """
        Recursive balancing of items by attribute.
        .. note::
            Behavior is a little odd for nested behavior; not exactly perfectly
@@ -813,9 +753,10 @@ class CompositeBatchedDataset[U, R, I](BatchedDataset[U, R, I]):
    """
    Dataset class for wrapping individual datasets.
-    Note: because this remains a valid ``BatchedDataset``, we re-thread the
+    .. note::
-    generic type variables through the set of composed datasets. That is, they
+        Because this remains a valid ``BatchedDataset``, we re-thread the
-    must have a common domain type ``Domain[U, R]``.
+        generic type variables through the set of composed datasets. That is,
        they must have a common domain type ``Domain[U, R]``.
    """
    def __init__(
--- a/trainlib/datasets/disk.py
+++ b/trainlib/datasets/disk.py
@@ -12,39 +12,38 @@ class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
    """
    The following line is to satisfy the type checker, which
-    1. Can't recognize an appropriately re-typed constructor arg like
+    1. Can't recognize an appropriately re-typed constructor arg like::
       def __init__(
           self,
           domain: DiskDomain,
           ...
        ): ...
-       This *does* match the parent generic for the U=Path, R=bytes context
+           def __init__(
               self,
               domain: DiskDomain,
               ...
           ): ...
-       def __init__(
+       This *does* match the parent generic for the ``U=Path``, ``R=bytes``
-           self,
+       context::
-           domain: Domain[U, R],
+
-           ...
+           def __init__(
-        ): ...
+               self,
               domain: Domain[U, R],
               ...
           ): ...
       but the type checker doesn't see this.
    2. "Lifted" type variables out of generics can't be used as upper bounds,
       at least not without throwing type checker warnings (thanks to PEP695).
-       So I'm not allowed to have
+       So I'm not allowed to have::
-       ```
+           class BatchedDataset[U, R, D: Domain[U, R]]:
-       class BatchedDataset[U, R, D: Domain[U, R]]:
+               ...
           ...
       ```
-       which could bring appropriately dynamic typing for ``Domain``s, but is
+       which could bring appropriately dynamic typing for ``Domains``, but is
       not a sufficiently concrete upper bound.
    So: we settle for a class-level type declaration, which, despite not being
    technically appropriately scoped, isn't harming anything and satisfies
-    ``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``.
+    ``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``).
    """
    domain: DiskDomain
--- a/trainlib/domain.py
+++ b/trainlib/domain.py
@@ -1,24 +1,5 @@
 """
-Defines a knowledge domain. Wraps a Dataset / Simulator / Knowledge
+Generic URI-resource mapping structure
 Downstream exploration might include
 - Calibrating Simulator / Knowledge with a Dataset
 - Amending Dataset with Simulator / Knowledge
 - Positioning Knowledge within Simulator context
  * Where to replace Simulator subsystem with Knowledge?
 Other variations:
 - Multi-fidelity simulators
 - Multi-scale models
 - Multi-system
 - Incomplete knowledge / divergence among sources
 Questions:
 - Should Simulator / Knowledge be unified as one (e.g., "Expert")
 """
 from collections.abc import Mapping, Iterator, Sequence
--- a/trainlib/estimator.py
+++ b/trainlib/estimator.py
@@ -1,4 +1,6 @@
 """
 Base class for trainable models
 Development note
 I'd rather lay out bare args and kwargs in the estimator methods, but the
--- a/trainlib/trainer.py
+++ b/trainlib/trainer.py
@@ -1,3 +1,7 @@
 """
 Core interface for training ``Estimators`` with ``Datasets``
 """
 import os
 import time
 import logging
--- a/trainlib/transform.py
+++ b/trainlib/transform.py
@@ -1,3 +1,7 @@
 """
 Transform base for dataset records
 """
 class Transform[I]:
    """
    Dataset transform base class.
--- a/uv.lock
+++ b/uv.lock
@@ -1637,7 +1637,7 @@ wheels = [
 [[package]]
 name = "trainlib"
-version = "0.1.1"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
    { name = "colorama" },