update module docs and sphinx config
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
# Overview
|
# Overview
|
||||||
Package summary goes here, ideally with a diagram
|
Minimal framework for ML modeling, supporting advanced dataset operations and
|
||||||
|
streamlined training workflows.
|
||||||
|
|
||||||
# Install
|
# Install
|
||||||
The `trainlib` package can be installed from PyPI:
|
The `trainlib` package can be installed from PyPI:
|
||||||
@@ -85,7 +86,7 @@ pip install trainlib
|
|||||||
class SequenceDataset[I, **P](HomogenousDataset[int, I, I, P]):
|
class SequenceDataset[I, **P](HomogenousDataset[int, I, I, P]):
|
||||||
...
|
...
|
||||||
|
|
||||||
class TupleDataset[I](SequenceDataset[tuple[I, ...], ??]):
|
class TupleDataset[I](SequenceDataset[tuple[I, ...], "?"]):
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
68
doc/conf.py
68
doc/conf.py
@@ -3,6 +3,31 @@
|
|||||||
# For the full list of built-in configuration values, see the documentation:
|
# For the full list of built-in configuration values, see the documentation:
|
||||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
|
||||||
|
# -- Styling: type hints ------------------------------------------------------
|
||||||
|
# There are several possible style combinations for rendering types, none of
|
||||||
|
# which are optimal in my view. The main switches are:
|
||||||
|
#
|
||||||
|
# - Parameter type hints in the signature vs in the separate parameter list
|
||||||
|
# - Show type hints as plaintext vs rendered HTML elements
|
||||||
|
#
|
||||||
|
# The `sphinx_autodoc_typehints` extension enables more context-aware
|
||||||
|
# rendering, but it's often way too explicit (e.g., unwrapping type variables)
|
||||||
|
# and makes things difficult to read. It does, however, allow for automatic
|
||||||
|
# inclusion of default values, which is nice.
|
||||||
|
#
|
||||||
|
# I'd like type hints to be rendered in an inline code element, but that
|
||||||
|
# doesn't happen by default in either case unless you render them in the
|
||||||
|
# signature. This is sloppy, however, often just a jumbled mess of parameter
|
||||||
|
# names and types. The current preferred option is to just use the native
|
||||||
|
# `autodoc` settings for rendering type hints, leaving them out of the
|
||||||
|
# signature (for easy heading readability). Type hints in the parameter list
|
||||||
|
# are also as short as possible, not rendered crazily (by default this is in
|
||||||
|
# italics; not my favorite but it's what we have). No
|
||||||
|
# `sphinx_autodoc_typehints` needed at this point; you can toggle it if you
|
||||||
|
# want automatic default values or different formatting for type hints.
|
||||||
|
|
||||||
|
|
||||||
# -- Project information ------------------------------------------------------
|
# -- Project information ------------------------------------------------------
|
||||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||||
|
|
||||||
@@ -10,22 +35,32 @@ project = "trainlib"
|
|||||||
copyright = "2026, Sam Griesemer"
|
copyright = "2026, Sam Griesemer"
|
||||||
author = "Sam Griesemer"
|
author = "Sam Griesemer"
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ----------------------------------------------------
|
# -- General configuration ----------------------------------------------------
|
||||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||||
|
|
||||||
extensions = [
|
extensions = [
|
||||||
"sphinx.ext.autodoc",
|
"sphinx.ext.autodoc",
|
||||||
|
|
||||||
# enables a directive to be specified manually that gathers module/object
|
# enables a directive to be specified manually that gathers module/object
|
||||||
# summary details in a table
|
# summary details in a table
|
||||||
"sphinx.ext.autosummary",
|
"sphinx.ext.autosummary",
|
||||||
|
|
||||||
# allow viewing source in the HTML pages
|
# allow viewing source in the HTML pages
|
||||||
"sphinx.ext.viewcode",
|
"sphinx.ext.viewcode",
|
||||||
|
|
||||||
# only really applies to manual docs; docstrings still need RST-like
|
# only really applies to manual docs; docstrings still need RST-like
|
||||||
"myst_parser",
|
"myst_parser",
|
||||||
|
|
||||||
# enables Google-style docstring formats
|
# enables Google-style docstring formats
|
||||||
"sphinx.ext.napoleon",
|
"sphinx.ext.napoleon",
|
||||||
# external extension that allows arg types to be inferred by type hints
|
|
||||||
"sphinx_autodoc_typehints",
|
# external extension that allows arg types to be inferred by type hints;
|
||||||
|
# without this, type hints show up inside method signatures as plaintext,
|
||||||
|
# but when enabled they are pulled into the parameter/description block and
|
||||||
|
# rendered as native nested markup. What's best for a given package may
|
||||||
|
# vary.
|
||||||
|
# "sphinx_autodoc_typehints",
|
||||||
]
|
]
|
||||||
autosummary_generate = True
|
autosummary_generate = True
|
||||||
autosummary_imported_members = True
|
autosummary_imported_members = True
|
||||||
@@ -39,10 +74,37 @@ templates_path = ["_templates"]
|
|||||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for autodoc ------------------------------------------------------
|
||||||
|
# class signatures show up only in __init__ rather than at the class header;
|
||||||
|
# generally cleaner, avoids redundancy
|
||||||
|
autodoc_class_signature = "separated"
|
||||||
|
|
||||||
|
# if `sphinx_autodoc_typehints` extension is enabled, this is redundant: type
|
||||||
|
# hints are rendered natively and already show up in the parameter block. If
|
||||||
|
# it's disabled, this setting will do the same job of moving the types to the
|
||||||
|
# parameter block, but it renders them in plaintext (with links to in-package
|
||||||
|
# type refs).
|
||||||
|
autodoc_typehints = "description" # "signature"
|
||||||
|
autodoc_typehints_format = "short"
|
||||||
|
autodoc_preserve_defaults = True
|
||||||
|
autodoc_use_type_comments = False
|
||||||
|
python_use_unqualified_type_names = True
|
||||||
|
|
||||||
|
# push parameters to their own lines in the signature block
|
||||||
|
# python_maximum_signature_line_length = 60
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for autodoc_typehints --------------------------------------------
|
||||||
|
# always_use_bars_union = True # always on for Python 3.14+
|
||||||
|
# typehints_defaults = "braces-after" # render defaults in param block
|
||||||
|
# typehints_use_signature = False # False is default; enable if wanted in sig
|
||||||
|
# always_document_param_types = True # show types even when not in docstring
|
||||||
|
|
||||||
|
|
||||||
# -- Options for HTML output --------------------------------------------------
|
# -- Options for HTML output --------------------------------------------------
|
||||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||||
|
|
||||||
html_theme = "furo"
|
html_theme = "furo" # "pydata_sphinx_theme"
|
||||||
html_static_path = ["_static"]
|
html_static_path = ["_static"]
|
||||||
|
|
||||||
# html_sidebars = {
|
# html_sidebars = {
|
||||||
|
|||||||
16
doc/index.md
16
doc/index.md
@@ -1,29 +1,37 @@
|
|||||||
# `trainlib` package docs
|
# `trainlib` package docs
|
||||||
|
|
||||||
{ref}`genindex`
|
{ref}`genindex`
|
||||||
{ref}`modindex`
|
{ref}`modindex`
|
||||||
{ref}`search`
|
|
||||||
|
|
||||||
```{eval-rst}
|
```{eval-rst}
|
||||||
.. autosummary::
|
.. autosummary::
|
||||||
:nosignatures:
|
:nosignatures:
|
||||||
|
:recursive:
|
||||||
|
:caption: Modules
|
||||||
|
|
||||||
# list modules here for quick links
|
trainlib.dataset
|
||||||
|
trainlib.domain
|
||||||
|
trainlib.estimator
|
||||||
|
trainlib.trainer
|
||||||
|
trainlib.transform
|
||||||
```
|
```
|
||||||
|
|
||||||
```{toctree}
|
```{toctree}
|
||||||
:maxdepth: 3
|
:maxdepth: 3
|
||||||
:caption: Autoref
|
:caption: Autoref
|
||||||
|
:hidden:
|
||||||
|
|
||||||
_autoref/index.rst
|
_autoref/trainlib.rst
|
||||||
```
|
```
|
||||||
|
|
||||||
```{toctree}
|
```{toctree}
|
||||||
:maxdepth: 3
|
:maxdepth: 3
|
||||||
:caption: Contents
|
:caption: Contents
|
||||||
|
:hidden:
|
||||||
|
|
||||||
reference/documentation/index
|
reference/documentation/index
|
||||||
reference/site/index
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```{include} ../README.md
|
```{include} ../README.md
|
||||||
|
:heading-offset: 1
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "trainlib"
|
name = "trainlib"
|
||||||
version = "0.1.1"
|
version = "0.1.2"
|
||||||
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
authors = [
|
authors = [
|
||||||
@@ -41,6 +41,7 @@ dev = [
|
|||||||
]
|
]
|
||||||
doc = [
|
doc = [
|
||||||
"furo",
|
"furo",
|
||||||
|
# "pydata-sphinx-theme",
|
||||||
"myst-parser",
|
"myst-parser",
|
||||||
"sphinx",
|
"sphinx",
|
||||||
"sphinx-togglebutton",
|
"sphinx-togglebutton",
|
||||||
|
|||||||
@@ -1,129 +1,62 @@
|
|||||||
"""
|
"""
|
||||||
.. admonition:: Marginalizing out the modality layer
|
Domain-generic dataset base with attribute-based splitting and balancing
|
||||||
|
|
||||||
With ``domain`` being an instance variable, one possible interpretation of
|
**Marginalizing out the modality layer**
|
||||||
the object structures here is that one could completely abstract away
|
|
||||||
the domain model, defining only item structures and processing data. You
|
|
||||||
could have a single dataset definition for a particular concrete dataset,
|
|
||||||
and so long as we're talking about the same items, it can be instantiated
|
|
||||||
using *any domain*. You wouldn't need specific subclasses for disk or
|
|
||||||
network or in-memory structures; you can tell it directly at runtime.
|
|
||||||
|
|
||||||
That's an eventually possibility, anyway. As it stands, however, this is
|
With ``domain`` being an instance variable, one possible interpretation of
|
||||||
effectively impossible:
|
the object structures here is that one could completely abstract away
|
||||||
|
the domain model, defining only item structures and processing data. You
|
||||||
|
could have a single dataset definition for a particular concrete dataset,
|
||||||
|
and so long as we're talking about the same items, it can be instantiated
|
||||||
|
using *any domain*. You wouldn't need specific subclasses for disk or
|
||||||
|
network or in-memory structures; you can tell it directly at runtime.
|
||||||
|
|
||||||
You can't easily abstract the batch-to-item splitting process, i.e.,
|
That's an eventual possibility, anyway. As it stands, however, this is
|
||||||
``_process_batch_data()``. A list-based version of the dataset you're
|
effectively impossible:
|
||||||
trying to define might have an individual item tuple at every index,
|
|
||||||
whereas a disk-based version might have tuples batched across a few files.
|
|
||||||
This can't reliably be inferred, nor can it be pushed to the
|
|
||||||
``Domain``-level without needing equal levels of specialization (you'd just
|
|
||||||
end up needing the exact same structural distinctions in the ``Domain``
|
|
||||||
hierarchy). So *somewhere* you need a batch splitting implementation that
|
|
||||||
is both item structure-dependent *and* domain-dependent...the question is
|
|
||||||
how dynamic you're willing to be about where it comes from. Right now, we
|
|
||||||
require this actually be defined in the ``_process_batch_data()`` method,
|
|
||||||
meaning you'll need a specific ``Dataset`` class for each domain you want
|
|
||||||
to support (e.g., ``MNISTDisk``, ``MNISTList``, ``MNISTNetwork``, etc), or
|
|
||||||
at least for each domain where "interpreting" a batch could possibly
|
|
||||||
differ. This is a case where the interface is all that enforces a
|
|
||||||
distinction: if you've got two domains that can be counted on to yield
|
|
||||||
batches in the exact same way and can use the same processing, then you
|
|
||||||
could feasibly provide ``Domain`` objects from either at runtime and have
|
|
||||||
no issues. We're "structurally blind" to any differentiation beyond the URI
|
|
||||||
and resource types by design, so two different domain implementations with
|
|
||||||
the same type signature ``Domain[U, R]`` should be expected to work fine at
|
|
||||||
runtime (again, so long as they don't also need different batch
|
|
||||||
processing), but that's not affording us much flexibility, i.e., most of
|
|
||||||
the time we'll still be defining new dataset classes for each domain.
|
|
||||||
|
|
||||||
I initially flagged this as feasible, however, because one could imagine
|
You can't easily abstract the batch-to-item splitting process, i.e.,
|
||||||
accepting a batch processing method upon instantiation rather than
|
``_process_batch_data()``. A list-based version of the dataset you're trying to
|
||||||
structurally bolting it into the ``Dataset`` definition. This would require
|
define might have an individual item tuple at every index, whereas a disk-based
|
||||||
knowledge of the item structure ``I`` as well as the ``Domain[U, R]``, so
|
version might have tuples batched across a few files. This can't reliably be
|
||||||
such a function will always have to be ``(I, U, R)``-dependent. It
|
inferred, nor can it be pushed to the ``Domain``-level without needing equal
|
||||||
nevertheless would take out some of the pain of having to define new
|
levels of specialization (you'd just end up needing the exact same structural
|
||||||
dataset classes; instead, you'd just need to define the batch processing
|
distinctions in the ``Domain`` hierarchy). So *somewhere* you need a batch
|
||||||
method. I see this as a worse alternative to just defining *inside* a safe
|
splitting implementation that is both item structure-dependent *and*
|
||||||
context like a new dataset class: you know the types you have to respect,
|
domain-dependent...the question is how dynamic you're willing to be about where
|
||||||
and you stick that method exactly in a context where it's understood.
|
it comes from. Right now, we require this actually be defined in the
|
||||||
Freeing this up doesn't lighten the burden of processing logic, it just
|
``_process_batch_data()`` method, meaning you'll need a specific ``Dataset``
|
||||||
changes *when* it has to be provided, and that's not worth much (to me) in
|
class for each domain you want to support (e.g., ``MNISTDisk``, ``MNISTList``,
|
||||||
this case given the bump in complexity. (Taking this to the extreme: you
|
``MNISTNetwork``, etc), or at least for each domain where "interpreting" a
|
||||||
could supply *all* of an object's methods "dynamically" and glue them
|
batch could possibly differ. This is a case where the interface is all that
|
||||||
together at runtime so long as they all played nice. But wherever you were
|
enforces a distinction: if you've got two domains that can be counted on to
|
||||||
"laying them out" beforehand is exactly the job of a class to begin with,
|
yield batches in the exact same way and can use the same processing, then you
|
||||||
so you don't end up with anything more dynamic. All we're really discussing
|
could feasibly provide ``Domain`` objects from either at runtime and have no
|
||||||
here is pushing around unavoidable complexity inside and outside of the
|
issues. We're "structurally blind" to any differentiation beyond the URI and
|
||||||
"class walls," and in the particular case of ``_process_batch_data()``, it
|
resource types by design, so two different domain implementations with the same
|
||||||
feels much better when it's on the inside.)
|
type signature ``Domain[U, R]`` should be expected to work fine at runtime
|
||||||
|
(again, so long as they don't also need different batch processing), but that's
|
||||||
|
not affording us much flexibility, i.e., most of the time we'll still be
|
||||||
|
defining new dataset classes for each domain.
|
||||||
|
|
||||||
.. admonition:: Holding area
|
I initially flagged this as feasible, however, because one could imagine
|
||||||
|
accepting a batch processing method upon instantiation rather than structurally
|
||||||
.. code-block:: python
|
bolting it into the ``Dataset`` definition. This would require knowledge of the
|
||||||
|
item structure ``I`` as well as the ``Domain[U, R]``, so such a function will
|
||||||
@abstractmethod
|
always have to be ``(I, U, R)``-dependent. It nevertheless would take out some
|
||||||
def _get_uri_groups(self) -> Iterable[tuple[U, ...]]:
|
of the pain of having to define new dataset classes; instead, you'd just need
|
||||||
Get URI groups for each batch.
|
to define the batch processing method. I see this as a worse alternative to
|
||||||
|
just defining *inside* a safe context like a new dataset class: you know the
|
||||||
If there's more than one URI per batch (e.g., a data file and a
|
types you have to respect, and you stick that method exactly in a context where
|
||||||
metadata file), zip the URIs such that we have a tuple of URIs per
|
it's understood. Freeing this up doesn't lighten the burden of processing
|
||||||
batch.
|
logic, it just changes *when* it has to be provided, and that's not worth much
|
||||||
|
(to me) in this case given the bump in complexity. (Taking this to the extreme:
|
||||||
Note that this effectively defines the index style over batches in
|
you could supply *all* of an object's methods "dynamically" and glue them
|
||||||
the attached domain. We get an ``int -> tuple[U, ...]`` map that
|
together at runtime so long as they all played nice. But wherever you were
|
||||||
turns batch indices into URIs that can be read under the domain.
|
"laying them out" beforehand is exactly the job of a class to begin with, so
|
||||||
``get_batch()`` turns an integer index into its corresponding
|
you don't end up with anything more dynamic. All we're really discussing here
|
||||||
``tuple[U, ...]``, reading the resources with ``_read_resources()``
|
is pushing around unavoidable complexity inside and outside of the "class
|
||||||
in the tuple, treating them as providers of batched data.
|
walls," and in the particular case of ``_process_batch_data()``, it feels much
|
||||||
``_read_resources()`` passes through to the attached domain logic,
|
better when it's on the inside.)
|
||||||
which, although common, need not supply an explicit iterable of
|
|
||||||
batch items: we just access items with ``__getitem__()`` and may
|
|
||||||
ask for ``__len__``. So the returned URI group collection (this
|
|
||||||
method) does need to be iterable to measure the number of batches,
|
|
||||||
but the batch objects that are ultimately produced by these URI
|
|
||||||
groups need not be iterables themselves.
|
|
||||||
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def _read_resources(
|
|
||||||
self,
|
|
||||||
uri_group: tuple[U, ...],
|
|
||||||
batch_index: int
|
|
||||||
) -> tuple[R, ...]:
|
|
||||||
Read batch files at the provided paths.
|
|
||||||
|
|
||||||
This method should operate on a single tuple from the list of batch
|
|
||||||
tuples returned by the ``_get_uri_groups()`` method. That is, it
|
|
||||||
reads all of the resources for a single batch and returns a tuple
|
|
||||||
of the same size with their contents.
|
|
||||||
|
|
||||||
Note: the dependence on a batch index is mostly here to make
|
|
||||||
multi-dataset composition easier later. In-dataset, you don't need
|
|
||||||
to know the batch index to simply process URIs, but across
|
|
||||||
datasets you need it to find out the origin of the batch (and
|
|
||||||
process those URIs accordingly).
|
|
||||||
|
|
||||||
return tuple(self.domain.read(uri) for uri in uri_group)
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
# pulling the type variable out of the inline generic b/c `ty` has
|
|
||||||
# trouble understanding bound type variables in subclasses
|
|
||||||
# (specifically with Self@)
|
|
||||||
|
|
||||||
T = TypeVar("T", bound=NamedTuple)
|
|
||||||
|
|
||||||
|
|
||||||
class NamedTupleDataset[I](Dataset):
|
|
||||||
def __init__(self, data_list: list[I]) -> None:
|
|
||||||
self.data_list = data_list
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
return len(self.data_list)
|
|
||||||
|
|
||||||
def __getitem__(self, index: int) -> I:
|
|
||||||
return self.data_list[index]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
@@ -164,74 +97,75 @@ class BatchedDataset[U, R, I](Dataset):
|
|||||||
which are used to concretize a domain ``Domain[U, R]``), and an item type
|
which are used to concretize a domain ``Domain[U, R]``), and an item type
|
||||||
``I``.
|
``I``.
|
||||||
|
|
||||||
.. admonition:: Batch and item processing flow
|
**Batch and item processing flow**
|
||||||
|
|
||||||
.. code-block:: text
|
.. code-block:: text
|
||||||
|
|
||||||
Domain -> [U] :: self._batch_uris = list(domain)
|
Domain -> [U] :: self._batch_uris = list(domain)
|
||||||
|
|
||||||
Grab all URIs from Domain iterators. This is made concrete
|
Grab all URIs from Domain iterators. This is made concrete early to
|
||||||
early to allow for Dataset sizing, and we need a Sequence
|
allow for Dataset sizing, and we need a Sequence representation to
|
||||||
representation to map integer batch indices into Domains, i.e.,
|
map integer batch indices into Domains, i.e., when getting the
|
||||||
when getting the corresponding URI:
|
corresponding URI:
|
||||||
|
|
||||||
batch_uri = self._batch_uris[batch_index]
|
batch_uri = self._batch_uris[batch_index]
|
||||||
|
|
||||||
We let Domains implement iterators over their URIs, but
|
We let Domains implement iterators over their URIs, but explicitly
|
||||||
explicitly exhaust when initializing Datasets.
|
exhaust when initializing Datasets.
|
||||||
|
|
||||||
U -> R :: batch_data = self.domain[batch_uri]
|
U -> R :: batch_data = self.domain[batch_uri]
|
||||||
|
|
||||||
Retrieve resource from domain. Resources are viewed as batched
|
Retrieve resource from domain. Resources are viewed as batched
|
||||||
data, even if only wrapping single items (happens in trivial
|
data, even if only wrapping single items (happens in trivial
|
||||||
settings).
|
settings).
|
||||||
|
|
||||||
R -> [I] :: self._process_batch_data(batch_data, batch_index)
|
R -> [I] :: self._process_batch_data(batch_data, batch_index)
|
||||||
|
|
||||||
Possibly domain-specific batch processing of resource data into
|
Possibly domain-specific batch processing of resource data into
|
||||||
explicit Sequence-like structures of items, each of which is
|
explicit Sequence-like structures of items, each of which is
|
||||||
subject to the provided pre_transform. Processed batches at
|
subject to the provided pre_transform. Processed batches at this
|
||||||
this stage are cached (if enabled).
|
stage are cached (if enabled).
|
||||||
|
|
||||||
[I] -> I :: self.get_batch(batch_index)[index_in_batch]
|
[I] -> I :: self.get_batch(batch_index)[index_in_batch]
|
||||||
|
|
||||||
Select individual items from batches in _get_item. At this
|
Select individual items from batches in _get_item. At this stage,
|
||||||
stage, items are in intermediate states and pulled from the
|
items are in intermediate states and pulled from the cached
|
||||||
cached batches.
|
batches.
|
||||||
|
|
||||||
I -> I :: self._process_item_data(item_data, index)
|
I -> I :: self._process_item_data(item_data, index)
|
||||||
|
|
||||||
Produce final items with __getitem__, getting intermediate
|
Produce final items with __getitem__, getting intermediate items
|
||||||
items via _get_item and applying the provided post_transform.
|
via _get_item and applying the provided post_transform.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
Note^1: as far as positioning, this class is meant to play nice with
|
|
||||||
PyTorch DataLoaders, hence the inheritance from ``torch.Dataset``. The
|
As far as positioning, this class is meant to play nice with PyTorch
|
||||||
value add for this over the ``torch.Dataset`` base is almost entirely
|
DataLoaders, hence the inheritance from ``torch.Dataset``. The value
|
||||||
in the logic it implements to map out of *batched resources* that are
|
add for this over the ``torch.Dataset`` base is almost entirely in the
|
||||||
holding data, and flattening it out into typical dataset items. There
|
logic it implements to map out of *batched resources* that are holding
|
||||||
are also some QoL features when it comes to splitting and balancing
|
data, and flattening it out into typical dataset items. There are also
|
||||||
samples.
|
some QoL features when it comes to splitting and balancing samples.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Even though ``Domains`` implement iterators over their URIs, this
|
Even though ``Domains`` implement iterators over their URIs, this
|
||||||
doesn't imply a ``BatchedDataset`` is iterable. This just means we
|
doesn't imply a ``BatchedDataset`` is iterable. This just means we can
|
||||||
can walk over the resources that provide data, but we don't
|
walk over the resources that provide data, but we don't necessarily
|
||||||
necessarily presuppose an ordered walk over samples within batches.
|
presuppose an ordered walk over samples within batches. Point being:
|
||||||
Point being: ``torch.Dataset``, not ``torch.IterableDataset``, is
|
``torch.Dataset``, not ``torch.IterableDataset``, is the appropriate
|
||||||
the appropriate superclass, even when we're working around iterable
|
superclass, even when we're working around iterable ``Domains``.
|
||||||
``Domains``.
|
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Transforms are expected to operate on ``I``-items and produce
|
Transforms are expected to operate on ``I``-items and produce
|
||||||
``I``-items. They shouldn't be the "introducers" of ``I`` types
|
``I``-items. They shouldn't be the "introducers" of ``I`` types from
|
||||||
from some other intermediate representation, nor should they map
|
some other intermediate representation, nor should they map from ``I``
|
||||||
from ``I`` to something else. Point being: the dataset definition
|
to something else. Point being: the dataset definition should be able
|
||||||
should be able to map resources ``R`` to ``I`` without a transform:
|
to map resources ``R`` to ``I`` without a transform: that much should
|
||||||
that much should be baked into the class definition. If you find
|
be baked into the class definition. If you find you're expecting the
|
||||||
you're expecting the transform to do that for you, you should
|
transform to do that for you, you should consider pulling in some
|
||||||
consider pulling in some common structure across the allowed
|
common structure across the allowed transforms and making it a fixed part
|
||||||
transforms and make it a fixed part of the class.
|
of the class.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -397,10 +331,10 @@ class BatchedDataset[U, R, I](Dataset):
|
|||||||
they're always connected, and nothing would notice if you waited
|
they're always connected, and nothing would notice if you waited
|
||||||
between steps. The only way this could matter is if you split the
|
between steps. The only way this could matter is if you split the
|
||||||
resource reading and batch processing steps across methods, but when it
|
resource reading and batch processing steps across methods, but when it
|
||||||
actually comes to accessing/caching the batch, you'd have to expand
|
actually comes to accessing/caching the batch, you'd have to expand any
|
||||||
any delayed reads here. There's no way around needing to see all batch
|
delayed reads here. There's no way around needing to see all batch data
|
||||||
data at once here, and we don't want to make that ambiguous: ``list``
|
at once here, and we don't want to make that ambiguous: ``list`` output
|
||||||
output type it is.
|
type it is.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
batch_index: index of batch
|
batch_index: index of batch
|
||||||
@@ -458,27 +392,31 @@ class BatchedDataset[U, R, I](Dataset):
|
|||||||
"""
|
"""
|
||||||
Split dataset into fractional pieces by data attribute.
|
Split dataset into fractional pieces by data attribute.
|
||||||
|
|
||||||
If `by_attr` is None, recovers typical fractional splitting of dataset
|
If ``by_attr`` is None, recovers typical fractional splitting of
|
||||||
items, partitioning by size. Using None anywhere will index each item
|
dataset items, partitioning by size. Using None anywhere will index
|
||||||
into its own bucket, i.e., by its index. For instance,
|
each item into its own bucket, i.e., by its index. For instance:
|
||||||
|
|
||||||
- by_attr=["color"] -> {("red", 1), ("red", 2)},
|
- Splits on the attribute such that each subset contains entire strata
|
||||||
{("blue", 1), ("blue", 2)}
|
of the attribute. "Homogeneity within clusters:"
|
||||||
|
|
||||||
Splits on the attribute such that each subset contains entire strata
|
.. code-block::
|
||||||
of the attribute. "Homogeneity within clusters"
|
|
||||||
|
|
||||||
- `by_attr=["color", None]` -> {("red", 1), ("blue", 1)},
|
by_attr=["color"] -> {("red", 1), ("red", 2)},
|
||||||
{("red", 2), ("blue", 2)}
|
{("blue", 1), ("blue", 2)}
|
||||||
|
|
||||||
Stratifies by attribute and then splits "by index" within, uniformly
|
- Stratifies by attribute and then splits "by index" within, uniformly
|
||||||
grabbing samples across strata to form new clusters. "Homogeneity
|
grabbing samples across strata to form new clusters. "Homogeneity
|
||||||
across clusters"
|
across clusters"
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
by_attr=["color", None] -> {("red", 1), ("blue", 1)},
|
||||||
|
{("red", 2), ("blue", 2)}
|
||||||
|
|
||||||
Note that the final list of Subsets returned are built from shallow
|
Note that the final list of Subsets returned is built from shallow
|
||||||
copies of the underlying dataset (i.e., `self`) to allow manual
|
copies of the underlying dataset (i.e., ``self``) to allow manual
|
||||||
intervention with dataset attributes (e.g., setting the splits to have
|
intervention with dataset attributes (e.g., setting the splits to have
|
||||||
different `transform`s). This is subject to possibly unexpected
|
different ``transforms``). This is subject to possibly unexpected
|
||||||
behavior if re-caching data or you need a true copy of all data in
|
behavior if re-caching data or you need a true copy of all data in
|
||||||
memory, but should otherwise leave most interactions unchanged.
|
memory, but should otherwise leave most interactions unchanged.
|
||||||
|
|
||||||
@@ -645,6 +583,8 @@ class BatchedDataset[U, R, I](Dataset):
|
|||||||
shuffle_strata: bool = True,
|
shuffle_strata: bool = True,
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
"""
|
"""
|
||||||
|
Recursive balancing of items by attribute.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Behavior is a little odd for nested behavior; not exactly perfectly
|
Behavior is a little odd for nested behavior; not exactly perfectly
|
||||||
@@ -813,9 +753,10 @@ class CompositeBatchedDataset[U, R, I](BatchedDataset[U, R, I]):
|
|||||||
"""
|
"""
|
||||||
Dataset class for wrapping individual datasets.
|
Dataset class for wrapping individual datasets.
|
||||||
|
|
||||||
Note: because this remains a valid ``BatchedDataset``, we re-thread the
|
.. note::
|
||||||
generic type variables through the set of composed datasets. That is, they
|
Because this remains a valid ``BatchedDataset``, we re-thread the
|
||||||
must have a common domain type ``Domain[U, R]``.
|
generic type variables through the set of composed datasets. That is,
|
||||||
|
they must have a common domain type ``Domain[U, R]``.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|||||||
@@ -12,39 +12,38 @@ class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
|
|||||||
"""
|
"""
|
||||||
The following line is to satisfy the type checker, which
|
The following line is to satisfy the type checker, which
|
||||||
|
|
||||||
1. Can't recognize an appropriately re-typed constructor arg like
|
1. Can't recognize an appropriately re-typed constructor arg like::
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
domain: DiskDomain,
|
|
||||||
...
|
|
||||||
): ...
|
|
||||||
|
|
||||||
This *does* match the parent generic for the U=Path, R=bytes context
|
def __init__(
|
||||||
|
self,
|
||||||
|
domain: DiskDomain,
|
||||||
|
...
|
||||||
|
): ...
|
||||||
|
|
||||||
def __init__(
|
This *does* match the parent generic for the ``U=Path``, ``R=bytes``
|
||||||
self,
|
context::
|
||||||
domain: Domain[U, R],
|
|
||||||
...
|
def __init__(
|
||||||
): ...
|
self,
|
||||||
|
domain: Domain[U, R],
|
||||||
|
...
|
||||||
|
): ...
|
||||||
|
|
||||||
but the type checker doesn't see this.
|
but the type checker doesn't see this.
|
||||||
|
|
||||||
2. "Lifted" type variables out of generics can't be used as upper bounds,
|
2. "Lifted" type variables out of generics can't be used as upper bounds,
|
||||||
at least not without throwing type checker warnings (thanks to PEP 695).
|
at least not without throwing type checker warnings (thanks to PEP 695).
|
||||||
So I'm not allowed to have
|
So I'm not allowed to have::
|
||||||
|
|
||||||
```
|
class BatchedDataset[U, R, D: Domain[U, R]]:
|
||||||
class BatchedDataset[U, R, D: Domain[U, R]]:
|
...
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
which could bring appropriately dynamic typing for ``Domain``s, but is
|
which could bring appropriately dynamic typing for ``Domains``, but is
|
||||||
not a sufficiently concrete upper bound.
|
not a sufficiently concrete upper bound.
|
||||||
|
|
||||||
So: we settle for a class-level type declaration, which despite not being
|
So: we settle for a class-level type declaration, which despite not being
|
||||||
technically appropriately scoped, isn't harming anything and satisfies
|
technically appropriately scoped, isn't harming anything and satisfies
|
||||||
``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``.
|
``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
domain: DiskDomain
|
domain: DiskDomain
|
||||||
|
|||||||
@@ -1,24 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Defines a knowledge domain. Wraps a Dataset / Simulator / Knowledge
|
Generic URI-resource mapping structure
|
||||||
|
|
||||||
Downstream exploration might include
|
|
||||||
|
|
||||||
- Calibrating Simulator / Knowledge with a Dataset
|
|
||||||
- Amending Dataset with Simulator / Knowledge
|
|
||||||
- Positioning Knowledge within Simulator context
|
|
||||||
* Where to replace Simulator subsystem with Knowledge?
|
|
||||||
|
|
||||||
Other variations:
|
|
||||||
|
|
||||||
- Multi-fidelity simulators
|
|
||||||
- Multi-scale models
|
|
||||||
- Multi-system
|
|
||||||
- Incomplete knowledge / divergence among sources
|
|
||||||
|
|
||||||
Questions:
|
|
||||||
|
|
||||||
- Should Simulator / Knowledge be unified as one (e.g., "Expert")
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections.abc import Mapping, Iterator, Sequence
|
from collections.abc import Mapping, Iterator, Sequence
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
|
Base class for trainable models
|
||||||
|
|
||||||
Development note
|
Development note
|
||||||
|
|
||||||
I'd rather lay out bare args and kwargs in the estimator methods, but the
|
I'd rather lay out bare args and kwargs in the estimator methods, but the
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
"""
|
||||||
|
Core interface for training ``Estimators`` with ``Datasets``
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
"""
|
||||||
|
Transform base for dataset records
|
||||||
|
"""
|
||||||
|
|
||||||
class Transform[I]:
|
class Transform[I]:
|
||||||
"""
|
"""
|
||||||
Dataset transform base class.
|
Dataset transform base class.
|
||||||
|
|||||||
Reference in New Issue
Block a user