update module docs and sphinx config
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
# Overview
|
||||
Package summary goes here, ideally with a diagram
|
||||
Minimal framework for ML modeling, supporting advanced dataset operations and
|
||||
streamlined training workflows.
|
||||
|
||||
# Install
|
||||
The `trainlib` package can be installed from PyPI:
|
||||
@@ -85,7 +86,7 @@ pip install trainlib
|
||||
class SequenceDataset[I, **P](HomogenousDataset[int, I, I, P]):
|
||||
...
|
||||
|
||||
class TupleDataset[I](SequenceDataset[tuple[I, ...], ??]):
|
||||
class TupleDataset[I](SequenceDataset[tuple[I, ...], "?"]):
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
68
doc/conf.py
68
doc/conf.py
@@ -3,6 +3,31 @@
|
||||
# For the full list of built-in configuration values, see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
|
||||
# -- Styling: type hints ------------------------------------------------------
|
||||
# There are several possible style combinations for rendering types, none of
|
||||
# which are optimal in my view. The main switches are:
|
||||
#
|
||||
# - Parameter type hints in the signature vs in the separate parameter list
|
||||
# - Show type hints as plaintext vs rendered HTML elements
|
||||
#
|
||||
# The `sphinx_autodoc_typehints` extension enables more context-aware
|
||||
# rendering, but it's often way too explicit (e.g., unwrapping type variables)
|
||||
# and makes things difficult to read. It does, however, allow for automatic
|
||||
# inclusion of default values, which is nice.
|
||||
#
|
||||
# I'd like type hints to be rendered in an inline code element, but that
|
||||
# doesn't happen by default in either case unless you render them in the
|
||||
# signature. This is sloppy, however, often just a jumbled mess of parameter
|
||||
# names and types. The current preferred option is to just use the native
|
||||
# `autodoc` settings for rendering type hints, leaving them out of the
|
||||
# signature (for easy heading readability). Type hints in the parameter list
|
||||
# are also as short as possible, not rendered crazily (by default this is in
|
||||
# italics; not my favorite but it's what we have). No
|
||||
# `sphinx_autodoc_typehints` needed at this point; you can toggle it if you
|
||||
# want automatic default values or different formatting for type hints.
|
||||
|
||||
|
||||
# -- Project information ------------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||
|
||||
@@ -10,22 +35,32 @@ project = "trainlib"
|
||||
copyright = "2026, Sam Griesemer"
|
||||
author = "Sam Griesemer"
|
||||
|
||||
|
||||
# -- General configuration ----------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
|
||||
# enables a directive to be specified manually that gathers module/object
|
||||
# summary details in a table
|
||||
"sphinx.ext.autosummary",
|
||||
|
||||
# allow viewing source in the HTML pages
|
||||
"sphinx.ext.viewcode",
|
||||
|
||||
# only really applies to manual docs; docstrings still need RST-like
|
||||
"myst_parser",
|
||||
|
||||
# enables Google-style docstring formats
|
||||
"sphinx.ext.napoleon",
|
||||
# external extension that allows arg types to be inferred by type hints
|
||||
"sphinx_autodoc_typehints",
|
||||
|
||||
# external extension that allows arg types to be inferred by type hints;
|
||||
# without this, type hints show up inside method signatures as plaintext,
|
||||
# but when enabled they are pulled into the parameter/description block and
|
||||
# rendered as native nested markup. What's best for a given package may
|
||||
# vary.
|
||||
# "sphinx_autodoc_typehints",
|
||||
]
|
||||
autosummary_generate = True
|
||||
autosummary_imported_members = True
|
||||
@@ -39,10 +74,37 @@ templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
|
||||
# -- Options for autodoc ------------------------------------------------------
|
||||
# class signatures show up only in __init__ rather than at the class header;
|
||||
# generally cleaner, avoids redundancy
|
||||
autodoc_class_signature = "separated"
|
||||
|
||||
# if `sphinx_autodoc_typehints` extension is enabled, this is redundant: type
|
||||
# hints are rendered natively and already show up in the parameter block. If
|
||||
# it's disabled, this setting will do the same job of moving the types to the
|
||||
# parameter block, but it renders them in plaintext (with links to in-package
|
||||
# type refs).
|
||||
autodoc_typehints = "description" # "signature"
|
||||
autodoc_typehints_format = "short"
|
||||
autodoc_preserve_defaults = True
|
||||
autodoc_use_type_comments = False
|
||||
python_use_unqualified_type_names = True
|
||||
|
||||
# push parameters to their own lines in the signature block
|
||||
# python_maximum_signature_line_length = 60
|
||||
|
||||
|
||||
# -- Options for autodoc_typehints --------------------------------------------
|
||||
# always_use_bars_union = True # always on for Python 3.14+
|
||||
# typehints_defaults = "braces-after" # render defaults in param block
|
||||
# typehints_use_signature = False # False is default; enable if wanted in sig
|
||||
# always_document_param_types = True # show types even when not in docstring
|
||||
|
||||
|
||||
# -- Options for HTML output --------------------------------------------------
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||
|
||||
html_theme = "furo"
|
||||
html_theme = "furo" # "pydata_sphinx_theme"
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# html_sidebars = {
|
||||
|
||||
16
doc/index.md
16
doc/index.md
@@ -1,29 +1,37 @@
|
||||
# `trainlib` package docs
|
||||
|
||||
{ref}`genindex`
|
||||
{ref}`modindex`
|
||||
{ref}`search`
|
||||
|
||||
```{eval-rst}
|
||||
.. autosummary::
|
||||
:nosignatures:
|
||||
:recursive:
|
||||
:caption: Modules
|
||||
|
||||
# list modules here for quick links
|
||||
trainlib.dataset
|
||||
trainlib.domain
|
||||
trainlib.estimator
|
||||
trainlib.trainer
|
||||
trainlib.transform
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 3
|
||||
:caption: Autoref
|
||||
:hidden:
|
||||
|
||||
_autoref/index.rst
|
||||
_autoref/trainlib.rst
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 3
|
||||
:caption: Contents
|
||||
:hidden:
|
||||
|
||||
reference/documentation/index
|
||||
reference/site/index
|
||||
```
|
||||
|
||||
```{include} ../README.md
|
||||
:heading-offset: 1
|
||||
```
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "trainlib"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
||||
requires-python = ">=3.13"
|
||||
authors = [
|
||||
@@ -41,6 +41,7 @@ dev = [
|
||||
]
|
||||
doc = [
|
||||
"furo",
|
||||
# "pydata-sphinx-theme",
|
||||
"myst-parser",
|
||||
"sphinx",
|
||||
"sphinx-togglebutton",
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
"""
|
||||
.. admonition:: Marginalizing out the modality layer
|
||||
Domain-generic dataset base with attribute-based splitting and balancing
|
||||
|
||||
**Marginalizing out the modality layer**
|
||||
|
||||
With ``domain`` being an instance variable, one possible interpretation of
|
||||
the object structures here is that one could completely abstract away
|
||||
@@ -13,117 +15,48 @@
|
||||
effectively impossible:
|
||||
|
||||
You can't easily abstract the batch-to-item splitting process, i.e.,
|
||||
``_process_batch_data()``. A list-based version of the dataset you're
|
||||
trying to define might have an individual item tuple at every index,
|
||||
whereas a disk-based version might have tuples batched across a few files.
|
||||
This can't reliably be inferred, nor can it be pushed to the
|
||||
``Domain``-level without needing equal levels of specialization (you'd just
|
||||
end up needing the exact same structural distinctions in the ``Domain``
|
||||
hierarchy). So *somewhere* you need a batch splitting implementation that
|
||||
is both item structure-dependent *and* domain-dependent...the question is
|
||||
how dynamic you're willing to be about where it comes from. Right now, we
|
||||
require this actually be defined in the ``_process_batch_data()`` method,
|
||||
meaning you'll need a specific ``Dataset`` class for each domain you want
|
||||
to support (e.g., ``MNISTDisk``, ``MNISTList``, ``MNISTNetwork``, etc), or
|
||||
at least for each domain where "interpreting" a batch could possibly
|
||||
differ. This is a case where the interface is all that enforces a
|
||||
distinction: if you've got two domains that can be counted on to yield
|
||||
batches in the exact same way and can use the same processing, then you
|
||||
could feasibly provide ``Domain`` objects from either at runtime and have
|
||||
no issues. We're "structurally blind" to any differentiation beyond the URI
|
||||
and resource types by design, so two different domain implementations with
|
||||
the same type signature ``Domain[U, R]`` should be expected to work fine at
|
||||
runtime (again, so long as they don't also need different batch
|
||||
processing), but that's not affording us much flexibility, i.e., most of
|
||||
the time we'll still be defining new dataset classes for each domain.
|
||||
``_process_batch_data()``. A list-based version of the dataset you're trying to
|
||||
define might have an individual item tuple at every index, whereas a disk-based
|
||||
version might have tuples batched across a few files. This can't reliably be
|
||||
inferred, nor can it be pushed to the ``Domain``-level without needing equal
|
||||
levels of specialization (you'd just end up needing the exact same structural
|
||||
distinctions in the ``Domain`` hierarchy). So *somewhere* you need a batch
|
||||
splitting implementation that is both item structure-dependent *and*
|
||||
domain-dependent...the question is how dynamic you're willing to be about where
|
||||
it comes from. Right now, we require this actually be defined in the
|
||||
``_process_batch_data()`` method, meaning you'll need a specific ``Dataset``
|
||||
class for each domain you want to support (e.g., ``MNISTDisk``, ``MNISTList``,
|
||||
``MNISTNetwork``, etc), or at least for each domain where "interpreting" a
|
||||
batch could possibly differ. This is a case where the interface is all that
|
||||
enforces a distinction: if you've got two domains that can be counted on to
|
||||
yield batches in the exact same way and can use the same processing, then you
|
||||
could feasibly provide ``Domain`` objects from either at runtime and have no
|
||||
issues. We're "structurally blind" to any differentiation beyond the URI and
|
||||
resource types by design, so two different domain implementations with the same
|
||||
type signature ``Domain[U, R]`` should be expected to work fine at runtime
|
||||
(again, so long as they don't also need different batch processing), but that's
|
||||
not affording us much flexibility, i.e., most of the time we'll still be
|
||||
defining new dataset classes for each domain.
|
||||
|
||||
I initially flagged this as feasible, however, because one could imagine
|
||||
accepting a batch processing method upon instantiation rather than
|
||||
structurally bolting it into the ``Dataset`` definition. This would require
|
||||
knowledge of the item structure ``I`` as well as the ``Domain[U, R]``, so
|
||||
such a function will always have to be ``(I, U, R)``-dependent. It
|
||||
nevertheless would take out some of the pain of having to define new
|
||||
dataset classes; instead, you'd just need to define the batch processing
|
||||
method. I see this as a worse alternative to just defining *inside* a safe
|
||||
context like a new dataset class: you know the types you have to respect,
|
||||
and you stick that method exactly in a context where it's understood.
|
||||
Freeing this up doesn't lighten the burden of processing logic, it just
|
||||
changes *when* it has to be provided, and that's not worth much (to me) in
|
||||
this case given the bump in complexity. (Taking this to the extreme: you
|
||||
could supply *all* of an object's methods "dynamically" and glue them
|
||||
accepting a batch processing method upon instantiation rather than structurally
|
||||
bolting it into the ``Dataset`` definition. This would require knowledge of the
|
||||
item structure ``I`` as well as the ``Domain[U, R]``, so such a function will
|
||||
always have to be ``(I, U, R)``-dependent. It nevertheless would take out some
|
||||
of the pain of having to define new dataset classes; instead, you'd just need
|
||||
to define the batch processing method. I see this as a worse alternative to
|
||||
just defining *inside* a safe context like a new dataset class: you know the
|
||||
types you have to respect, and you stick that method exactly in a context where
|
||||
it's understood. Freeing this up doesn't lighten the burden of processing
|
||||
logic, it just changes *when* it has to be provided, and that's not worth much
|
||||
(to me) in this case given the bump in complexity. (Taking this to the extreme:
|
||||
you could supply *all* of an object's methods "dynamically" and glue them
|
||||
together at runtime so long as they all played nice. But wherever you were
|
||||
"laying them out" beforehand is exactly the job of a class to begin with,
|
||||
so you don't end up with anything more dynamic. All we're really discussing
|
||||
here is pushing around unavoidable complexity inside and outside of the
|
||||
"class walls," and in the particular case of ``_process_batch_data()``, it
|
||||
feels much better when it's on the inside.)
|
||||
|
||||
.. admonition:: Holding area
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@abstractmethod
|
||||
def _get_uri_groups(self) -> Iterable[tuple[U, ...]]:
|
||||
Get URI groups for each batch.
|
||||
|
||||
If there's more than one URI per batch (e.g., a data file and a
|
||||
metadata file), zip the URIs such that we have a tuple of URIs per
|
||||
batch.
|
||||
|
||||
Note that this effectively defines the index style over batches in
|
||||
the attached domain. We get an ``int -> tuple[U, ...]`` map that
|
||||
turns batch indices into URIs that can be read under the domain.
|
||||
``get_batch()`` turns an integer index into its corresponding
|
||||
``tuple[U, ...]``, reading the resources with ``_read_resources()``
|
||||
in the tuple, treating them as providers of batched data.
|
||||
``_read_resources()`` passes through to the attached domain logic,
|
||||
which, although common, need not supply an explicit iterable of
|
||||
batch items: we just access items with ``__getitem__()`` and may
|
||||
ask for ``__len__``. So the returned URI group collection (this
|
||||
method) does need to be iterable to measure the number of batches,
|
||||
but the batch objects that are ultimately produced by these URI
|
||||
groups need not be iterables themselves.
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def _read_resources(
|
||||
self,
|
||||
uri_group: tuple[U, ...],
|
||||
batch_index: int
|
||||
) -> tuple[R, ...]:
|
||||
Read batch files at the provided paths.
|
||||
|
||||
This method should operate on a single tuple from the list of batch
|
||||
tuples returned by the ``_get_uri_groups()`` method. That is, it
|
||||
reads all of the resources for a single batch and returns a tuple
|
||||
of the same size with their contents.
|
||||
|
||||
Note: the dependence on a batch index is mostly here to make
|
||||
multi-dataset composition easier later. In-dataset, you don't need
|
||||
to know the batch index to simply process URIs, but across
|
||||
datasets you need it to find out the origin of the batch (and
|
||||
process those URIs accordingly).
|
||||
|
||||
return tuple(self.domain.read(uri) for uri in uri_group)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# pulling the type variable out of the inline generic b/c `ty` has
|
||||
# trouble understanding bound type variables in subclasses
|
||||
# (specifically with Self@)
|
||||
|
||||
T = TypeVar("T", bound=NamedTuple)
|
||||
|
||||
|
||||
class NamedTupleDataset[I](Dataset):
|
||||
def __init__(self, data_list: list[I]) -> None:
|
||||
self.data_list = data_list
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.data_list)
|
||||
|
||||
def __getitem__(self, index: int) -> I:
|
||||
return self.data_list[index]
|
||||
"laying them out" beforehand is exactly the job of a class to begin with, so
|
||||
you don't end up with anything more dynamic. All we're really discussing here
|
||||
is pushing around unavoidable complexity inside and outside of the "class
|
||||
walls," and in the particular case of ``_process_batch_data()``, it feels much
|
||||
better when it's on the inside.)
|
||||
"""
|
||||
|
||||
import math
|
||||
@@ -164,21 +97,21 @@ class BatchedDataset[U, R, I](Dataset):
|
||||
which are used to concretize a domain ``Domain[U, R]``), and an item type
|
||||
``I``.
|
||||
|
||||
.. admonition:: Batch and item processing flow
|
||||
**Batch and item processing flow**
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
Domain -> [U] :: self._batch_uris = list(domain)
|
||||
|
||||
Grab all URIs from Domain iterators. This is made concrete
|
||||
early to allow for Dataset sizing, and we need a Sequence
|
||||
representation to map integer batch indices into Domains, i.e.,
|
||||
when getting the corresponding URI:
|
||||
Grab all URIs from Domain iterators. This is made concrete early to
|
||||
allow for Dataset sizing, and we need a Sequence representation to
|
||||
map integer batch indices into Domains, i.e., when getting the
|
||||
corresponding URI:
|
||||
|
||||
batch_uri = self._batch_uris[batch_index]
|
||||
|
||||
We let Domains implement iterators over their URIs, but
|
||||
explicitly exhaust when initializing Datasets.
|
||||
We let Domains implement iterators over their URIs, but explicitly
|
||||
exhaust when initializing Datasets.
|
||||
|
||||
U -> R :: batch_data = self.domain[batch_uri]
|
||||
|
||||
@@ -190,48 +123,49 @@ class BatchedDataset[U, R, I](Dataset):
|
||||
|
||||
Possibly domain-specific batch processing of resource data into
|
||||
explicit Sequence-like structures of items, each of which is
|
||||
subject to the provided pre_transform. Processed batches at
|
||||
this stage are cached (if enabled).
|
||||
subject to the provided pre_transform. Processed batches at this
|
||||
stage are cached (if enabled).
|
||||
|
||||
[I] -> I :: self.get_batch(batch_index)[index_in_batch]
|
||||
|
||||
Select individual items from batches in _get_item. At this
|
||||
stage, items are in intermediate states and pulled from the
|
||||
cached batches.
|
||||
Select individual items from batches in _get_item. At this stage,
|
||||
items are in intermediate states and pulled from the cached
|
||||
batches.
|
||||
|
||||
I -> I :: self._process_item_data(item_data, index)
|
||||
|
||||
Produce final items with __getitem__, getting intermediate
|
||||
items via _get_item and applying the provided post_transform.
|
||||
Produce final items with __getitem__, getting intermediate items
|
||||
via _get_item and applying the provided post_transform.
|
||||
|
||||
.. note::
|
||||
Note^1: as far as positioning, this class is meant to play nice with
|
||||
PyTorch DataLoaders, hence the inheritance from ``torch.Dataset``. The
|
||||
value add for this over the ``torch.Dataset`` base is almost entirely
|
||||
in the logic it implements to map out of *batched resources* that are
|
||||
holding data, and flattening it out into typical dataset items. There
|
||||
are also some QoL features when it comes to splitting and balancing
|
||||
samples.
|
||||
|
||||
As far as positioning, this class is meant to play nice with PyTorch
|
||||
DataLoaders, hence the inheritance from ``torch.Dataset``. The value
|
||||
add for this over the ``torch.Dataset`` base is almost entirely in the
|
||||
logic it implements to map out of *batched resources* that are holding
|
||||
data, and flattening it out into typical dataset items. There are also
|
||||
some QoL features when it comes to splitting and balancing samples.
|
||||
|
||||
.. note::
|
||||
|
||||
Even though ``Domains`` implement iterators over their URIs, this
|
||||
doesn't imply a ``BatchedDataset`` is iterable. This just means we
|
||||
can walk over the resources that provide data, but we don't
|
||||
necessarily presuppose an ordered walk over samples within batches.
|
||||
Point being: ``torch.Dataset``, not ``torch.IterableDataset``, is
|
||||
the appropriate superclass, even when we're working around iterable
|
||||
``Domains``.
|
||||
doesn't imply a ``BatchedDataset`` is iterable. This just means we can
|
||||
walk over the resources that provide data, but we don't necessarily
|
||||
presuppose an ordered walk over samples within batches. Point being:
|
||||
``torch.Dataset``, not ``torch.IterableDataset``, is the appropriate
|
||||
superclass, even when we're working around iterable ``Domains``.
|
||||
|
||||
.. note::
|
||||
|
||||
Transforms are expected to operate on ``I``-items and produce
|
||||
``I``-items. They shouldn't be the "introducers" of ``I`` types
|
||||
from some other intermediate representation, nor should they map
|
||||
from ``I`` to something else. Point being: the dataset definition
|
||||
should be able to map resources ``R`` to ``I`` without a transform:
|
||||
that much should be baked into the class definition. If you find
|
||||
you're expecting the transform to do that for you, you should
|
||||
consider pulling in some common structure across the allowed
|
||||
transforms and make it a fixed part of the class.
|
||||
``I``-items. They shouldn't be the "introducers" of ``I`` types from
|
||||
some other intermediate representation, nor should they map from ``I``
|
||||
to something else. Point being: the dataset definition should be able
|
||||
to map resources ``R`` to ``I`` without a transform: that much should
|
||||
be baked into the class definition. If you find you're expecting the
|
||||
transform to do that for you, you should consider pulling in some
|
||||
common structure across the allowed transforms and make it a fixed part
|
||||
of the class.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -397,10 +331,10 @@ class BatchedDataset[U, R, I](Dataset):
|
||||
they're always connected, and nothing would notice if you waited
|
||||
between steps. The only way this could matter is if you split the
|
||||
resource reading and batch processing steps across methods, but when it
|
||||
actually comes to accessing/caching the batch, you'd have to expand
|
||||
any delayed reads here. There's no way around needing to see all batch
|
||||
data at once here, and we don't want to make that ambiguous: ``list``
|
||||
output type it is.
|
||||
actually comes to accessing/caching the batch, you'd have to expand any
|
||||
delayed reads here. There's no way around needing to see all batch data
|
||||
at once here, and we don't want to make that ambiguous: ``list`` output
|
||||
type it is.
|
||||
|
||||
Parameters:
|
||||
batch_index: index of batch
|
||||
@@ -458,27 +392,31 @@ class BatchedDataset[U, R, I](Dataset):
|
||||
"""
|
||||
Split dataset into fractional pieces by data attribute.
|
||||
|
||||
If `by_attr` is None, recovers typical fractional splitting of dataset
|
||||
items, partitioning by size. Using None anywhere will index each item
|
||||
into its own bucket, i.e., by its index. For instance,
|
||||
If ``by_attr`` is None, recovers typical fractional splitting of
|
||||
dataset items, partitioning by size. Using None anywhere will index
|
||||
each item into its own bucket, i.e., by its index. For instance:
|
||||
|
||||
- by_attr=["color"] -> {("red", 1), ("red", 2)},
|
||||
- Splits on the attribute such that each subset contains entire strata
|
||||
of the attribute. "Homogeneity within clusters:"
|
||||
|
||||
.. code-block::
|
||||
|
||||
by_attr=["color"] -> {("red", 1), ("red", 2)},
|
||||
{("blue", 1), ("blue", 2)}
|
||||
|
||||
Splits on the attribute such that each subset contains entire strata
|
||||
of the attribute. "Homogeneity within clusters"
|
||||
|
||||
- `by_attr=["color", None]` -> {("red", 1), ("blue", 1)},
|
||||
{("red", 2), ("blue", 2)}
|
||||
|
||||
Stratifies by attribute and then splits "by index" within, uniformly
|
||||
- Stratifies by attribute and then splits "by index" within, uniformly
|
||||
grabbing samples across strata to form new clusters. "Homogeneity
|
||||
across clusters"
|
||||
|
||||
.. code-block::
|
||||
|
||||
by_attr=["color", None] -> {("red", 1), ("blue", 1)},
|
||||
{("red", 2), ("blue", 2)}
|
||||
|
||||
Note that the final list of Subsets returned are built from shallow
|
||||
copies of the underlying dataset (i.e., `self`) to allow manual
|
||||
copies of the underlying dataset (i.e., ``self``) to allow manual
|
||||
intervention with dataset attributes (e.g., setting the splits to have
|
||||
different `transform`s). This is subject to possibly unexpected
|
||||
different ``transforms``). This is subject to possibly unexpected
|
||||
behavior if re-caching data or you need a true copy of all data in
|
||||
memory, but should otherwise leave most interactions unchanged.
|
||||
|
||||
@@ -645,6 +583,8 @@ class BatchedDataset[U, R, I](Dataset):
|
||||
shuffle_strata: bool = True,
|
||||
) -> list[int]:
|
||||
"""
|
||||
Recursive balancing of items by attribute.
|
||||
|
||||
.. note::
|
||||
|
||||
Behavior is a little odd for nested behavior; not exactly perfectly
|
||||
@@ -813,9 +753,10 @@ class CompositeBatchedDataset[U, R, I](BatchedDataset[U, R, I]):
|
||||
"""
|
||||
Dataset class for wrapping individual datasets.
|
||||
|
||||
Note: because this remains a valid ``BatchedDataset``, we re-thread the
|
||||
generic type variables through the set of composed datasets. That is, they
|
||||
must have a common domain type ``Domain[U, R]``.
|
||||
.. note::
|
||||
Because this remains a valid ``BatchedDataset``, we re-thread the
|
||||
generic type variables through the set of composed datasets. That is,
|
||||
they must have a common domain type ``Domain[U, R]``.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -12,7 +12,7 @@ class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
|
||||
"""
|
||||
The following line is to satisfy the type checker, which
|
||||
|
||||
1. Can't recognize an appropriately re-typed constructor arg like
|
||||
1. Can't recognize an appropriately re-typed constructor arg like::
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -20,7 +20,8 @@ class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
|
||||
...
|
||||
): ...
|
||||
|
||||
This *does* match the parent generic for the U=Path, R=bytes context
|
||||
This *does* match the parent generic for the ``U=Path``, ``R=bytes``
|
||||
context::
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -32,19 +33,17 @@ class DiskDataset[T: NamedTuple](HomogenousDataset[Path, bytes, T]):
|
||||
|
||||
2. "Lifted" type variables out of generics can't be used as upper bounds,
|
||||
at least not without throwing type checker warnings (thanks to PEP695).
|
||||
So I'm not allowed to have
|
||||
So I'm not allowed to have::
|
||||
|
||||
```
|
||||
class BatchedDataset[U, R, D: Domain[U, R]]:
|
||||
...
|
||||
```
|
||||
|
||||
which could bring appropriately dynamic typing for ``Domain``s, but is
|
||||
which could bring appropriately dynamic typing for ``Domains``, but is
|
||||
not a sufficiently concrete upper bound.
|
||||
|
||||
So: we settle for a class-level type declaration, which despite not being
|
||||
technically appropriately scoped, it's not harming anything and satisfies
|
||||
``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``.
|
||||
``ty`` type checks downstream (e.g., when we access ``DiskDomain.root``).
|
||||
"""
|
||||
|
||||
domain: DiskDomain
|
||||
|
||||
@@ -1,24 +1,5 @@
|
||||
"""
|
||||
Defines a knowledge domain. Wraps a Dataset / Simulator / Knowledge
|
||||
|
||||
Downstream exploration might include
|
||||
|
||||
- Calibrating Simulator / Knowledge with a Dataset
|
||||
- Amending Dataset with Simulator / Knowledge
|
||||
- Positioning Knowledge within Simulator context
|
||||
* Where to replace Simulator subsystem with Knowledge?
|
||||
|
||||
Other variations:
|
||||
|
||||
- Multi-fidelity simulators
|
||||
- Multi-scale models
|
||||
- Multi-system
|
||||
- Incomplete knowledge / divergence among sources
|
||||
|
||||
Questions:
|
||||
|
||||
- Should Simulator / Knowledge be unified as one (e.g., "Expert")
|
||||
|
||||
Generic URI-resource mapping structure
|
||||
"""
|
||||
|
||||
from collections.abc import Mapping, Iterator, Sequence
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
"""
|
||||
Base class for trainable models
|
||||
|
||||
Development note
|
||||
|
||||
I'd rather lay out bare args and kwargs in the estimator methods, but the
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Core interface for training ``Estimators`` with ``Datasets``
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Transform base for dataset records
|
||||
"""
|
||||
|
||||
class Transform[I]:
|
||||
"""
|
||||
Dataset transform base class.
|
||||
|
||||
Reference in New Issue
Block a user