fix trainer epoch logging
This commit is contained in:
11
TODO.md
Normal file
11
TODO.md
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Long-term
|
||||||
|
- Implement a dataloader in-house, with a clear, lightweight mechanism for
|
||||||
|
collection-of-structures to structure-of-collections. For multi-proc handling
|
||||||
|
(happens in torch's dataloader, as well as the BatchedDataset for two
|
||||||
|
different purposes), we should rely on (a hopefully more stable) `execlib`.
|
||||||
|
- `Domains` may be externalized (`co3` or `convlib`)
|
||||||
|
- Up next: CLI, full JSON-ification of model selection + train.
|
||||||
|
- Consider a "multi-train" alternative (or arg support in `train()`) for
|
||||||
|
training many "rollouts" from the same base estimator (basically forks under
|
||||||
|
different seeds). For architecture benchmarking above all, seeing average
|
||||||
|
training behavior. Consider corresponding `Plotter` methods (error bars)
|
||||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "trainlib"
|
name = "trainlib"
|
||||||
version = "0.1.2"
|
version = "0.2.0"
|
||||||
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
description = "Minimal framework for ML modeling. Supports advanced dataset operations and streamlined training."
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
authors = [
|
authors = [
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ class MLP[Kw: MLPKwargs](Estimator[Kw]):
|
|||||||
mae = F.l1_loss(predictions, labels).item()
|
mae = F.l1_loss(predictions, labels).item()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"mse": loss,
|
# "mse": loss,
|
||||||
"mae": mae,
|
"mae": mae,
|
||||||
"grad_norm": get_grad_norm(self)
|
"grad_norm": get_grad_norm(self)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ class Plotter[Kw: EstimatorKwargs]:
|
|||||||
intervals broken over the training epochs at 0, 50, 100, 150, ... and
|
intervals broken over the training epochs at 0, 50, 100, 150, ... and
|
||||||
highlight the best one, even if that's not actually the single best
|
highlight the best one, even if that's not actually the single best
|
||||||
epoch)
|
epoch)
|
||||||
|
- Implement data and dimension limits, in case dataloaders have
|
||||||
|
huge numbers of samples or labels are high-dimensional
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -255,6 +257,12 @@ class Plotter[Kw: EstimatorKwargs]:
|
|||||||
|
|
||||||
return fig, axes
|
return fig, axes
|
||||||
|
|
||||||
|
# def plot_ordered(...): ...
|
||||||
|
# """
|
||||||
|
# Simple ordered view of output dimensions, with actual and output
|
||||||
|
# overlaid.
|
||||||
|
# """
|
||||||
|
|
||||||
def plot_actual_output(
|
def plot_actual_output(
|
||||||
self,
|
self,
|
||||||
row_size: int | float = 2,
|
row_size: int | float = 2,
|
||||||
@@ -457,12 +465,12 @@ class Plotter[Kw: EstimatorKwargs]:
|
|||||||
combine_metrics: bool = False,
|
combine_metrics: bool = False,
|
||||||
transpose_layout: bool = False,
|
transpose_layout: bool = False,
|
||||||
figure_kwargs: SubplotsKwargs | None = None,
|
figure_kwargs: SubplotsKwargs | None = None,
|
||||||
):
|
) -> tuple[plt.Figure, AxesArray]:
|
||||||
session_map = self.trainer._event_log
|
session_map = self.trainer._event_log
|
||||||
session_name = session_name or next(iter(session_map))
|
session_name = session_name or next(iter(session_map))
|
||||||
groups = session_map[session_name]
|
groups = session_map[session_name]
|
||||||
num_metrics = len(groups[next(iter(groups))])
|
num_metrics = len(groups[next(iter(groups))])
|
||||||
colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
|
# colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
|
||||||
|
|
||||||
rows = 1 if combine_groups else len(groups)
|
rows = 1 if combine_groups else len(groups)
|
||||||
cols = 1 if combine_metrics else num_metrics
|
cols = 1 if combine_metrics else num_metrics
|
||||||
@@ -513,6 +521,8 @@ class Plotter[Kw: EstimatorKwargs]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
ax.set_title(f"[{title_prefix}] Metrics over epochs")
|
ax.set_title(f"[{title_prefix}] Metrics over epochs")
|
||||||
ax.set_xlabel("epoch", fontstyle='italic')
|
ax.set_xlabel("epoch")
|
||||||
ax.set_ylabel("value", fontstyle='italic')
|
ax.set_ylabel("value")
|
||||||
ax.legend()
|
ax.legend()
|
||||||
|
|
||||||
|
return fig, axes
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ class Trainer[I, Kw: EstimatorKwargs]:
|
|||||||
Set initial tracking parameters for the primary training loop.
|
Set initial tracking parameters for the primary training loop.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self._epoch: int = 1
|
self._epoch: int = 0
|
||||||
self._summary = defaultdict(lambda: defaultdict(list))
|
self._summary = defaultdict(lambda: defaultdict(list))
|
||||||
|
|
||||||
self._conv_loss = float("inf")
|
self._conv_loss = float("inf")
|
||||||
@@ -231,7 +231,9 @@ class Trainer[I, Kw: EstimatorKwargs]:
|
|||||||
for metric_name, metric_value in estimator_metrics.items():
|
for metric_name, metric_value in estimator_metrics.items():
|
||||||
self._log_event(label, metric_name, metric_value)
|
self._log_event(label, metric_name, metric_value)
|
||||||
|
|
||||||
return loss_sums
|
avg_losses = [loss_sum / (i+1) for loss_sum in loss_sums]
|
||||||
|
|
||||||
|
return avg_losses
|
||||||
|
|
||||||
def _eval_loaders(
|
def _eval_loaders(
|
||||||
self,
|
self,
|
||||||
@@ -252,6 +254,24 @@ class Trainer[I, Kw: EstimatorKwargs]:
|
|||||||
``get_batch_outputs()`` or ``get_batch_metrics()`` while iterating over
|
``get_batch_outputs()`` or ``get_batch_metrics()`` while iterating over
|
||||||
batches. This will have no internal side effects and provides much more
|
batches. This will have no internal side effects and provides much more
|
||||||
information (just aggregated losses are provided here).
|
information (just aggregated losses are provided here).
|
||||||
|
|
||||||
|
.. admonition:: On epoch counting
|
||||||
|
|
||||||
|
Epoch counts start at 0 to allow for a sensible place to benchmark
|
||||||
|
the initial (potentially untrained/pre-trained) model before any
|
||||||
|
training data is seen. In the train loop, we increment the epoch
|
||||||
|
immediately, and all logging happens under the epoch value that's
|
||||||
|
set at the start of the iteration (rather than incrementing at the
|
||||||
|
end). Before beginning an additional training iteration, the
|
||||||
|
convergence condition in the ``while`` is effectively checking what
|
||||||
|
happened during the last epoch (the counter has not yet been
|
||||||
|
incremented); if no convergence, we begin again. (This is only
|
||||||
|
being noted because the epoch counting was previously quite
|
||||||
|
different: indexing started at ``1``, we incremented at the end of
|
||||||
|
the loop, and we didn't evaluate the model before the loop began.
|
||||||
|
This affects how we interpret plots and TensorBoard records, for
|
||||||
|
instance, so it's useful to spell out the approach clearly
|
||||||
|
somewhere given the many possible design choices here.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
train_loss = self._eval_epoch(train_loader, "train")
|
train_loss = self._eval_epoch(train_loader, "train")
|
||||||
@@ -418,9 +438,10 @@ class Trainer[I, Kw: EstimatorKwargs]:
|
|||||||
self._eval_loaders(train_loader, val_loader, aux_loaders)
|
self._eval_loaders(train_loader, val_loader, aux_loaders)
|
||||||
optimizers = self.estimator.optimizers(lr=lr, eps=eps)
|
optimizers = self.estimator.optimizers(lr=lr, eps=eps)
|
||||||
|
|
||||||
while self._epoch <= max_epochs and not self._converged(
|
while self._epoch < max_epochs and not self._converged(
|
||||||
self._epoch, stop_after_epochs
|
self._epoch, stop_after_epochs
|
||||||
):
|
):
|
||||||
|
self._epoch += 1
|
||||||
train_frac = f"{self._epoch}/{max_epochs}"
|
train_frac = f"{self._epoch}/{max_epochs}"
|
||||||
stag_frac = f"{self._stagnant_epochs}/{stop_after_epochs}"
|
stag_frac = f"{self._stagnant_epochs}/{stop_after_epochs}"
|
||||||
print(f"Training epoch {train_frac}...")
|
print(f"Training epoch {train_frac}...")
|
||||||
@@ -434,20 +455,21 @@ class Trainer[I, Kw: EstimatorKwargs]:
|
|||||||
train_loss, val_loss, _ = self._eval_loaders(
|
train_loss, val_loss, _ = self._eval_loaders(
|
||||||
train_loader, val_loader, aux_loaders
|
train_loader, val_loader, aux_loaders
|
||||||
)
|
)
|
||||||
self._conv_loss = sum(val_loss) if val_loss else sum(train_loss)
|
# determine loss to use for measuring convergence
|
||||||
|
conv_loss = val_loss if val_loss else train_loss
|
||||||
|
self._conv_loss = sum(conv_loss) / len(conv_loss)
|
||||||
|
|
||||||
if self._epoch % summarize_every == 0:
|
if self._epoch % summarize_every == 0:
|
||||||
self._summarize()
|
self._summarize()
|
||||||
if self._epoch % chkpt_every == 0:
|
if self._epoch % chkpt_every == 0:
|
||||||
self.save_model()
|
self.save_model()
|
||||||
self._epoch += 1
|
|
||||||
|
|
||||||
return self.estimator
|
return self.estimator
|
||||||
|
|
||||||
def _converged(self, epoch: int, stop_after_epochs: int) -> bool:
|
def _converged(self, epoch: int, stop_after_epochs: int) -> bool:
|
||||||
converged = False
|
converged = False
|
||||||
|
|
||||||
if epoch == 1 or self._conv_loss < self._best_val_loss:
|
if epoch == 0 or self._conv_loss < self._best_val_loss:
|
||||||
self._best_val_loss = self._conv_loss
|
self._best_val_loss = self._conv_loss
|
||||||
self._stagnant_epochs = 0
|
self._stagnant_epochs = 0
|
||||||
self._best_model_state_dict = deepcopy(self.estimator.state_dict())
|
self._best_model_state_dict = deepcopy(self.estimator.state_dict())
|
||||||
|
|||||||
Reference in New Issue
Block a user