Skip to content

Train CNN with FSDP on MNISTβš“οΈŽ

Key API Functions

See:

ezpz launch python3 -m ezpz.examples.fsdp

Sourceβš“οΈŽ

src/ezpz/examples/fsdp.py
src/ezpz/examples/fsdp.py
"""FSDP training example on MNIST/OpenImages/ImageNet-style datasets.

Launch with:

    ezpz launch -m ezpz.examples.fsdp --dataset MNIST --batch-size 128

Help output (``python3 -m ezpz.examples.fsdp --help``):

    usage: fsdp.py [-h] [--num-workers N]
                   [--dataset {MNIST,OpenImages,ImageNet,ImageNet1k}]
                   [--batch-size N] [--dtype D] [--test-batch-size N] [--epochs N]
                   [--lr LR] [--gamma M] [--seed S] [--save-model]
                   [--data-prefix DATA_PREFIX]

    PyTorch MNIST Example using FSDP

    options:
      -h, --help            show this help message and exit
      --num-workers N       number of data loading workers (default: 4)
      --dataset {MNIST,OpenImages,ImageNet,ImageNet1k}
                            Dataset to use (default: MNIST)
      --batch-size N        input batch size for training (default: 64)
      --dtype D             Datatype for training (default=bf16).
      --test-batch-size N   input batch size for testing (default: 1000)
      --epochs N            number of epochs to train (default: 10)
      --lr LR               learning rate (default: 1e-3)
      --gamma M             Learning rate step gamma (default: 0.7)
      --seed S              random seed (default: 1)
      --save-model          For Saving the current Model
      --data-prefix DATA_PREFIX
                            data directory prefix
"""

# Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DistributedSampler, DataLoader
import argparse
import os
from pathlib import Path
import sys
import time

import ezpz

# from ezpz.history import WANDB_DISABLED
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from ezpz.models import summarize_model
from ezpz.examples import get_example_outdir

logger = ezpz.get_logger(__name__)

try:
    import wandb
except Exception:
    wandb = None  # type: ignore


fp = Path(__file__)
fname = f"{fp.parent.stem}.{fp.stem}"
WBPROJ_NAME = f"ezpz.{fp.parent.stem}.{fp.stem}"
OUTPUT_DIR = Path(os.getcwd()).joinpath("outputs", fname)

MODEL_PRESETS = {
    "debug": {
        "conv1_channels": 8,
        "conv2_channels": 16,
        "fc_dim": 64,
    },
    "small": {
        "conv1_channels": 16,
        "conv2_channels": 32,
        "fc_dim": 128,
    },
    "medium": {
        "conv1_channels": 32,
        "conv2_channels": 64,
        "fc_dim": 256,
    },
    "large": {
        "conv1_channels": 64,
        "conv2_channels": 128,
        "fc_dim": 512,
    },
}
MODEL_PRESET_FLAGS = {
    "conv1_channels": ["--conv1-channels"],
    "conv2_channels": ["--conv2-channels"],
    "fc_dim": ["--fc-dim"],
}


class Net(nn.Module):
    """Simple CNN classifier used in the FSDP example."""

    def __init__(
        self,
        num_classes: int = 10,
        img_size: int = 28,
        conv1_channels: int = 32,
        conv2_channels: int = 64,
        fc_dim: int = 128,
    ):
        """Initialize convolutional and fully connected layers.

        Args:
            num_classes: Number of output classes for the classifier.
            img_size: Input image size (assumes square inputs).
            conv1_channels: Number of output channels for conv1.
            conv2_channels: Number of output channels for conv2.
            fc_dim: Hidden dimension for the first fully connected layer.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(1, conv1_channels, 3, 1)
        self.conv2 = nn.Conv2d(conv1_channels, conv2_channels, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        feature_size = self._feature_size(img_size, conv2_channels)
        self.fc1 = nn.Linear(feature_size, fc_dim)
        self.fc2 = nn.Linear(fc_dim, num_classes)

    @staticmethod
    def _feature_size(img_size: int, conv2_channels: int) -> int:
        conv1_size = img_size - 2
        conv2_size = conv1_size - 2
        pooled_size = conv2_size // 2
        return conv2_channels * pooled_size * pooled_size

    def forward(self, x):
        """Compute logits for input images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


@ezpz.timeitlogit(rank=ezpz.get_rank())
def train(
    model: nn.Module | DistributedDataParallel | FSDP,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    sampler: DistributedSampler | None = None,
) -> dict:
    """One epoch of training and loss aggregation across ranks.

    Args:
        model: Wrapped model (DDP/FSDP).
        train_loader: Dataloader for training set.
        optimizer: Optimizer instance.
        epoch: Current epoch index.
        sampler: Optional distributed sampler to set epoch.

    Returns:
        Dict with epoch, wall-clock duration, and averaged train loss.
    """
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    model.train()
    ddp_loss = torch.zeros(2).to(device)
    if sampler:
        sampler.set_epoch(epoch)
    ezpz.distributed.synchronize()
    t0 = time.perf_counter()
    batch, target = next(iter(train_loader))
    for _, (batch, target) in enumerate(train_loader):
        batch, target = batch.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(batch)
        loss = F.nll_loss(output, target, reduction="sum")
        loss.backward()
        optimizer.step()
        ddp_loss[0] += loss.item()
        ddp_loss[1] += len(batch)
    ezpz.distributed.synchronize()
    t1 = time.perf_counter()
    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)  # type:ignore
    return {
        "epoch": epoch,
        "dt": t1 - t0,
        "train_loss": ddp_loss[0] / ddp_loss[1],
    }


@ezpz.timeitlogit(rank=ezpz.get_rank())
def test(model, test_loader):
    """Evaluate model on validation data and gather metrics."""
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    model.eval()
    # correct = 0
    ddp_loss = torch.zeros(3).to(device)
    with torch.no_grad():
        for batch, target in test_loader:
            batch, target = batch.to(device), target.to(device)
            output = model(batch)
            ddp_loss[0] += F.nll_loss(output, target, reduction="sum")
            pred = output.argmax(
                dim=1, keepdim=True
            )  # get the index of the max log-probability
            ddp_loss[1] += pred.eq(target.view_as(pred)).sum()
            ddp_loss[2] += len(batch)

    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)  # type:ignore

    test_loss = ddp_loss[0] / ddp_loss[2]

    return {
        "test_loss": test_loss,
        "test_acc": 100.0 * ddp_loss[1] / ddp_loss[2],
    }


def prepare_model_optimizer_and_scheduler(args: argparse.Namespace) -> dict:
    """Create the FSDP-wrapped model, optimizer, and LR scheduler."""
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    if args.dataset == "MNIST":
        num_classes = 10
        img_size = 28
    elif args.dataset == "OpenImages":
        num_classes = 600
        img_size = 224
    elif args.dataset == "ImageNet":
        num_classes = 1000
        img_size = 224
    elif args.dataset == "ImageNet1k":
        num_classes = 1000
        img_size = 224
    else:
        raise ValueError(f"Unsupported dataset: {args.dataset}")
    model = Net(
        num_classes=num_classes,
        img_size=img_size,
        conv1_channels=args.conv1_channels,
        conv2_channels=args.conv2_channels,
        fc_dim=args.fc_dim,
    ).to(device)
    logger.info(f"\n{summarize_model(model, verbose=False, depth=2)}")
    dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "bfloat16": torch.bfloat16,
        "fp32": torch.float32,
    }
    dtype = dtypes[args.dtype]
    model = FSDP(
        model,
        device_id=device,
        mixed_precision=MixedPrecision(
            param_dtype=dtype,
            cast_forward_inputs=True,
        ),
    )
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    logger.info(f"{model=}")
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    return {
        "model": model,
        "optimizer": optimizer,
        "scheduler": scheduler,
    }


def get_data(args: argparse.Namespace) -> dict:
    """Load train/test datasets according to args.dataset."""
    # data_prefix_fallback = Path(os.getcwd()).joinpath(
    #     ".cache", "ezpz", "data", f"{args.dataset.lower()}"
    # )
    # data_prefix = args.data_prefix or data_prefix_fallback
    # if args.dataset == "MNIST":
    #     from ezpz.data.vision import get_mnist
    #
    #     data = get_mnist(
    #         outdir=Path(data_prefix),
    #         train_batch_size=args.batch_size,
    #         test_batch_size=args.test_batch_size,
    #         pin_memory=True,
    #         num_workers=args.num_workers,
    #     )
    # else:
    #     raise ValueError(f"Unsupported dataset: {args.dataset}")
    data_prefix_fallback = Path(os.getcwd()).joinpath(
        ".cache", "ezpz", "data", f"{args.dataset.lower()}"
    )
    data_prefix = args.data_prefix or data_prefix_fallback
    if args.dataset == "MNIST":
        from ezpz.data.vision import get_mnist

        data = get_mnist(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            pin_memory=True,
            num_workers=args.num_workers,
        )

    elif args.dataset == "ImageNet1k":
        from ezpz.data.vision import get_imagenet1k

        data = get_imagenet1k(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            pin_memory=True,
            num_workers=args.num_workers,
        )

    elif args.dataset == "OpenImages":
        from ezpz.data.vision import get_openimages

        data = get_openimages(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=args.num_workers,
        )
    elif args.dataset == "ImageNet":
        from ezpz.data.vision import get_imagenet

        data = get_imagenet(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=args.num_workers,
        )
    else:
        raise ValueError(f"Unsupported dataset: {args.dataset}")

    return data


@ezpz.timeitlogit(rank=ezpz.get_rank())
def fsdp_main(args: argparse.Namespace) -> None:
    """Main training loop orchestrating data, model, and logging."""
    t0 = time.perf_counter()
    rank = ezpz.setup_torch(seed=args.seed)
    t_setup = time.perf_counter()
    if rank == 0:
        # try:
        fp = Path(__file__)
        run = ezpz.setup_wandb(project_name=f"ezpz.{fp.parent.stem}.{fp.stem}")
        if run is not None and wandb is not None and run is wandb.run:
            run.config.update({"args": {**vars(args)}})
            run.config.update({"ezpz.dist": {**ezpz.get_dist_info()}})

    data = get_data(args)
    ezpz.distributed.barrier()
    train_loader = data["train"]["loader"]
    test_loader = data["test"]["loader"]

    tmp = prepare_model_optimizer_and_scheduler(args)
    model = tmp["model"]
    optimizer = tmp["optimizer"]
    scheduler = tmp["scheduler"]

    outdir = get_example_outdir(WBPROJ_NAME)
    logger.info("Outputs will be saved to %s", outdir)
    metrics_path = outdir.joinpath(f"metrics-{rank}.jsonl")
    outdir.mkdir(parents=True, exist_ok=True)
    history = ezpz.history.History(
        report_dir=outdir,
        report_enabled=(rank == 0),
        jsonl_path=metrics_path,
        # jsonl_overwrite=True,
        distributed_history=(
            1 < ezpz.get_world_size() <= 384  # and not config.pytorch_profiler
        ),
    )
    start = time.perf_counter()
    for epoch in range(1, args.epochs + 1):
        train_metrics = train(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            epoch=epoch,
            sampler=data["train"]["sampler"],
        )
        test_metrics = test(model, test_loader)
        scheduler.step()
        logger.info(history.update({**train_metrics, **test_metrics}))

    train_end = time.perf_counter()
    logger.info(
        " ".join(
            [
                f"{args.epochs + 1} epochs took",
                f"{train_end - start:.1f}s",
            ]
        )
    )
    timings = {
        "main/setup_torch": t_setup - t0,
        "main/train": train_end - start,
        "main/total": train_end - t0,
        "timings/training_start": start - t0,
        "timings/train_duration": train_end - start,
        "timings/end-to-end": train_end - t0,
    }
    logger.info("Timings: %s", timings)
    if wandb is not None and getattr(wandb, "run", None) is not None:
        try:
            wandb.log(
                {
                    (f"timings/{k}" if not k.startswith("timings/") else k): v
                    for k, v in timings.items()
                }
            )
        except Exception:
            logger.warning("Failed to log timings to wandb")
    ezpz.distributed.barrier()

    if args.save_model:
        ezpz.distributed.barrier()  # wait for slowpokes
        states = model.state_dict()
        if rank == 0:
            torch.save(states, "mnist_cnn.pt")

    if rank == 0:
        dataset = history.finalize(
            run_name=WBPROJ_NAME,
            dataset_fname="train",
        )
        logger.info(f"{dataset=}")


def _arg_provided(argv: list[str], flags: list[str]) -> bool:
    return any(flag in argv for flag in flags)


def apply_model_preset(args: argparse.Namespace, argv: list[str]) -> None:
    if args.model is None:
        return
    preset = MODEL_PRESETS[args.model]
    for field_name, value in preset.items():
        flags = MODEL_PRESET_FLAGS.get(field_name, [])
        if not _arg_provided(argv, flags):
            setattr(args, field_name, value)


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """CLI parser for the FSDP example."""
    if argv is None:
        argv = sys.argv[1:]
    parser = argparse.ArgumentParser(
        description="PyTorch MNIST Example using FSDP"
    )
    parser.add_argument(
        "--num-workers",
        type=int,
        default=0,
        metavar="N",
        help="number of data loading workers (default: 4)",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="MNIST",
        choices=["MNIST", "OpenImages", "ImageNet", "ImageNet1k"],
        help="Dataset to use (default: MNIST)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        choices=sorted(MODEL_PRESETS.keys()),
        help="Model size preset (overrides conv/fc defaults)",
    )
    parser.add_argument(
        "--conv1-channels",
        type=int,
        default=32,
        metavar="N",
        help="Number of output channels in conv1",
    )
    parser.add_argument(
        "--conv2-channels",
        type=int,
        default=64,
        metavar="N",
        help="Number of output channels in conv2",
    )
    parser.add_argument(
        "--fc-dim",
        type=int,
        default=128,
        metavar="N",
        help="Hidden dimension for the first linear layer",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="bf16",
        metavar="D",
        help="Datatype for training (default=bf16).",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=1e-3,
        metavar="LR",
        help="learning rate (default: 1e-3)",
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        metavar="S",
        help="random seed (default: 1)",
    )
    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="For Saving the current Model",
    )
    parser.add_argument(
        "--data-prefix",
        type=str,
        required=False,
        default=None,
        help="data directory prefix",
    )
    args = parser.parse_args(argv)
    apply_model_preset(args, argv)
    return args


if __name__ == "__main__":
    args = parse_args()
    fsdp_main(args=args)
    ezpz.cleanup()

Code Walkthroughβš“οΈŽ

Imports

The FSDP and MixedPrecision imports enable fully-sharded data parallelism with optional half-precision compute, which is the core distribution strategy this example demonstrates. ezpz replaces the manual init_process_group / device-selection boilerplate so the same script works on CUDA, XPU, and MPS without changes. summarize_model and get_example_outdir are convenience helpers for logging parameter counts and writing outputs to a timestamped directory.

src/ezpz/examples/fsdp.py
# Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DistributedSampler, DataLoader
import argparse
import os
from pathlib import Path
import sys
import time

import ezpz

# from ezpz.history import WANDB_DISABLED
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from ezpz.models import summarize_model
from ezpz.examples import get_example_outdir

logger = ezpz.get_logger(__name__)

try:
    import wandb
except Exception:
    wandb = None  # type: ignore
Model Presets

Named presets (debug, small, medium, large) let users scale the CNN architecture from the command line with --model <preset>. Each preset bundles conv1_channels, conv2_channels, and fc_dim so you can quickly compare FSDP overhead at different model sizes without manually tuning individual flags. Any CLI flag the user passes explicitly overrides the preset value.

src/ezpz/examples/fsdp.py
MODEL_PRESETS = {
    "debug": {
        "conv1_channels": 8,
        "conv2_channels": 16,
        "fc_dim": 64,
    },
    "small": {
        "conv1_channels": 16,
        "conv2_channels": 32,
        "fc_dim": 128,
    },
    "medium": {
        "conv1_channels": 32,
        "conv2_channels": 64,
        "fc_dim": 256,
    },
    "large": {
        "conv1_channels": 64,
        "conv2_channels": 128,
        "fc_dim": 512,
    },
}
Net -- CNN Architecture

A two-layer convolutional network with dropout and two fully connected layers. _feature_size computes the flattened dimension after convolutions and pooling so the first linear layer is sized correctly.

src/ezpz/examples/fsdp.py
class Net(nn.Module):
    """Simple CNN classifier used in the FSDP example."""

    def __init__(
        self,
        num_classes: int = 10,
        img_size: int = 28,
        conv1_channels: int = 32,
        conv2_channels: int = 64,
        fc_dim: int = 128,
    ):
        """Initialize convolutional and fully connected layers.

        Args:
            num_classes: Number of output classes for the classifier.
            img_size: Input image size (assumes square inputs).
            conv1_channels: Number of output channels for conv1.
            conv2_channels: Number of output channels for conv2.
            fc_dim: Hidden dimension for the first fully connected layer.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(1, conv1_channels, 3, 1)
        self.conv2 = nn.Conv2d(conv1_channels, conv2_channels, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        feature_size = self._feature_size(img_size, conv2_channels)
        self.fc1 = nn.Linear(feature_size, fc_dim)
        self.fc2 = nn.Linear(fc_dim, num_classes)

    @staticmethod
    def _feature_size(img_size: int, conv2_channels: int) -> int:
        conv1_size = img_size - 2
        conv2_size = conv1_size - 2
        pooled_size = conv2_size // 2
        return conv2_channels * pooled_size * pooled_size

    def forward(self, x):
        """Compute logits for input images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
train -- Single-Epoch Training

Runs one training epoch, accumulating loss across batches. After the loop, dist.all_reduce sums the loss and sample count across all ranks so every worker sees the global average.

src/ezpz/examples/fsdp.py
@ezpz.timeitlogit(rank=ezpz.get_rank())
def train(
    model: nn.Module | DistributedDataParallel | FSDP,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    sampler: DistributedSampler | None = None,
) -> dict:
    """One epoch of training and loss aggregation across ranks.

    Args:
        model: Wrapped model (DDP/FSDP).
        train_loader: Dataloader for training set.
        optimizer: Optimizer instance.
        epoch: Current epoch index.
        sampler: Optional distributed sampler to set epoch.

    Returns:
        Dict with epoch, wall-clock duration, and averaged train loss.
    """
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    model.train()
    ddp_loss = torch.zeros(2).to(device)
    if sampler:
        sampler.set_epoch(epoch)
    ezpz.distributed.synchronize()
    t0 = time.perf_counter()
    batch, target = next(iter(train_loader))
    for _, (batch, target) in enumerate(train_loader):
        batch, target = batch.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(batch)
        loss = F.nll_loss(output, target, reduction="sum")
        loss.backward()
        optimizer.step()
        ddp_loss[0] += loss.item()
        ddp_loss[1] += len(batch)
    ezpz.distributed.synchronize()
    t1 = time.perf_counter()
    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)  # type:ignore
    return {
        "epoch": epoch,
        "dt": t1 - t0,
        "train_loss": ddp_loss[0] / ddp_loss[1],
    }
test -- Evaluation

Evaluates the model on validation data with gradients disabled. Tracks loss, correct predictions, and total samples, then all-reduces across ranks.

src/ezpz/examples/fsdp.py
@ezpz.timeitlogit(rank=ezpz.get_rank())
def test(model, test_loader):
    """Evaluate model on validation data and gather metrics."""
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    model.eval()
    # correct = 0
    ddp_loss = torch.zeros(3).to(device)
    with torch.no_grad():
        for batch, target in test_loader:
            batch, target = batch.to(device), target.to(device)
            output = model(batch)
            ddp_loss[0] += F.nll_loss(output, target, reduction="sum")
            pred = output.argmax(
                dim=1, keepdim=True
            )  # get the index of the max log-probability
            ddp_loss[1] += pred.eq(target.view_as(pred)).sum()
            ddp_loss[2] += len(batch)

    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)  # type:ignore

    test_loss = ddp_loss[0] / ddp_loss[2]

    return {
        "test_loss": test_loss,
        "test_acc": 100.0 * ddp_loss[1] / ddp_loss[2],
    }
prepare_model_optimizer_and_scheduler -- FSDP Wrapping

Creates the Net model, wraps it with FullyShardedDataParallel using mixed-precision settings, and returns the model, optimizer, and LR scheduler.

src/ezpz/examples/fsdp.py
def prepare_model_optimizer_and_scheduler(args: argparse.Namespace) -> dict:
    """Create the FSDP-wrapped model, optimizer, and LR scheduler."""
    device_type = ezpz.distributed.get_torch_device_type()
    device = (
        torch.device("cpu")
        if device_type == "cpu"
        else torch.device(f"{device_type}:{ezpz.distributed.get_local_rank()}")
    )
    if args.dataset == "MNIST":
        num_classes = 10
        img_size = 28
    elif args.dataset == "OpenImages":
        num_classes = 600
        img_size = 224
    elif args.dataset == "ImageNet":
        num_classes = 1000
        img_size = 224
    elif args.dataset == "ImageNet1k":
        num_classes = 1000
        img_size = 224
    else:
        raise ValueError(f"Unsupported dataset: {args.dataset}")
    model = Net(
        num_classes=num_classes,
        img_size=img_size,
        conv1_channels=args.conv1_channels,
        conv2_channels=args.conv2_channels,
        fc_dim=args.fc_dim,
    ).to(device)
    logger.info(f"\n{summarize_model(model, verbose=False, depth=2)}")
    dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "bfloat16": torch.bfloat16,
        "fp32": torch.float32,
    }
    dtype = dtypes[args.dtype]
    model = FSDP(
        model,
        device_id=device,
        mixed_precision=MixedPrecision(
            param_dtype=dtype,
            cast_forward_inputs=True,
        ),
    )
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    logger.info(f"{model=}")
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    return {
        "model": model,
        "optimizer": optimizer,
        "scheduler": scheduler,
    }
get_data -- Data Loading

Dispatches to dataset-specific loaders (get_mnist, get_imagenet1k, get_openimages, get_imagenet) from ezpz.data.vision based on the --dataset flag.

src/ezpz/examples/fsdp.py
def get_data(args: argparse.Namespace) -> dict:
    """Load train/test datasets according to args.dataset."""
    data_prefix_fallback = Path(os.getcwd()).joinpath(
        ".cache", "ezpz", "data", f"{args.dataset.lower()}"
    )
    data_prefix = args.data_prefix or data_prefix_fallback
    if args.dataset == "MNIST":
        from ezpz.data.vision import get_mnist

        data = get_mnist(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            pin_memory=True,
            num_workers=args.num_workers,
        )

    elif args.dataset == "ImageNet1k":
        from ezpz.data.vision import get_imagenet1k

        data = get_imagenet1k(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            pin_memory=True,
            num_workers=args.num_workers,
        )

    elif args.dataset == "OpenImages":
        from ezpz.data.vision import get_openimages

        data = get_openimages(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=args.num_workers,
        )
    elif args.dataset == "ImageNet":
        from ezpz.data.vision import get_imagenet

        data = get_imagenet(
            outdir=Path(data_prefix),
            train_batch_size=args.batch_size,
            test_batch_size=args.test_batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=args.num_workers,
        )
    else:
        raise ValueError(f"Unsupported dataset: {args.dataset}")

    return data
fsdp_main -- Main Function

Orchestrates the full training run: initializes distributed training with ezpz.setup_torch, optionally sets up Weights & Biases logging, loads data, prepares the FSDP-wrapped model, and runs the epoch loop.

src/ezpz/examples/fsdp.py
@ezpz.timeitlogit(rank=ezpz.get_rank())
def fsdp_main(args: argparse.Namespace) -> None:
    """Main training loop orchestrating data, model, and logging."""
    t0 = time.perf_counter()
    rank = ezpz.setup_torch(seed=args.seed)
    t_setup = time.perf_counter()
    if rank == 0:
        # try:
        fp = Path(__file__)
        run = ezpz.setup_wandb(project_name=f"ezpz.{fp.parent.stem}.{fp.stem}")
        if run is not None and wandb is not None and run is wandb.run:
            run.config.update({"args": {**vars(args)}})
            run.config.update({"ezpz.dist": {**ezpz.get_dist_info()}})

    data = get_data(args)
    ezpz.distributed.barrier()
    train_loader = data["train"]["loader"]
    test_loader = data["test"]["loader"]

    tmp = prepare_model_optimizer_and_scheduler(args)
    model = tmp["model"]
    optimizer = tmp["optimizer"]
    scheduler = tmp["scheduler"]

An ezpz.history.History object tracks per-epoch metrics and optionally writes them to JSONL. The epoch loop calls train, test, and scheduler.step each iteration.

src/ezpz/examples/fsdp.py
    outdir = get_example_outdir(WBPROJ_NAME)
    logger.info("Outputs will be saved to %s", outdir)
    metrics_path = outdir.joinpath(f"metrics-{rank}.jsonl")
    outdir.mkdir(parents=True, exist_ok=True)
    history = ezpz.history.History(
        report_dir=outdir,
        report_enabled=(rank == 0),
        jsonl_path=metrics_path,
        # jsonl_overwrite=True,
        distributed_history=(
            1 < ezpz.get_world_size() <= 384  # and not config.pytorch_profiler
        ),
    )
    start = time.perf_counter()
    for epoch in range(1, args.epochs + 1):
        train_metrics = train(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            epoch=epoch,
            sampler=data["train"]["sampler"],
        )
        test_metrics = test(model, test_loader)
        scheduler.step()
        logger.info(history.update({**train_metrics, **test_metrics}))

After training completes, timings are logged (and optionally sent to W&B), the model checkpoint is saved if --save-model was passed, and history.finalize writes the final report on rank 0.

src/ezpz/examples/fsdp.py
    train_end = time.perf_counter()
    logger.info(
        " ".join(
            [
                f"{args.epochs + 1} epochs took",
                f"{train_end - start:.1f}s",
            ]
        )
    )
    timings = {
        "main/setup_torch": t_setup - t0,
        "main/train": train_end - start,
        "main/total": train_end - t0,
        "timings/training_start": start - t0,
        "timings/train_duration": train_end - start,
        "timings/end-to-end": train_end - t0,
    }
    logger.info("Timings: %s", timings)
    if wandb is not None and getattr(wandb, "run", None) is not None:
        try:
            wandb.log(
                {
                    (f"timings/{k}" if not k.startswith("timings/") else k): v
                    for k, v in timings.items()
                }
            )
        except Exception:
            logger.warning("Failed to log timings to wandb")
    ezpz.distributed.barrier()

    if args.save_model:
        ezpz.distributed.barrier()  # wait for slowpokes
        states = model.state_dict()
        if rank == 0:
            torch.save(states, "mnist_cnn.pt")

    if rank == 0:
        dataset = history.finalize(
            run_name=WBPROJ_NAME,
            dataset_fname="train",
        )
        logger.info(f"{dataset=}")
Entrypoint

Parses CLI arguments, runs fsdp_main, and calls ezpz.cleanup() to tear down the process group.

src/ezpz/examples/fsdp.py
if __name__ == "__main__":
    args = parse_args()
    fsdp_main(args=args)
    ezpz.cleanup()

Helpβš“οΈŽ

--help
$ python3 -m ezpz.examples.fsdp --help
usage: fsdp.py [-h] [--num-workers N]
            [--dataset {MNIST,OpenImages,ImageNet,ImageNet1k}]
            [--batch-size N] [--dtype D] [--test-batch-size N] [--epochs N]
            [--lr LR] [--gamma M] [--seed S] [--save-model]
            [--data-prefix DATA_PREFIX]

PyTorch MNIST Example using FSDP

options:
-h, --help            show this help message and exit
--num-workers N       number of data loading workers (default: 4)
--dataset {MNIST,OpenImages,ImageNet,ImageNet1k}
                        Dataset to use (default: MNIST)
--batch-size N        input batch size for training (default: 64)
--dtype D             Datatype for training (default=bf16).
--test-batch-size N   input batch size for testing (default: 1000)
--epochs N            number of epochs to train (default: 10)
--lr LR               learning rate (default: 1e-3)
--gamma M             Learning rate step gamma (default: 0.7)
--seed S              random seed (default: 1)
--save-model          For Saving the current Model
--data-prefix DATA_PREFIX
                        data directory prefix

Outputβš“οΈŽ

Output on Sunspot
$ ezpz launch python3 -m ezpz.examples.fsdp

[2025-12-31 12:21:21,523041][I][ezpz/launch:396:launch] ----[πŸ‹ ezpz.launch][started][2025-12-31-122121]----
[2025-12-31 12:21:22,375537][I][ezpz/launch:416:launch] Job ID: 12458339
[2025-12-31 12:21:22,376302][I][ezpz/launch:417:launch] nodelist: ['x1921c0s3b0n0', 'x1921c0s7b0n0']
[2025-12-31 12:21:22,376691][I][ezpz/launch:418:launch] hostfile: /var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov
[2025-12-31 12:21:22,377360][I][ezpz/pbs:264:get_pbs_launch_cmd] βœ… Using [24/24] GPUs [2 hosts] x [12 GPU/host]
[2025-12-31 12:21:22,378079][I][ezpz/launch:367:build_executable] Building command to execute by piecing together:
[2025-12-31 12:21:22,378474][I][ezpz/launch:368:build_executable] (1.) launch_cmd: mpiexec --envall --np=24 --ppn=12 --hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov --no-vni --cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96
[2025-12-31 12:21:22,379293][I][ezpz/launch:369:build_executable] (2.) cmd_to_launch: python3 -m ezpz.examples.fsdp
[2025-12-31 12:21:22,380037][I][ezpz/launch:433:launch] Took: 1.45 seconds to build command.
[2025-12-31 12:21:22,380393][I][ezpz/launch:436:launch] Executing:
mpiexec
  --envall
  --np=24
  --ppn=12
  --hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov
  --no-vni
  --cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96
  python3
  -m
  ezpz.examples.fsdp
[2025-12-31 12:21:22,381628][I][ezpz/launch:443:launch] Execution started @ 2025-12-31-122122...
[2025-12-31 12:21:22,382071][I][ezpz/launch:139:run_command] Running command:
 mpiexec --envall --np=24 --ppn=12 --hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov --no-vni --cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96 python3 -m ezpz.examples.fsdp
cpubind:list x1921c0s7b0n0 pid 111174 rank 12 0: mask 0x1c
cpubind:list x1921c0s7b0n0 pid 111175 rank 13 1: mask 0x1c00
cpubind:list x1921c0s7b0n0 pid 111176 rank 14 2: mask 0x1c0000
cpubind:list x1921c0s7b0n0 pid 111177 rank 15 3: mask 0x1c000000
cpubind:list x1921c0s7b0n0 pid 111178 rank 16 4: mask 0x1c00000000
cpubind:list x1921c0s7b0n0 pid 111179 rank 17 5: mask 0x1c0000000000
cpubind:list x1921c0s7b0n0 pid 111180 rank 18 6: mask 0x1c0000000000000
cpubind:list x1921c0s7b0n0 pid 111181 rank 19 7: mask 0x1c000000000000000
cpubind:list x1921c0s7b0n0 pid 111182 rank 20 8: mask 0x1c00000000000000000
cpubind:list x1921c0s7b0n0 pid 111183 rank 21 9: mask 0x1c0000000000000000000
cpubind:list x1921c0s7b0n0 pid 111184 rank 22 10: mask 0x1c000000000000000000000
cpubind:list x1921c0s7b0n0 pid 111185 rank 23 11: mask 0x1c00000000000000000000000
cpubind:list x1921c0s3b0n0 pid 107043 rank 0 0: mask 0x1c
cpubind:list x1921c0s3b0n0 pid 107044 rank 1 1: mask 0x1c00
cpubind:list x1921c0s3b0n0 pid 107045 rank 2 2: mask 0x1c0000
cpubind:list x1921c0s3b0n0 pid 107046 rank 3 3: mask 0x1c000000
cpubind:list x1921c0s3b0n0 pid 107047 rank 4 4: mask 0x1c00000000
cpubind:list x1921c0s3b0n0 pid 107048 rank 5 5: mask 0x1c0000000000
cpubind:list x1921c0s3b0n0 pid 107049 rank 6 6: mask 0x1c0000000000000
cpubind:list x1921c0s3b0n0 pid 107050 rank 7 7: mask 0x1c000000000000000
cpubind:list x1921c0s3b0n0 pid 107051 rank 8 8: mask 0x1c00000000000000000
cpubind:list x1921c0s3b0n0 pid 107052 rank 9 9: mask 0x1c0000000000000000000
cpubind:list x1921c0s3b0n0 pid 107053 rank 10 10: mask 0x1c000000000000000000000
cpubind:list x1921c0s3b0n0 pid 107054 rank 11 11: mask 0x1c00000000000000000000000
[2025-12-31 12:21:26,964250][I][ezpz/dist:1501:setup_torch_distributed] Using torch_{device,backend}= {xpu, xccl}
[2025-12-31 12:21:26,967037][I][ezpz/dist:1366:setup_torch_DDP] Caught MASTER_PORT=41625 from environment!
[2025-12-31 12:21:26,967795][I][ezpz/dist:1382:setup_torch_DDP] Using torch.distributed.init_process_group with
- master_addr='x1921c0s3b0n0'
- master_port='41625'
- world_size=24
- rank=0
- local_rank=0
- timeout=datetime.timedelta(seconds=3600)
- backend='xccl'
[2025-12-31 12:21:26,968707][I][ezpz/dist:1014:init_process_group] Calling torch.distributed.init_process_group_with: rank=0 world_size=24 backend=xccl
[2025-12-31 12:21:27,619965][I][ezpz/dist:1727:setup_torch] Using device='xpu' with backend='xccl' + 'xccl' for distributed training.
[2025-12-31 12:21:27,620787][W][ezpz/dist:544:print_dist_setup] Using [24 / 24] available "xpu" devices !!
[2025-12-31 12:21:27,621230][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=00/23][local_rank=00/11]
[2025-12-31 12:21:27,620421][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=01/23][local_rank=01/11]
[2025-12-31 12:21:27,620452][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=02/23][local_rank=02/11]
[2025-12-31 12:21:27,620445][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=04/23][local_rank=04/11]
[2025-12-31 12:21:27,620450][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=05/23][local_rank=05/11]
[2025-12-31 12:21:27,620418][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=06/23][local_rank=06/11]
[2025-12-31 12:21:27,620439][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=07/23][local_rank=07/11]
[2025-12-31 12:21:27,620431][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=08/23][local_rank=08/11]
[2025-12-31 12:21:27,620400][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=09/23][local_rank=09/11]
[2025-12-31 12:21:27,620398][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=10/23][local_rank=10/11]
[2025-12-31 12:21:27,620433][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=11/23][local_rank=11/11]
[2025-12-31 12:21:27,620451][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=03/23][local_rank=03/11]
[2025-12-31 12:21:27,620523][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=12/23][local_rank=00/11]
[2025-12-31 12:21:27,620546][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=13/23][local_rank=01/11]
[2025-12-31 12:21:27,620556][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=14/23][local_rank=02/11]
[2025-12-31 12:21:27,620557][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=16/23][local_rank=04/11]
[2025-12-31 12:21:27,620568][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=15/23][local_rank=03/11]
[2025-12-31 12:21:27,620557][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=17/23][local_rank=05/11]
[2025-12-31 12:21:27,620575][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=19/23][local_rank=07/11]
[2025-12-31 12:21:27,620556][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=20/23][local_rank=08/11]
[2025-12-31 12:21:27,620560][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=21/23][local_rank=09/11]
[2025-12-31 12:21:27,620578][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=22/23][local_rank=10/11]
[2025-12-31 12:21:27,620579][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=23/23][local_rank=11/11]
[2025-12-31 12:21:27,620579][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=18/23][local_rank=06/11]
[2025-12-31 12:21:28,206982][I][ezpz/dist:2039:setup_wandb] Setting up wandb from rank=0
[2025-12-31 12:21:28,207580][I][ezpz/dist:2040:setup_wandb] Using WB_PROJECT=ezpz.examples.fsdp
wandb: Currently logged in as: foremans (aurora_gpt) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.23.1
wandb: Run data is saved locally in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/wandb/run-20251231_122128-11cqdt05
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run vivid-glade-86
wandb:  View project at https://wandb.ai/aurora_gpt/ezpz.examples.fsdp
wandb:  View run at https://wandb.ai/aurora_gpt/ezpz.examples.fsdp/runs/11cqdt05
[2025-12-31 12:21:29,790902][I][ezpz/dist:2069:setup_wandb] wandb.run=[vivid-glade-86](https://wandb.ai/aurora_gpt/ezpz.examples.fsdp/runs/11cqdt05)
[2025-12-31 12:21:29,796125][I][ezpz/dist:2112:setup_wandb] Running on machine='SunSpot'
[2025-12-31 12:21:30,092593][I][examples/fsdp:196:prepare_model_optimizer_and_scheduler] 
=================================================================
Layer (type:depth-idx)                   Param #
=================================================================
Net                                      --
β”œβ”€Conv2d: 1-1                            320
β”œβ”€Conv2d: 1-2                            18,496
β”œβ”€Dropout: 1-3                           --
β”œβ”€Dropout: 1-4                           --
β”œβ”€Linear: 1-5                            1,179,776
β”œβ”€Linear: 1-6                            1,290
=================================================================
Total params: 1,199,882
Trainable params: 1,199,882
Non-trainable params: 0
=================================================================
[2025-12-31 12:21:30,134352][I][examples/fsdp:212:prepare_model_optimizer_and_scheduler] model=FullyShardedDataParallel(
  (_fsdp_wrapped_module): Net(
    (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (dropout1): Dropout(p=0.25, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
    (fc1): Linear(in_features=9216, out_features=128, bias=True)
    (fc2): Linear(in_features=128, out_features=10, bias=True)
  )
)
[2025-12-31 12:21:30,173375][I][ezpz/history:220:__init__] Using History with distributed_history=True
2025:12:31-12:21:30:(107043) |CCL_WARN| value of CCL_OP_SYNC changed to be 1 (default:0)
2025:12:31-12:21:30:(107043) |CCL_WARN| value of CCL_PROCESS_LAUNCHER changed to be pmix (default:hydra)
[2025-12-31 12:21:55,502783][I][examples/fsdp:340:fsdp_main] epoch=1 dt=12.487221 train_loss=0.596659 test_loss=0.143485 test_acc=95.563553 dt/mean=11.990577 dt/max=12.487222 dt/min=11.897395 dt/std=0.119125 train_loss/mean=0.596659 train_loss/max=0.596659 train_loss/min=0.596659 train_loss/std=0.000173 test_loss/mean=0.143485 test_loss/max=0.143485 test_loss/min=0.143485 test_loss/std=0.000000 test_acc/mean=95.563560 test_acc/max=95.563553 test_acc/min=95.563553 test_acc/std=0.000000
[2025-12-31 12:21:55,911549][I][examples/fsdp:340:fsdp_main] epoch=2 dt=0.361235 train_loss=0.174450 test_loss=0.080361 test_acc=97.511993 dt/mean=0.365279 dt/max=0.373996 dt/min=0.355496 dt/std=0.005433 train_loss/mean=0.174450 train_loss/max=0.174450 train_loss/min=0.174450 train_loss/std=0.000000 test_loss/mean=0.080361 test_loss/max=0.080361 test_loss/min=0.080361 test_loss/std=0.000022 test_acc/mean=97.511993 test_acc/max=97.511993 test_acc/min=97.511993 test_acc/std=0.000000
[2025-12-31 12:21:56,308947][I][examples/fsdp:340:fsdp_main] epoch=3 dt=0.359641 train_loss=0.120487 test_loss=0.060764 test_acc=98.021584 dt/mean=0.358203 dt/max=0.361614 dt/min=0.353194 dt/std=0.002922 train_loss/mean=0.120487 train_loss/max=0.120487 train_loss/min=0.120487 train_loss/std=0.000000 test_loss/mean=0.060764 test_loss/max=0.060764 test_loss/min=0.060764 test_loss/std=0.000015 test_acc/mean=98.021591 test_acc/max=98.021584 test_acc/min=98.021584 test_acc/std=0.000000
[2025-12-31 12:21:56,703145][I][examples/fsdp:340:fsdp_main] epoch=4 dt=0.356608 train_loss=0.098917 test_loss=0.052346 test_acc=98.301361 dt/mean=0.356618 dt/max=0.359070 dt/min=0.353434 dt/std=0.001995 train_loss/mean=0.098917 train_loss/max=0.098917 train_loss/min=0.098917 train_loss/std=0.000000 test_loss/mean=0.052346 test_loss/max=0.052346 test_loss/min=0.052346 test_loss/std=0.000000 test_acc/mean=98.301361 test_acc/max=98.301361 test_acc/min=98.301361 test_acc/std=0.031250
[2025-12-31 12:21:57,100230][I][examples/fsdp:340:fsdp_main] epoch=5 dt=0.357687 train_loss=0.085740 test_loss=0.047243 test_acc=98.441246 dt/mean=0.356900 dt/max=0.360295 dt/min=0.352879 dt/std=0.002699 train_loss/mean=0.085740 train_loss/max=0.085740 train_loss/min=0.085740 train_loss/std=0.000000 test_loss/mean=0.047243 test_loss/max=0.047243 test_loss/min=0.047243 test_loss/std=0.000000 test_acc/mean=98.441246 test_acc/max=98.441246 test_acc/min=98.441246 test_acc/std=0.000000
[2025-12-31 12:21:57,497234][I][examples/fsdp:340:fsdp_main] epoch=6 dt=0.357410 train_loss=0.080569 test_loss=0.044845 test_acc=98.471222 dt/mean=0.356574 dt/max=0.359746 dt/min=0.353584 dt/std=0.002156 train_loss/mean=0.080569 train_loss/max=0.080569 train_loss/min=0.080569 train_loss/std=0.000000 test_loss/mean=0.044845 test_loss/max=0.044845 test_loss/min=0.044845 test_loss/std=0.000015 test_acc/mean=98.471222 test_acc/max=98.471222 test_acc/min=98.471222 test_acc/std=0.000000
[2025-12-31 12:21:57,893327][I][examples/fsdp:340:fsdp_main] epoch=7 dt=0.355675 train_loss=0.075174 test_loss=0.043703 test_acc=98.481216 dt/mean=0.356044 dt/max=0.358311 dt/min=0.353675 dt/std=0.001370 train_loss/mean=0.075174 train_loss/max=0.075174 train_loss/min=0.075174 train_loss/std=0.000022 test_loss/mean=0.043703 test_loss/max=0.043703 test_loss/min=0.043703 test_loss/std=0.000011 test_acc/mean=98.481224 test_acc/max=98.481216 test_acc/min=98.481216 test_acc/std=0.000000
[2025-12-31 12:21:58,292161][I][examples/fsdp:340:fsdp_main] epoch=8 dt=0.358490 train_loss=0.073104 test_loss=0.041848 test_acc=98.551163 dt/mean=0.359055 dt/max=0.362143 dt/min=0.355792 dt/std=0.001879 train_loss/mean=0.073104 train_loss/max=0.073104 train_loss/min=0.073104 train_loss/std=0.000022 test_loss/mean=0.041848 test_loss/max=0.041848 test_loss/min=0.041848 test_loss/std=0.000000 test_acc/mean=98.551170 test_acc/max=98.551163 test_acc/min=98.551163 test_acc/std=0.000000
[2025-12-31 12:21:58,692175][I][examples/fsdp:340:fsdp_main] epoch=9 dt=0.359963 train_loss=0.069403 test_loss=0.041198 test_acc=98.571144 dt/mean=0.360091 dt/max=0.363091 dt/min=0.356911 dt/std=0.001945 train_loss/mean=0.069403 train_loss/max=0.069403 train_loss/min=0.069403 train_loss/std=0.000022 test_loss/mean=0.041198 test_loss/max=0.041198 test_loss/min=0.041198 test_loss/std=0.000011 test_acc/mean=98.571152 test_acc/max=98.571144 test_acc/min=98.571144 test_acc/std=0.000000
[2025-12-31 12:21:59,091674][I][examples/fsdp:340:fsdp_main] epoch=10 dt=0.358637 train_loss=0.068348 test_loss=0.041941 test_acc=98.571144 dt/mean=0.358994 dt/max=0.361870 dt/min=0.356423 dt/std=0.001696 train_loss/mean=0.068348 train_loss/max=0.068348 train_loss/min=0.068348 train_loss/std=0.000000 test_loss/mean=0.041941 test_loss/max=0.041941 test_loss/min=0.041941 test_loss/std=0.000000 test_acc/mean=98.571152 test_acc/max=98.571144 test_acc/min=98.571144 test_acc/std=0.000000
[2025-12-31 12:21:59,093446][I][examples/fsdp:342:fsdp_main] 11 epochs took 28.9s
[2025-12-31 12:21:59,124624][I][ezpz/history:2385:finalize] Saving plots to /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/mplot (matplotlib) and /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot (tplot)
                     dt                                    dt/min
    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
12.5β”€β–Œ                                 β”‚11.9─-                                 β”‚
10.5β”€β–š                                 β”‚ 8.0─ -                                β”‚
    │▝▖                                β”‚ 4.2─  -                               β”‚
 8.4─ β–Œ                                β”‚ 0.4─   -------------------------------β”‚
 6.4─ ▐                                β”‚    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
 4.4─  β–Œ                               β”‚    1.0     3.2      5.5     7.8   10.0
    β”‚  β–š                               β”‚dt/min              iter
 2.4─  ▝▖                              β”‚                   dt/std
 0.4─   β–šβ–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β”‚     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜0.119─*                                β”‚
    1.0     3.2      5.5     7.8   10.0 0.099─ *                               β”‚
dt                  iter                0.060─  *                              β”‚
                   dt/mean              0.041─   *                             β”‚
    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”0.001─    *****************************β”‚
12.0─·                                 β”‚     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
10.1─·                                 β”‚     1.0     3.2     5.5     7.8   10.0
    β”‚Β·                                 β”‚dt/std              iter
 8.1─ Β·                                β”‚                   dt/max
 6.2─ Β·                                β”‚    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
    β”‚  Β·                               β”‚12.5─+                                 β”‚
 4.2─  Β·                               β”‚10.5─ +                                β”‚
 2.3─   Β·                              β”‚ 6.4─  +                               β”‚
    β”‚   Β·                              β”‚ 4.4─   +                              β”‚
 0.4─    Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·β”‚ 0.4─    ++++++++++++++++++++++++++++++β”‚
    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
    1.0     3.2      5.5     7.8   10.0     1.0     3.2      5.5     7.8   10.0
dt/mean             iter                dt/max              iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/dt.txt
    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
12.5─ ++ dt/max                                                                β”‚
    β”‚ -- dt/min                                                                β”‚
    β”‚ Β·Β· dt/mean                                                               β”‚
    β”‚ β–žβ–ž dt                                                                    β”‚
10.5─ β–Œ                                                                        β”‚
    β”‚ ▐                                                                        β”‚
    β”‚ ▝▖                                                                       β”‚
    β”‚  β–Œ                                                                       β”‚
 8.4─  ▐                                                                       β”‚
    β”‚  ▐                                                                       β”‚
    β”‚   β–Œ                                                                      β”‚
    β”‚   β–š                                                                      β”‚
 6.4─   ▐                                                                      β”‚
    β”‚    β–Œ                                                                     β”‚
    β”‚    β–š                                                                     β”‚
    β”‚    ▐                                                                     β”‚
    β”‚     β–Œ                                                                    β”‚
 4.4─     β–Œ                                                                    β”‚
    β”‚     ▐                                                                    β”‚
    β”‚     ▝▖                                                                   β”‚
    β”‚      β–Œ                                                                   β”‚
 2.4─      ▐                                                                   β”‚
    β”‚      ▝▖                                                                  β”‚
    β”‚       β–Œ                                                                  β”‚
    β”‚       ▐                                                                  β”‚
 0.4─       ▝▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄│
    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
    1.0               3.2                5.5               7.8             10.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/dt_summary.txt
               dt/mean hist                             dt/max hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
9.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚9.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
7.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚7.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
6.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚6.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
4.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚4.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
3.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚3.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
1.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚1.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                           β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                           β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ                            β–ˆβ–ˆβ–ˆβ–ˆβ”‚0.0β”€β–ˆβ–ˆβ–ˆ                            β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
  -0.2      3.0     6.2      9.3   12.5   -0.2      3.1     6.4      9.7   13.0
                dt/min hist                             dt/std hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
9.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚9.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
7.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚7.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
6.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚6.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
4.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚4.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
3.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚3.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
1.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚1.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                           β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                           β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ                            β–ˆβ–ˆβ–ˆβ–ˆβ”‚0.0β”€β–ˆβ–ˆβ–ˆ                            β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
  -0.2      3.0     6.1      9.3   12.4   -0.004   0.028   0.060    0.092 0.124
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/dt_hist.txt
                  test_acc                              test_acc/min
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
98.57─              β–—β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–žβ–€β–€β–€β–€β–€β–€β–€β”‚98.57─           ----------------------β”‚
98.07─       β–„β–„β–„β–„β–€β–€β–€β–˜                  β”‚97.57─    -------                      β”‚
     β”‚     β–„β–€                          β”‚96.57─  --                             β”‚
97.57─   β–žβ–€                            β”‚95.56─--                               β”‚
97.07─  ▐                              β”‚     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
96.57─ β–—β–˜                              β”‚     1.0     3.2     5.5     7.8   10.0
     β”‚ β–ž                               β”‚test_acc/min        iter
96.06─▐                                β”‚                 test_acc/std
95.56β”€β–Œ                                β”‚      β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜0.0312─          *                     β”‚
     1.0     3.2     5.5     7.8   10.0 0.0260─         * *                    β”‚
test_acc            iter                0.0156─        *   *                   β”‚
                test_acc/mean           0.0104─       *     *                  β”‚
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”0.0000─********      ******************β”‚
98.57─              Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·β”‚      β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
98.07─           Β·Β·Β·                   β”‚      1.0     3.2     5.5    7.8   10.0
     β”‚       Β·Β·Β·Β·                      β”‚test_acc/std         iter
97.57─    Β·Β·Β·                          β”‚                test_acc/max
97.07─   Β·                             β”‚     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β”‚  Β·                              β”‚98.57─           ++++++++++++++++++++++β”‚
96.57─  Β·                              β”‚98.07─    +++++++                      β”‚
96.06─ Β·                               β”‚97.07─   +                             β”‚
     β”‚Β·                                β”‚96.57─  +                              β”‚
95.56─·                                β”‚95.56─++                               β”‚
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0     3.2     5.5     7.8   10.0      1.0     3.2     5.5     7.8   10.0
test_acc/mean       iter                test_acc/max        iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_acc.txt
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
98.57─ ++ test_acc/max                                    β–—β–„β–„β–„β–žβ–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β”‚
     β”‚ -- test_acc/min             β–—β–„β–„β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–€β–˜Β·Β·Β·                 β”‚
     β”‚ Β·Β· test_acc/mean       β–„β–„β–„β–€β–€β–˜Β·Β·                                         β”‚
     β”‚ β–žβ–ž test_acc        β–„β–„β–€β–€                                                 β”‚
98.07─                β–„β–„β–€β–€Β·Β·                                                   β”‚
     β”‚              β–—β–žΒ·Β·Β·                                                      β”‚
     β”‚            β–—β–žβ–˜Β·                                                         β”‚
     β”‚           β–„β–˜Β·                                                           β”‚
97.57─         β–„β–€Β·                                                             β”‚
     β”‚       β–—β–€Β·                                                               β”‚
     β”‚       β–ž                                                                 β”‚
     β”‚      β–—β–˜                                                                 β”‚
97.07─      β–ž                                                                  β”‚
     β”‚     β–—β–˜                                                                  β”‚
     β”‚     β–ž                                                                   β”‚
     β”‚    β–—β–˜                                                                   β”‚
     β”‚    β–ž                                                                    β”‚
96.57─   β–—β–˜                                                                    β”‚
     β”‚   β–ž                                                                     β”‚
     β”‚  β–—β–˜                                                                     β”‚
     β”‚  β–ž                                                                      β”‚
96.06─ β–—β–˜                                                                      β”‚
     β”‚ β–ž                                                                       β”‚
     β”‚β–—β–˜                                                                       β”‚
     β”‚β–ž                                                                        β”‚
95.56β”€β–Œ                                                                        β”‚
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0               3.2               5.5               7.8             10.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_acc_summary.txt
            test_acc/mean hist                       test_acc/max hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
7.0─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚7.0─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
5.8─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚5.8─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
4.7─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚4.7─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
3.5─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚3.5─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
2.3─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚2.3─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
1.2─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚1.2─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                 β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                 β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚0.0β”€β–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
  95.4     96.2    97.1     97.9   98.7   95.4     96.2    97.1     97.9   98.7
             test_acc/min hist                       test_acc/std hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
7.0─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚9.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
5.8─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚7.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
4.7─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚6.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
3.5─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚4.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
2.3─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚3.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
1.2─                               β–ˆβ–ˆβ–ˆβ–ˆβ”‚1.5β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                 β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                           β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚0.0β”€β–ˆβ–ˆβ–ˆ                            β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜
  95.4     96.2    97.1     97.9   98.7   -0.0014  0.0071  0.0156  0.0241
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_acc_hist.txt
                  test_loss                             test_loss/min
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
0.143β”€β–Œ                                β”‚0.143─-                                β”‚
0.126─▝▖                               β”‚0.109─ --                              β”‚
     β”‚ β–š                               β”‚0.075─   -----                         β”‚
0.109─  β–Œ                              β”‚0.041─        -------------------------β”‚
0.092─  ▝▖                             β”‚     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
0.075─   ▝▖                            β”‚     1.0     3.2     5.5     7.8   10.0
     β”‚    β–β–šβ––                          β”‚test_loss/min       iter
0.058─      β–β–šβ–„β–„β–„β––                     β”‚                  test_loss/std
0.041─           β–β–€β–€β–€β–€β–€β–€β–šβ–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β”‚         β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜0.0000216─   *                         β”‚
     1.0     3.2     5.5     7.8   10.0 0.0000180─  * ***         *            β”‚
test_loss           iter                0.0000108─ *     *       * ***     *   β”‚
               test_loss/mean           0.0000072─*       *     *     *   * *  β”‚
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”0.0000000─*        *****       ***   **β”‚
0.143─·                                β”‚         β””β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”˜
0.126─·                                β”‚         1.0    3.2    5.5    7.8  10.0
     β”‚ Β·                               β”‚test_loss/std         iter
0.109─  Β·                              β”‚                test_loss/max
0.092─  Β·                              β”‚     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β”‚   Β·                             β”‚0.143─+                                β”‚
0.075─    Β·                            β”‚0.126─ ++                              β”‚
0.058─     Β·Β·Β·                         β”‚0.092─   ++                            β”‚
     β”‚        Β·Β·Β·Β·Β·Β·Β·                  β”‚0.075─     +++                         β”‚
0.041─               Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·β”‚0.041─        +++++++++++++++++++++++++β”‚
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0     3.2     5.5     7.8   10.0      1.0     3.2     5.5     7.8   10.0
test_loss/mean      iter                test_loss/max       iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_loss.txt
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
0.143─ ++ test_loss/max                                                        β”‚
     β”‚ -- test_loss/min                                                        β”‚
     β”‚ Β·Β· test_loss/mean                                                       β”‚
     β”‚ β–žβ–ž test_loss                                                            β”‚
0.126─  β–Œ                                                                      β”‚
     β”‚  ▐                                                                      β”‚
     β”‚   β–Œ                                                                     β”‚
     β”‚   ▐                                                                     β”‚
0.109─    β–Œ                                                                    β”‚
     β”‚    ▐                                                                    β”‚
     β”‚     β–Œ                                                                   β”‚
     β”‚     ▐                                                                   β”‚
0.092─      β–Œ                                                                  β”‚
     β”‚      ▐                                                                  β”‚
     β”‚       β–Œ                                                                 β”‚
     β”‚       ▝▖                                                                β”‚
     β”‚        ▝▄                                                               β”‚
0.075─          β–šβ––                                                             β”‚
     β”‚           β–β–š                                                            β”‚
     β”‚             β–€β––                                                          β”‚
     β”‚              β–β–šβ––                                                        β”‚
0.058─                β–β–€β–šβ–„β––                                                    β”‚
     β”‚                    β–β–€β–šβ–„β––                                                β”‚
     β”‚                        ▝▀▀▄▄▖                                           β”‚
     β”‚                             β–β–€β–€β–šβ–„β–„β–„β–„β–„β–„β–„β–„Β·Β·Β·Β·Β·Β·Β·Β·                        β”‚
0.041─                                         β–€β–€β–€β–€β–€β–€β–€β–€β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β–„β”‚
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0               3.2               5.5               7.8             10.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_loss_summary.txt
           test_loss/mean hist                     test_loss/max hist
 β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
6β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚6β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
5β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚5β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
4β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚4β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
3β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚3β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
2β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚2β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚
1β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚1β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚0β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚
 β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜ β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
 0.037   0.064    0.092    0.120  0.148  0.037   0.064    0.092    0.120  0.148
           test_loss/min hist                        test_loss/std hist
 β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
6β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚5.00β”€β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚    β”‚β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
5β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚4.17β”€β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
4β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚3.33β”€β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚    β”‚β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
3β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚2.50β”€β–ˆβ–ˆβ–ˆβ–ˆ                              β”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚    β”‚β–ˆβ–ˆβ–ˆβ–ˆ             β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆ       β”‚
2β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚1.67β”€β–ˆβ–ˆβ–ˆβ–ˆ             β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆ       β”‚
1β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚0.83β”€β–ˆβ–ˆβ–ˆβ–ˆ             β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ”‚
 β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚    β”‚β–ˆβ–ˆβ–ˆβ–ˆ             β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆβ–ˆ                  β–ˆβ–ˆβ–ˆβ–ˆβ”‚0.00β”€β–ˆβ–ˆβ–ˆ              β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆβ”‚
 β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜    β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜
 0.037   0.064    0.092    0.120  0.148   -0.0000010      0.0000108 0.0000167
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/test_loss_hist.txt
                 train_loss                            train_loss/min
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
0.597β”€β–Œ                                β”‚0.597─-                                β”‚
0.509─▐                                β”‚0.421─ --                              β”‚
     β”‚ β–Œ                               β”‚0.244─   --                            β”‚
0.421─ ▐                               β”‚0.068─     ----------------------------β”‚
0.333─  β–Œ                              β”‚     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
0.244─  ▐                              β”‚     1.0     3.2     5.5     7.8   10.0
     β”‚   β–Œ                             β”‚train_loss/min      iter
0.156─   ▝▄▄▄▖                         β”‚                 train_loss/std
0.068─       ▝▀▀▀▀▀▀▀▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄│        β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜0.000173─*                             β”‚
     1.0     3.2     5.5     7.8   10.0 0.000144─*                             β”‚
train_loss          iter                0.000086─ *                            β”‚
               train_loss/mean          0.000058─  *                ********   β”‚
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”0.000000─   ****************        ***β”‚
0.597─·                                β”‚        β””β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”˜
0.509─·                                β”‚        1.0    3.2     5.5    7.8  10.0
     β”‚ Β·                               β”‚train_loss/std        iter
0.421─ Β·                               β”‚               train_loss/max
0.333─  Β·                              β”‚     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
     β”‚  Β·                              β”‚0.597─+                                β”‚
0.244─   Β·                             β”‚0.509─ +                               β”‚
0.156─    Β·                            β”‚0.333─  +                              β”‚
     β”‚     Β·Β·Β·Β·Β·Β·Β·                     β”‚0.244─   ++                            β”‚
0.068─            Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·Β·β”‚0.068─     ++++++++++++++++++++++++++++β”‚
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0     3.2     5.5     7.8   10.0      1.0     3.2     5.5     7.8   10.0
train_loss/mean     iter                train_loss/max      iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/train_loss.txt
     β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
0.597─ ++ train_loss/max                                                       β”‚
     β”‚ -- train_loss/min                                                       β”‚
     β”‚ Β·Β· train_loss/mean                                                      β”‚
     β”‚ β–žβ–ž train_loss                                                           β”‚
0.509─ ▐                                                                       β”‚
     β”‚ ▝▖                                                                      β”‚
     β”‚  β–š                                                                      β”‚
     β”‚  ▐                                                                      β”‚
0.421─   β–Œ                                                                     β”‚
     β”‚   ▐                                                                     β”‚
     β”‚   ▝▖                                                                    β”‚
     β”‚    β–Œ                                                                    β”‚
0.333─    ▐                                                                    β”‚
     β”‚     β–Œ                                                                   β”‚
     β”‚     β–š                                                                   β”‚
     β”‚     ▝▖                                                                  β”‚
     β”‚      β–Œ                                                                  β”‚
0.244─      ▐                                                                  β”‚
     β”‚       β–Œ                                                                 β”‚
     β”‚       β–š                                                                 β”‚
     β”‚       ▝▖                                                                β”‚
0.156─        ▝▀▄▖                                                             β”‚
     β”‚           ▝▀▄▖                                                          β”‚
     β”‚              β–β–€β–šβ–„β–„β–„β––                                                    β”‚
     β”‚                 Β·Β·Β·β–β–€β–€β–€β–šβ–„β–„β–„β–„β–„β–„β–„β––Β·Β·Β·Β·Β·Β·Β·Β·                                β”‚
0.068─                                ▝▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄│
     β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
     1.0               3.2               5.5               7.8             10.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/train_loss_summary.txt
           train_loss/mean hist                     train_loss/max hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
8.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚8.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
6.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚6.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
5.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚5.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
4.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚4.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
2.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚2.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
1.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚1.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                    β–ˆβ–ˆβ–ˆβ–ˆβ”‚   β”‚β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                    β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆ                     β–ˆβ–ˆβ–ˆβ–ˆβ”‚0.0β”€β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆ                     β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜
  0.04     0.19    0.33     0.48   0.62   0.04     0.19    0.33     0.48   0.62
            train_loss/min hist                    train_loss/std hist
   β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
8.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚6β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
6.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚5β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
5.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚4β”€β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆ                                 β”‚
4.0β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚3β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚
2.7β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚2β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                             β”‚
1.3β”€β–ˆβ–ˆβ–ˆβ–ˆ                               β”‚1β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                         β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β”‚β–ˆβ–ˆβ–ˆβ–ˆ   β–ˆβ–ˆβ–ˆβ–ˆ                    β–ˆβ–ˆβ–ˆβ–ˆβ”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                         β–ˆβ–ˆβ–ˆβ–ˆβ”‚
0.0β”€β–ˆβ–ˆβ–ˆ    β–ˆβ–ˆβ–ˆ                     β–ˆβ–ˆβ–ˆβ–ˆβ”‚0β”€β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ                          β–ˆβ–ˆβ–ˆβ–ˆβ”‚
   β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”˜ β””β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
  0.04     0.19    0.33     0.48   0.62  -0.000008      0.000086  0.000133
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/plots/tplot/train_loss_hist.txt
[2025-12-31 12:22:03,182749][W][ezpz/history:2320:save_dataset] Unable to save dataset to W&B, skipping!
[2025-12-31 12:22:03,184704][I][utils/__init__:651:dataset_to_h5pyfile] Saving dataset to: /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/train_dataset.h5
[2025-12-31 12:22:03,196685][I][ezpz/history:2433:finalize] Saving history report to /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz-fsdp/2025-12-31-122159/report.md
[2025-12-31 12:22:03,202017][I][examples/fsdp:360:fsdp_main] dataset=<xarray.Dataset> Size: 2kB
Dimensions:          (draw: 10)
Coordinates:
  * draw             (draw) int64 80B 0 1 2 3 4 5 6 7 8 9
Data variables: (12/25)
    epoch            (draw) int64 80B 1 2 3 4 5 6 7 8 9 10
    dt               (draw) float64 80B 12.49 0.3612 0.3596 ... 0.36 0.3586
    train_loss       (draw) float32 40B 0.5967 0.1744 0.1205 ... 0.0694 0.06835
    test_loss        (draw) float32 40B 0.1435 0.08036 ... 0.0412 0.04194
    test_acc         (draw) float32 40B 95.56 97.51 98.02 ... 98.55 98.57 98.57
    epoch_mean       (draw) float64 80B 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0
    ...               ...
    test_loss_min    (draw) float64 80B 0.1435 0.08036 ... 0.0412 0.04194
    test_loss_std    (draw) float64 80B 0.0 2.158e-05 ... 1.079e-05 0.0
    test_acc_mean    (draw) float64 80B 95.56 97.51 98.02 ... 98.55 98.57 98.57
    test_acc_max     (draw) float64 80B 95.56 97.51 98.02 ... 98.55 98.57 98.57
    test_acc_min     (draw) float64 80B 95.56 97.51 98.02 ... 98.55 98.57 98.57
    test_acc_std     (draw) float64 80B 0.0 0.0 0.0 0.03125 ... 0.0 0.0 0.0 0.0
[2025-12-31 12:22:03,205311][I][examples/fsdp:452:<module>] Took 36.24 seconds
wandb:
wandb: πŸš€ View run vivid-glade-86 at: 
wandb: Find logs at: ../../../../../../lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/wandb/run-20251231_122128-11cqdt05/logs
[2025-12-31 12:22:04,704632][I][ezpz/launch:447:launch] ----[πŸ‹ ezpz.launch][stop][2025-12-31-122204]----
[2025-12-31 12:22:04,705324][I][ezpz/launch:448:launch] Execution finished with 0.
[2025-12-31 12:22:04,705724][I][ezpz/launch:449:launch] Executing finished in 42.32 seconds.
[2025-12-31 12:22:04,706075][I][ezpz/launch:450:launch] Took 42.32 seconds to run. Exiting.