Train MLP with DDP on MNIST
Train a simple fully connected (torch.nn.Linear) network using DDP
on the MNIST dataset.
Key API Functions
setup_torch() — Initialize distributed training
wrap_model() — Wrap model for DDP
cleanup() — Tear down the process group
See: [docs], [source]
Source
src/ezpz/examples/test.py
#!/usr/bin/env python3
"""
examples/test.py
"""
import argparse
import json
import os
import platform
import sys
import time
import warnings
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Iterator, Optional, Sequence
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from xarray import Dataset
import ezpz
import ezpz.distributed
from ezpz.configs import PathLike
from ezpz.cli.flags import build_test_parser
from ezpz.profile import get_profiling_context
START_TIME = time.perf_counter() # start time
# noqa: E402
warnings.filterwarnings("ignore")
ModelOptimizerPair = tuple[torch.nn.Module, torch.optim.Optimizer]
logger = ezpz.get_logger(__name__)
# Named presets selectable via the `--model` CLI flag: each bundles a batch
# size, iteration count, logging cadence, and MLP layer sizes.
MODEL_PRESETS = {
    "debug": {
        "batch_size": 16,
        "train_iters": 20,
        "log_freq": 1,
        "print_freq": 1,
        "layer_sizes": [128, 64],
    },
    "small": {
        "batch_size": 64,
        "train_iters": 200,
        "log_freq": 1,
        "print_freq": 10,
        "layer_sizes": [256, 128, 64],
    },
    "medium": {
        "batch_size": 128,
        "train_iters": 200,
        "log_freq": 1,
        "print_freq": 10,
        "layer_sizes": [512, 256, 128],
    },
    "large": {
        "batch_size": 256,
        "train_iters": 400,
        "log_freq": 1,
        "print_freq": 10,
        "layer_sizes": [1024, 512, 256],
    },
}
# Maps each preset field to the CLI flag spellings that override it; a flag
# passed explicitly on the command line takes precedence over the preset
# value (see `_apply_model_preset`).
MODEL_PRESET_FLAGS = {
    "batch_size": ["--batch-size"],
    "train_iters": ["--train-iters", "--train_iters"],
    "log_freq": ["--log-freq", "--log_freq"],
    "print_freq": ["--print-freq", "--print_freq"],
    "layer_sizes": ["--layer-sizes"],
}
# W&B is optional: if the import fails for any reason, fall back to a
# disabled-logging mode (`wandb is None` is checked before every use).
WANDB_DISABLED = False
try:
    import wandb
except Exception:
    wandb = None
    WANDB_DISABLED = True
# WANDB_DISABLED = os.environ.get("WANDB_DISABLED", False)
# WANDB_MODE = os.environ.get("WANDB_MODE", "").lower()
# if not WANDB_DISABLED and WANDB_MODE != "disabled":
# try:
# wandb = ezpz.lazy.lazy_import("wandb")
# if not ezpz.distributed.verify_wandb():
# logger.warning("W&B API key not found, skipping wandb setup!")
# logger.info(
# "To enable W&B logging, run `wandb login` or set the WANDB_API_KEY"
# )
# except Exception as e:
# wandb = None
# WANDB_DISABLED = True
# logger.exception(e)
# logger.warning("W&B not available, skipping wandb setup!")
# logger.info("Continue without W&B logging...")
# else:
# wandb = None
# WANDB_DISABLED = True
@dataclass
class TrainConfig:
    """Runtime configuration for the `ezpz.examples.test` distributed smoke test."""

    # Required (no-default) fields: parallelism sizes, batch/model geometry,
    # logging cadence, backend/dtype selection, and profiler switches.
    warmup: int
    tp: int
    pp: int
    cp: int
    batch_size: int
    input_size: int
    output_size: int
    train_iters: int
    log_freq: int
    backend: str
    dtype: str
    print_freq: int
    pyinstrument_profiler: bool
    pytorch_profiler: bool
    pytorch_profiler_wait: int
    pytorch_profiler_warmup: int
    pytorch_profiler_active: int
    pytorch_profiler_repeat: int
    profile_memory: bool
    rank_zero_only: bool
    record_shapes: bool
    with_stack: bool
    with_flops: bool
    with_modules: bool
    acc_events: bool
    # Optional fields with defaults.
    layer_sizes: list = field(default_factory=lambda: [512, 256, 128])
    dataset: str = "mnist"
    dataset_root: Optional[PathLike] = None
    num_workers: int = 0
    no_distributed_history: bool = False
    save_datasets: bool = False

    def __post_init__(self):
        """Initialise output paths and configure profiling context managers."""
        # Agree on one timestamp across ranks so every rank writes into the
        # same output directory: rank 0 generates it, then it is broadcast
        # and cached in the environment for child processes.
        self._created_at = os.environ.get("EZPZ_LOG_TIMESTAMP")
        if self._created_at is None:
            self._created_at = (
                ezpz.get_timestamp() if ezpz.get_rank() == 0 else None
            )
        self._created_at = ezpz.distributed.broadcast(self._created_at, root=0)
        if self._created_at is not None:
            os.environ["EZPZ_LOG_TIMESTAMP"] = self._created_at
        self.outdir = Path(os.getcwd()).joinpath(
            "outputs", "ezpz.examples.test", f"{self._created_at}"
        )
        self.outdir.mkdir(parents=True, exist_ok=True)
        # Dataset root defaults to a sibling of the run's output directory.
        dataset_root = (
            Path(self.dataset_root).expanduser()
            if self.dataset_root is not None
            else self.outdir.parent.joinpath("datasets", self.dataset)
        )
        dataset_root.mkdir(parents=True, exist_ok=True)
        self.dataset_root = dataset_root
        # The pytorch profiler takes precedence over pyinstrument when both
        # flags are set.
        profiler_type = "torch" if self.pytorch_profiler else "pyinstrument"
        self.ctx = get_profiling_context(
            profiler_type=profiler_type,
            rank_zero_only=self.rank_zero_only,
            record_shapes=self.record_shapes,
            with_stack=self.with_stack,
            with_flops=self.with_flops,
            with_modules=self.with_modules,
            acc_events=self.acc_events,
            profile_memory=self.profile_memory,
            wait=self.pytorch_profiler_wait,
            warmup=self.pytorch_profiler_warmup,
            active=self.pytorch_profiler_active,
            repeat=self.pytorch_profiler_repeat,
            outdir=self.outdir,
        )
        logger.info(f"Outputs will be saved to {self.outdir}")

    def get_torch_dtype(self) -> torch.dtype:
        """Return the torch dtype requested by this configuration."""
        if self.dtype is None:
            return torch.get_default_dtype()
        if self.dtype in {
            "fp16",
            "half",
            "float16",
        }:
            return torch.float16
        if self.dtype in {
            "bfloat16",
            "bf16",
        }:
            return torch.bfloat16
        if self.dtype in {
            "float32",
            "fp32",
            "float",
            "single",
        }:
            return torch.float32
        # Unknown names degrade gracefully to float32 rather than raising.
        logger.warning(f"Unknown dtype: {self.dtype=}, using float32")
        return torch.float32
@dataclass
class Trainer:
    """Co-ordinate training loops, logging, and profiling for the test model.

    Device/topology attributes are captured once at class-definition time;
    `__post_init__` places the model, builds the `History` tracker, and
    constructs the dataloader.
    """

    config: TrainConfig
    model: torch.nn.Module
    optimizer: torch.optim.Optimizer
    # history: ezpz.History = field(init=False)
    train_iter: int = 0
    rank: int = ezpz.get_rank()
    # device_type: str = ezpz.get_torch_device_type()
    # NOTE: the four names below have no annotations, so they are plain
    # class attributes (evaluated once at class creation), not dataclass
    # fields.
    device_type = os.environ.get("TORCH_DEVICE", ezpz.get_torch_device())
    world_size = ezpz.get_world_size()
    local_rank = ezpz.get_local_rank()
    device_id = f"{device_type}:{local_rank}"
    _train_loader: Optional[DataLoader] = field(init=False, default=None)
    _train_iterator: Optional[Iterator[tuple[torch.Tensor, torch.Tensor]]] = (
        field(init=False, default=None)
    )
    _feature_dim: int = field(init=False, default=0)

    def __post_init__(self):
        """Move the model to the target device and register logging hooks."""
        self.device_id = f"{self.device_type}:{self.local_rank}"
        self.dtype = self.config.get_torch_dtype()
        self.model.to(self.device_id)
        self.model.to(self.dtype)
        metrics_path = self.config.outdir.joinpath("metrics.jsonl")
        self.history: ezpz.history.History = ezpz.history.History(
            report_dir=self.config.outdir,
            report_enabled=True,
            jsonl_path=metrics_path,
            jsonl_overwrite=True,
            # Skip distributed aggregation for very large jobs and while the
            # pytorch profiler is active, to keep overhead down.
            distributed_history=(
                1 < self.world_size <= 384 and not self.config.pytorch_profiler
            ),
        )
        if self.config.tp > 1 or self.config.pp > 1 or self.config.cp > 1:
            # Synchronise every parallel sub-group before training starts.
            ezpz.distributed.barrier(group=ezpz.tp.get_tensor_parallel_group())
            ezpz.distributed.barrier(group=ezpz.tp.get_data_parallel_group())
            ezpz.distributed.barrier(
                group=ezpz.tp.get_pipeline_parallel_group()
            )
            ezpz.distributed.barrier(
                group=ezpz.tp.get_context_parallel_group()
            )
        if self.rank == 0 and not WANDB_DISABLED:
            # Only attach gradient/parameter watching when a live W&B run
            # exists (setup happens elsewhere).
            if (wbrun := getattr(wandb, "run", None)) is not None and callable(
                wbrun.watch
            ):
                wbrun.watch(self.model, log="all")
        if self.world_size > 1:
            logger.debug("Hit torch.distributed.barrier()")
            ezpz.distributed.barrier()
        self._train_loader, self._train_iterator = self._build_dataloader()
        self._feature_dim = self.config.input_size

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def _build_dataloader(
        self,
    ) -> tuple[DataLoader, Iterator[tuple[torch.Tensor, torch.Tensor]]]:
        """Construct a training dataloader for the requested dataset.

        Raises:
            RuntimeError: when torchvision is missing.
            ValueError: for any dataset other than "mnist".
        """
        dataset_name = self.config.dataset.lower()
        if dataset_name == "mnist":
            try:
                from ezpz.data.vision import get_mnist
            except (
                ModuleNotFoundError
            ) as exc:  # pragma: no cover - optional dep
                msg = (
                    "torchvision is required to use the MNIST dataset. "
                    "Install it via `pip install torchvision`."
                )
                raise RuntimeError(msg) from exc
            assert self.config.dataset_root is not None
            dset_root = Path(self.config.dataset_root).expanduser().resolve()
            dset_root.mkdir(parents=True, exist_ok=True)
            bundle = get_mnist(
                train_batch_size=self.config.batch_size,
                test_batch_size=self.config.batch_size,
                outdir=dset_root,
                num_workers=self.config.num_workers,
                # Only rank 0 downloads; other ranks reuse the shared copy.
                download=self.rank == 0,
                pin_memory=str(self.device_type).startswith(("cuda", "mps")),
            )
            train_loader = bundle["train"]["loader"]
            return train_loader, iter(train_loader)
        raise ValueError(f"Unknown dataset: {dataset_name!r}")

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def _next_batch(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the next batch from the training dataloader."""
        assert self._train_loader is not None
        assert self._train_iterator is not None
        try:
            return next(self._train_iterator)
        except StopIteration:
            # Epoch exhausted: restart the iterator and continue.
            self._train_iterator = iter(self._train_loader)
            return next(self._train_iterator)

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def _prepare_inputs(self, inputs: torch.Tensor) -> torch.Tensor:
        """Move inputs to the configured device and coerce feature dimensions."""
        inputs = inputs.to(self.device_id)
        inputs = inputs.reshape(inputs.size(0), -1)
        # Zero-pad or truncate so flattened features match the model input.
        if inputs.size(1) < self._feature_dim:
            pad = self._feature_dim - inputs.size(1)
            inputs = torch.nn.functional.pad(inputs, (0, pad))
        elif inputs.size(1) > self._feature_dim:
            inputs = inputs[:, : self._feature_dim]
        return inputs.to(self.dtype)

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def _forward_step(self) -> tuple[dict, torch.Tensor]:
        """Execute a forward pass returning metrics and the loss tensor."""
        t0 = time.perf_counter()
        batch_inputs, targets = self._next_batch()
        inputs = self._prepare_inputs(batch_inputs)
        targets = targets.to(self.device_id)
        logits = self.model(inputs)
        loss = calc_loss(logits, targets)
        accuracy = (logits.argmax(dim=1) == targets).float().mean()
        # Synchronise so the forward timing ("dtf") is accurate on async
        # devices.
        ezpz.distributed.synchronize()
        metrics = {
            "loss": loss.detach(),
            "accuracy": accuracy.detach(),
            "dtf": time.perf_counter() - t0,
        }
        return metrics, loss

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def _backward_step(self, loss: torch.Tensor) -> float:
        """Perform the backwards/optimiser step and return elapsed seconds."""
        t0 = time.perf_counter()
        if self.config.backend == "deepspeed":
            # DeepSpeed engines own the backward/step sequence; `step()`
            # takes no loss argument (fixed: was `self.model.step(loss)`).
            self.model.backward(loss)
            self.model.step()
        else:
            loss.backward()
            self.optimizer.step()
        ezpz.distributed.synchronize()
        return time.perf_counter() - t0

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def train_step(self) -> dict:
        """Run one optimiser step, emitting periodic logs/metrics."""
        self.train_iter += 1
        metrics, loss = self._forward_step()
        metrics["dtb"] = self._backward_step(loss)
        self.optimizer.zero_grad()
        # Skip history/log updates on the very last iteration.
        if self.train_iter == self.config.train_iters:
            return metrics
        if (
            self.train_iter % self.config.log_freq == 0
            or self.train_iter % self.config.print_freq == 0
        ):
            summary = self.history.update({"iter": self.train_iter, **metrics})
            if self.train_iter % self.config.print_freq == 0:
                logger.info(f"{summary}")
        return metrics

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def finalize(
        self, outdir: Optional[str | Path | os.PathLike] = None
    ) -> Dataset:
        """Flush profilers and return the aggregated training dataset."""
        import ambivalent
        import matplotlib.pyplot as plt

        plt.style.use(ambivalent.STYLES["ambivalent"])
        outdir = Path(outdir) if outdir is not None else self.config.outdir
        env_info = self._gather_environment_snapshot()
        dataset = self.history.finalize(
            run_name="ezpz.examples.test",
            dataset_fname="train",
            warmup=self.config.warmup,
            save=False,  # XXX: don't bother saving test data
            plot=(self.rank == 0),
            outdir=outdir,
            env_info=env_info,
        )
        logger.info(f"{dataset=}")
        if wandb is not None and ezpz.verify_wandb() and not WANDB_DISABLED:
            try:
                wandb.log(
                    {
                        "train_metrics": wandb.Table(
                            dataframe=dataset.to_dataframe()
                        )
                    }
                )
            except Exception:
                logger.warning("Failed to log final dataset to wandb")
        return dataset

    @ezpz.timeitlogit(rank=ezpz.get_rank())
    def train(self, profiler: Optional[torch.profiler.profile] = None) -> None:
        """Loop over all training iterations, stepping the profiler if given."""
        for step in range(self.config.train_iters):
            if step == self.config.warmup:
                logger.info(f"Warmup complete at step {step}")
            _ = self.train_step()
            if profiler is not None:
                profiler.step()

    def _gather_environment_snapshot(self) -> dict[str, dict[str, str]]:
        """Collect key runtime environment details for reporting."""
        python_details = {
            "Version": (
                f"{sys.version_info.major}."
                f"{sys.version_info.minor}."
                f"{sys.version_info.micro}"
            ),
            "Implementation": sys.implementation.name,
            "Executable": sys.executable,
        }
        torch_details = {
            "Version": torch.__version__,
            "Device": str(self.device_id),
            "Backend": (
                ezpz.distributed.get_torch_backend()
                if hasattr(ezpz.distributed, "get_torch_backend")
                else "unknown"
            ),
        }
        host_name = (
            platform.uname().node
            if hasattr(platform, "uname")
            else os.environ.get("HOSTNAME", "unknown")
        )
        path_details = {
            "Working directory": str(Path.cwd()),
            "Output directory": str(self.config.outdir),
            "Dataset root": str(self.config.dataset_root),
            "Hostname": host_name,
        }
        dist_details = {
            "Rank": str(self.rank),
            "Local rank": str(self.local_rank),
            "World size": str(self.world_size),
        }
        # Only include launcher env vars that are actually set.
        env_vars: dict[str, str] = {}
        for key in (
            "MASTER_ADDR",
            "MASTER_PORT",
            "NODE_RANK",
            "LOCAL_RANK",
            "RANK",
            "WORLD_SIZE",
        ):
            value = os.environ.get(key)
            if value is not None:
                env_vars[key] = value
        snapshot: dict[str, dict[str, str]] = {
            "Paths": path_details,
            "Python": python_details,
            "Torch": torch_details,
            "Distributed": dist_details,
        }
        if env_vars:
            snapshot["Environment Variables"] = env_vars
        return snapshot
@ezpz.timeitlogit(rank=ezpz.get_rank())
def train(
    config: TrainConfig, profiler: Optional[torch.profiler.profile] = None
) -> Trainer:
    """Instantiate the model/optimiser and run the training loop.

    Args:
        config: Fully-populated run configuration.
        profiler: Optional torch profiler, advanced once per training step.

    Returns:
        The :class:`Trainer` after training (and rank-0 finalisation) completes.
    """
    from ezpz.models.minimal import SequentialLinearNet
    from ezpz.utils import model_summary

    timings = {}
    t0m = time.perf_counter()
    model = SequentialLinearNet(
        input_dim=config.input_size,
        output_dim=config.output_size,
        sizes=config.layer_sizes,
    )
    logger.info(
        f"Model size: {sum(p.numel() for p in model.parameters())} parameters"
    )
    try:
        logger.info(f"\n{model_summary(model)}")
    except Exception as e:
        # model_summary is best-effort; fall back to the plain repr.
        logger.warning(
            f"Failed to summarize model: {e}, using default summary"
        )
        logger.info(model)
    t1m = time.perf_counter()
    dt_model = t1m - t0m
    logger.info(f"Took: {dt_model} seconds to build model")
    model, optimizer = build_model_and_optimizer(model, dtype=config.dtype)
    t2m = time.perf_counter()
    dt_optimizer = time.perf_counter() - t1m
    logger.info(f"Took: {dt_optimizer:.2f} seconds to build optimizer")
    trainer = Trainer(config=config, model=model, optimizer=optimizer)
    t1tr = time.perf_counter()
    logger.info(
        f"Took: {(dt_trainer := t1tr - t2m):.2f} seconds to build trainer"
    )
    jstr = json.dumps(asdict(config), indent=2, sort_keys=True, default=str)
    logger.info(f"config:\n{jstr}")
    t1s = time.perf_counter()
    logger.info(
        f"Took: {(dt_train_start := t1s - START_TIME):.2f} to get here."
    )
    # -------------------------------------------
    # Main training loop
    t0t = time.perf_counter()
    _ = trainer.train(profiler=profiler)
    t1t = time.perf_counter()
    logger.info(
        f"Took: {(dt_train_duration := t1t - t0t):.2f} seconds to finish training"
    )
    rank = ezpz.get_rank()
    history: ezpz.history.History = trainer.history
    dataset = None
    # Record timings and return trainer
    if rank == 0:
        # Only rank 0 aggregates/plots the metrics and reports to W&B.
        dataset = history.finalize(
            run_name="ezpz.examples.test",
            dataset_fname="train",
            warmup=config.warmup,
            save=(rank == 0 and config.save_datasets),
            plot=(rank == 0),
            outdir=config.outdir,
            env_info=trainer._gather_environment_snapshot(),
        )
        logger.info(f"{dataset=}")
        timings = {
            "timings/model": dt_model,
            "timings/optimizer": dt_optimizer,
            "timings/trainer": dt_trainer,
            "timings/training_start": dt_train_start,
            "timings/train_duration": dt_train_duration,
            "timings/end-to-end": time.perf_counter() - START_TIME,
        }
        if wandb is not None and ezpz.verify_wandb() and not WANDB_DISABLED:
            try:
                wandb.log(
                    {
                        (f"timings/{k}" if not k.startswith("timings/") else k): v
                        for k, v in timings.items()
                    },
                    commit=False,
                )
            except Exception as e:
                logger.exception(e)
                logger.warning("Unable to 'wandb.log(timings)', skipping!")
            if dataset is not None:
                try:
                    wandb.log(
                        {
                            "train_metrics": wandb.Table(
                                dataframe=dataset.to_dataframe()
                            )
                        }
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.warning("Failed to log final dataset to wandb")
    if ezpz.get_world_size() > 1:
        ezpz.barrier()
    return trainer
def _arg_provided(argv: Sequence[str], flags: Sequence[str]) -> bool:
return any(flag in argv for flag in flags)
def _apply_model_preset(args: argparse.Namespace, argv: Sequence[str]) -> None:
    """Copy preset defaults onto *args*, unless the flag was given explicitly."""
    preset_name = args.model
    if preset_name is None:
        return
    preset = MODEL_PRESETS.get(preset_name)
    if preset is None:
        return
    for name, preset_value in preset.items():
        known_flags = MODEL_PRESET_FLAGS.get(name, [])
        if _arg_provided(argv, known_flags):
            continue  # explicit CLI flag wins over the preset
        setattr(args, name, preset_value)
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse CLI arguments for `ezpz.examples.test`."""
    cli_args = sys.argv[1:] if argv is None else list(argv)
    parser = build_test_parser()
    parsed = parser.parse_args(cli_args)
    # Preset values fill in any field the user did not pass explicitly.
    _apply_model_preset(parsed, cli_args)
    if parsed.backend.lower() in {"ds", "deepspeed"}:
        try:
            import deepspeed  # type:ignore # noqa: F401
        except (ImportError, ModuleNotFoundError) as e:
            logger.error(
                "Deepspeed not available. "
                "Install via `python3 -m pip install deepspeed`"
            )
            raise e
        else:
            parsed.deepspeed = True
    return parsed
def get_config_from_args(args: argparse.Namespace) -> TrainConfig:
    """Translate CLI arguments into a :class:`TrainConfig`."""
    # Every TrainConfig field shares its name with the parsed argument, so
    # the mapping is a straight attribute copy.
    field_names = (
        "acc_events",
        "batch_size",
        "profile_memory",
        "record_shapes",
        "with_stack",
        "with_flops",
        "with_modules",
        "rank_zero_only",
        "backend",
        "dtype",
        "log_freq",
        "print_freq",
        "tp",
        "pp",
        "cp",
        "input_size",
        "output_size",
        "train_iters",
        "layer_sizes",
        "dataset",
        "dataset_root",
        "num_workers",
        "pyinstrument_profiler",
        "pytorch_profiler",
        "pytorch_profiler_wait",
        "pytorch_profiler_warmup",
        "pytorch_profiler_active",
        "pytorch_profiler_repeat",
        "warmup",
    )
    kwargs = {name: getattr(args, name) for name in field_names}
    return TrainConfig(**kwargs)
@ezpz.timeitlogit(rank=ezpz.get_rank())
def calc_loss(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Return the cross entropy loss for the classification dataset.

    Logits are upcast to float32 before the loss so reduced-precision
    (fp16/bf16) runs stay numerically stable.
    """
    logits_fp32 = logits.float()
    return torch.nn.functional.cross_entropy(logits_fp32, targets)
@ezpz.timeitlogit(rank=ezpz.get_rank())
def build_model_and_optimizer(
    model: torch.nn.Module,
    dtype: Optional[str] = None,
) -> ModelOptimizerPair:
    """Prepare the model and optimiser for the requested backend.

    Args:
        model: The un-wrapped module to place on the local device.
        dtype: Optional dtype name forwarded to ``ezpz.distributed.wrap_model``.

    Returns:
        The (possibly DDP-wrapped) model and an Adam optimiser over its
        parameters.
    """
    # TORCH_DEVICE overrides autodetection (useful for forcing CPU runs).
    device_override = os.environ.get("TORCH_DEVICE")
    device_type = device_override or ezpz.get_torch_device()
    if isinstance(device_type, str) and device_type.startswith("mps"):
        logger.warning(
            "MPS does not support torch.distributed collectives; falling back to CPU"
        )
        device_type = "cpu"
    world_size = ezpz.get_world_size()
    local_rank = ezpz.get_local_rank()
    if isinstance(device_type, str) and device_type in {"cuda", "xpu"}:
        device_type = f"{device_type}:{local_rank}"
    model.to(device_type)
    if isinstance(device_type, str) and device_type.startswith("cuda"):
        model.to(local_rank)
    logger.info(f"model=\n{model}")
    optimizer = torch.optim.Adam(model.parameters())
    if world_size > 1:
        model.to(device_type)
        # NOTE(review): `wrap_model` is documented elsewhere as "wrap model
        # for DDP", so the explicit DDP(...) below may double-wrap the
        # model — confirm against ezpz.distributed.wrap_model.
        model = ezpz.distributed.wrap_model(
            model=model, use_fsdp=False, dtype=dtype
        )
        try:
            if isinstance(device_type, str) and device_type.startswith("cuda"):
                model = DDP(model, device_ids=[local_rank])
            else:
                model = DDP(model)
        except Exception:
            # Best-effort fallback: retry without device_ids.
            model = DDP(model)
    # elif backend.lower() in ("ds", "deepspeed"):
    #     parser = argparse.ArgumentParser(
    #         prog="deepspeed", description="My training script."
    #     )
    #     parser.add_argument(
    #         "--local_rank",
    #         required=False,
    #         type=int,
    #         default=-1,
    #         help="local rank passed from distributed launcher",
    #     )
    #     # parser.add_argument(
    #     #     '--deepspeed',
    #     #     action='store_true',
    #     #     default=True,
    #     #     help='Use deepspeed',
    #     # )
    #     # parser.add_argument(
    #     #     '--deepspeed_config',
    #     #     type=str,
    #     #     default='deepspeed_config.json',
    #     #     help='Deepspeed config file',
    #     # )
    #     try:
    #         import deepspeed  # type:ignore
    #     except (ImportError, ModuleNotFoundError) as e:
    #         logger.error(
    #             "Deepspeed not available. "
    #             "Install via `python3 -m pip install deepspeed`"
    #         )
    #         raise e
    #
    #     # Include DeepSpeed configuration arguments
    #     parser = deepspeed.add_config_arguments(parser)
    #     cmd_args = parser.parse_args()
    #     model, optimizer, *_ = deepspeed.initialize(
    #         args=cmd_args,
    #         model=model,
    #         optimizer=optimizer,
    #     )
    #     logger.info(f"{cmd_args=}")
    return model, optimizer
@ezpz.timeitlogit(rank=ezpz.get_rank())
def main() -> Trainer:
    """Entry point used by ``python -m ezpz.examples.test``.

    Parses CLI args, initialises torch.distributed (and W&B on rank 0),
    runs training under the configured profiling context, and logs timings.
    """
    t0 = time.perf_counter()
    args = parse_args()
    config = get_config_from_args(args)
    timings = {}
    _ = ezpz.distributed.setup_torch(
        tensor_parallel_size=config.tp,
        pipeline_parallel_size=config.pp,
        context_parallel_size=config.cp,
    )
    t_setup = time.perf_counter()
    logger.info(f"Took: {(t_setup - t0):.2f} seconds to setup torch")
    # Initialise wandb early so console capture covers the full run.
    if ezpz.get_rank() == 0 and not WANDB_DISABLED:
        wbconfig = {}
        wbconfig |= asdict(config)
        wbconfig |= ezpz.get_dist_info()
        console = os.environ.get("WANDB_CONSOLE", "auto").lower()
        assert console in {
            "auto",
            "wrap",
            "redirect",
            "wrap_raw",
            "wrap_emu",
            "off",
        }, f"Invalid WANDB_CONSOLE value: {console}"
        logger.info(f"W&B console logging set to: {console}")
        settings = wandb.Settings(console=console)
        _ = ezpz.distributed.setup_wandb(
            project_name="ezpz.examples.test",
            config=wbconfig,
            settings=settings,
        )
    # Run training under the profiling context built in TrainConfig.
    with config.ctx as c:
        trainer = train(config, profiler=c)
    t_train = time.perf_counter()
    if trainer.config.backend.lower() in ["ds", "deepspeed"]:
        try:
            import deepspeed.comm

            deepspeed.comm.log_summary()
        except ImportError as e:
            logger.exception(e)
            logger.exception(
                "Deepspeed not available. "
                "Install via `python3 -m pip install deepspeed`"
            )
            logger.info("Continuing without deepspeed summary...")
    logger.info(f"Took: {time.perf_counter() - START_TIME:.2f} seconds")
    t1 = time.perf_counter()
    timings = {
        "main/setup_torch": (t_setup - t0),
        "main/train": (t_train - t_setup),
        "main/total": (t1 - t0),
    }
    # NOTE(review): `getattr(wandb, "run")` has no default, so it raises if
    # the module ever lacks a `run` attribute — consider
    # `getattr(wandb, "run", None)`; confirm intended behavior.
    if wandb is not None and (run := getattr(wandb, "run")) is not None:
        try:
            wandb.log(
                {
                    (f"timings/{k}" if not k.startswith("timings/") else k): v
                    for k, v in timings.items()
                }
            )
        except Exception:
            logger.warning("Failed to log timings to wandb")
        logger.info(f"wandb.run=[{run.name}]({run.url})")
    return trainer
if __name__ == "__main__":
    # `sys` is already imported at the top of the module; the previous
    # redundant local `import sys` has been removed.
    trainer = main()
    ezpz.cleanup()
    sys.exit(0)
Code Walkthrough
Imports and Constants
DDP is imported directly because this example uses vanilla data-parallel
wrapping (no FSDP). build_test_parser from ezpz.cli.flags provides a
shared argument parser so CLI flags like --batch-size, --train-iters, and
--log-freq are consistent across all built-in examples.
get_profiling_context lets the example optionally run under PyTorch
Profiler or pyinstrument without changing the training loop.
#!/usr/bin/env python3
"""
examples/test.py
"""
import argparse
import json
import os
import platform
import sys
import time
import warnings
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Iterator, Optional, Sequence
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from xarray import Dataset
import ezpz
import ezpz.distributed
from ezpz.configs import PathLike
from ezpz.cli.flags import build_test_parser
from ezpz.profile import get_profiling_context
START_TIME = time.perf_counter() # start time
# noqa: E402
warnings.filterwarnings("ignore")
ModelOptimizerPair = tuple[torch.nn.Module, torch.optim.Optimizer]
logger = ezpz.get_logger(__name__)
START_TIME is captured at import time so end-to-end wall-clock timings
can be reported later.
MODEL_PRESETS
Named presets that bundle batch size, iteration count, logging frequency,
and layer sizes into a single --model flag.
MODEL_PRESETS = {
"debug": {
"batch_size": 16,
"train_iters": 20,
"log_freq": 1,
"print_freq": 1,
"layer_sizes": [128, 64],
},
"small": {
"batch_size": 64,
"train_iters": 200,
"log_freq": 1,
"print_freq": 10,
"layer_sizes": [256, 128, 64],
},
"medium": {
"batch_size": 128,
"train_iters": 200,
"log_freq": 1,
"print_freq": 10,
"layer_sizes": [512, 256, 128],
},
"large": {
"batch_size": 256,
"train_iters": 400,
"log_freq": 1,
"print_freq": 10,
"layer_sizes": [1024, 512, 256],
},
}
MODEL_PRESET_FLAGS = {
"batch_size": ["--batch-size"],
"train_iters": ["--train-iters", "--train_iters"],
"log_freq": ["--log-freq", "--log_freq"],
"print_freq": ["--print-freq", "--print_freq"],
"layer_sizes": ["--layer-sizes"],
}
MODEL_PRESET_FLAGS maps each preset field to its CLI flags so that
explicitly provided flags take precedence over the preset defaults.
TrainConfig Dataclass
All runtime configuration lives in a single dataclass. __post_init__
creates the output directory, broadcasts the timestamp across ranks, and
builds the profiling context manager.
@dataclass
class TrainConfig:
"""Runtime configuration for the `ezpz.examples.test` distributed smoke test."""
warmup: int
tp: int
pp: int
cp: int
batch_size: int
input_size: int
output_size: int
train_iters: int
log_freq: int
backend: str
dtype: str
print_freq: int
pyinstrument_profiler: bool
pytorch_profiler: bool
pytorch_profiler_wait: int
pytorch_profiler_warmup: int
pytorch_profiler_active: int
pytorch_profiler_repeat: int
profile_memory: bool
rank_zero_only: bool
record_shapes: bool
with_stack: bool
with_flops: bool
with_modules: bool
acc_events: bool
layer_sizes: list = field(default_factory=lambda: [512, 256, 128])
dataset: str = "mnist"
dataset_root: Optional[PathLike] = None
num_workers: int = 0
no_distributed_history: bool = False
save_datasets: bool = False
def __post_init__(self):
"""Initialise output paths and configure profiling context managers."""
self._created_at = os.environ.get("EZPZ_LOG_TIMESTAMP")
if self._created_at is None:
self._created_at = (
ezpz.get_timestamp() if ezpz.get_rank() == 0 else None
)
self._created_at = ezpz.distributed.broadcast(self._created_at, root=0)
if self._created_at is not None:
os.environ["EZPZ_LOG_TIMESTAMP"] = self._created_at
self.outdir = Path(os.getcwd()).joinpath(
"outputs", "ezpz.examples.test", f"{self._created_at}"
)
self.outdir.mkdir(parents=True, exist_ok=True)
dataset_root = (
Path(self.dataset_root).expanduser()
if self.dataset_root is not None
else self.outdir.parent.joinpath("datasets", self.dataset)
)
dataset_root.mkdir(parents=True, exist_ok=True)
self.dataset_root = dataset_root
profiler_type = "torch" if self.pytorch_profiler else "pyinstrument"
self.ctx = get_profiling_context(
profiler_type=profiler_type,
rank_zero_only=self.rank_zero_only,
record_shapes=self.record_shapes,
with_stack=self.with_stack,
with_flops=self.with_flops,
with_modules=self.with_modules,
acc_events=self.acc_events,
profile_memory=self.profile_memory,
wait=self.pytorch_profiler_wait,
warmup=self.pytorch_profiler_warmup,
active=self.pytorch_profiler_active,
repeat=self.pytorch_profiler_repeat,
outdir=self.outdir,
)
logger.info(f"Outputs will be saved to {self.outdir}")
get_torch_dtype() converts the string dtype field into the
corresponding torch.dtype:
def get_torch_dtype(self) -> torch.dtype:
"""Return the torch dtype requested by this configuration."""
if self.dtype is None:
return torch.get_default_dtype()
if self.dtype in {
"fp16",
"half",
"float16",
}:
return torch.float16
if self.dtype in {
"bfloat16",
"bf16",
}:
return torch.bfloat16
if self.dtype in {
"float32",
"fp32",
"float",
"single",
}:
return torch.float32
logger.warning(f"Unknown dtype: {self.dtype=}, using float32")
return torch.float32
Trainer Class
Coordinates the training loop, metric tracking, and profiling.
__init__ and __post_init__
Class-level attributes capture distributed topology at definition time.
__post_init__ moves the model to the device, initializes the
History tracker, optionally sets up W&B, and builds the dataloader.
@dataclass
class Trainer:
"""Co-ordinate training loops, logging, and profiling for the test model."""
config: TrainConfig
model: torch.nn.Module
optimizer: torch.optim.Optimizer
# history: ezpz.History = field(init=False)
train_iter: int = 0
rank: int = ezpz.get_rank()
# device_type: str = ezpz.get_torch_device_type()
device_type = os.environ.get("TORCH_DEVICE", ezpz.get_torch_device())
world_size = ezpz.get_world_size()
local_rank = ezpz.get_local_rank()
device_id = f"{device_type}:{local_rank}"
_train_loader: Optional[DataLoader] = field(init=False, default=None)
_train_iterator: Optional[Iterator[tuple[torch.Tensor, torch.Tensor]]] = (
field(init=False, default=None)
)
_feature_dim: int = field(init=False, default=0)
def __post_init__(self):
"""Move the model to the target device and register logging hooks."""
self.device_id = f"{self.device_type}:{self.local_rank}"
self.dtype = self.config.get_torch_dtype()
self.model.to(self.device_id)
self.model.to(self.dtype)
metrics_path = self.config.outdir.joinpath("metrics.jsonl")
self.history: ezpz.history.History = ezpz.history.History(
report_dir=self.config.outdir,
report_enabled=True,
jsonl_path=metrics_path,
jsonl_overwrite=True,
distributed_history=(
1 < self.world_size <= 384 and not self.config.pytorch_profiler
),
)
if self.config.tp > 1 or self.config.pp > 1 or self.config.cp > 1:
ezpz.distributed.barrier(group=ezpz.tp.get_tensor_parallel_group())
ezpz.distributed.barrier(group=ezpz.tp.get_data_parallel_group())
ezpz.distributed.barrier(group=ezpz.tp.get_pipeline_parallel_group())
ezpz.distributed.barrier(group=ezpz.tp.get_context_parallel_group())
if self.rank == 0 and not WANDB_DISABLED:
logger.debug("Setting up wandb")
wbconfig = {}
wbconfig |= asdict(self.config)
wbconfig |= ezpz.get_dist_info()
_ = ezpz.setup_wandb(
project_name="ezpz.examples.test",
config=wbconfig,
)
if (wbrun := getattr(wandb, "run", None)) is not None and callable(
wbrun.watch
):
wbrun.watch(self.model, log="all")
if self.world_size > 1:
logger.debug("Hit torch.distributed.barrier()")
ezpz.distributed.barrier()
self._train_loader, self._train_iterator = self._build_dataloader()
self._feature_dim = self.config.input_size
_build_dataloader
Constructs the training DataLoader. Currently supports MNIST via
ezpz.data.vision.get_mnist; only rank 0 downloads the data.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def _build_dataloader(
self,
) -> tuple[DataLoader, Iterator[tuple[torch.Tensor, torch.Tensor]]]:
"""Construct a training dataloader for the requested dataset."""
dataset_name = self.config.dataset.lower()
if dataset_name == "mnist":
try:
from ezpz.data.vision import get_mnist
except (
ModuleNotFoundError
) as exc: # pragma: no cover - optional dep
msg = (
"torchvision is required to use the MNIST dataset. "
"Install it via `pip install torchvision`."
)
raise RuntimeError(msg) from exc
assert self.config.dataset_root is not None
dset_root = Path(self.config.dataset_root).expanduser().resolve()
dset_root.mkdir(parents=True, exist_ok=True)
bundle = get_mnist(
train_batch_size=self.config.batch_size,
test_batch_size=self.config.batch_size,
outdir=dset_root,
num_workers=self.config.num_workers,
download=self.rank == 0,
pin_memory=str(self.device_type).startswith(("cuda", "mps")),
)
train_loader = bundle["train"]["loader"]
return train_loader, iter(train_loader)
raise ValueError(f"Unknown dataset: {dataset_name!r}")
_forward_step
Fetches a batch, reshapes/pads inputs to match the model's expected feature dimension, runs the forward pass, and computes cross-entropy loss and accuracy. Returns both a metrics dict and the raw loss tensor.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def _forward_step(self) -> tuple[dict, torch.Tensor]:
"""Execute a forward pass returning metrics and the loss tensor."""
t0 = time.perf_counter()
batch_inputs, targets = self._next_batch()
inputs = self._prepare_inputs(batch_inputs)
targets = targets.to(self.device_id)
logits = self.model(inputs)
loss = calc_loss(logits, targets)
accuracy = (logits.argmax(dim=1) == targets).float().mean()
ezpz.distributed.synchronize()
metrics = {
"loss": loss.detach(),
"accuracy": accuracy.detach(),
"dtf": time.perf_counter() - t0,
}
return metrics, loss
_backward_stepโ๏ธ
Runs backpropagation and the optimizer step. Branches on whether the
backend is DeepSpeed (which manages its own .backward() / .step())
or standard PyTorch.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def _backward_step(self, loss: torch.Tensor) -> float:
"""Perform the backwards/optimiser step and return elapsed seconds."""
t0 = time.perf_counter()
if self.config.backend == "deepspeed":
self.model.backward(loss)
self.model.step(loss)
else:
loss.backward()
self.optimizer.step()
ezpz.distributed.synchronize()
return time.perf_counter() - t0
train_stepโ๏ธ
One full optimizer step: forward, backward, zero-grad. Logs metrics at
log_freq intervals and prints summaries at print_freq intervals.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def train_step(self) -> dict:
"""Run one optimiser step, emitting periodic logs/metrics."""
self.train_iter += 1
metrics, loss = self._forward_step()
metrics["dtb"] = self._backward_step(loss)
self.optimizer.zero_grad()
if self.train_iter == self.config.train_iters:
return metrics
if (
self.train_iter % self.config.log_freq == 0
or self.train_iter % self.config.print_freq == 0
):
summary = self.history.update({"iter": self.train_iter, **metrics})
if self.train_iter % self.config.print_freq == 0:
logger.info(f"{summary}")
return metrics
trainโ๏ธ
The outer training loop. Iterates for config.train_iters steps and
optionally advances the PyTorch profiler each step.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def train(self, profiler: Optional[torch.profiler.profile] = None) -> None:
"""Loop over all training iterations and return the final dataset."""
for step in range(self.config.train_iters):
if step == self.config.warmup:
logger.info(f"Warmup complete at step {step}")
_ = self.train_step()
if profiler is not None:
profiler.step()
finalizeโ๏ธ
Flushes the History tracker, generates plots (rank 0 only), and
optionally logs the final dataset to W&B.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def finalize(
self, outdir: Optional[str | Path | os.PathLike] = None
) -> Dataset:
"""Flush profilers and return the aggregated training dataset."""
import ambivalent
import matplotlib.pyplot as plt
plt.style.use(ambivalent.STYLES["ambivalent"])
outdir = Path(outdir) if outdir is not None else self.config.outdir
env_info = self._gather_environment_snapshot()
dataset = self.history.finalize(
run_name="ezpz.examples.test",
dataset_fname="train",
warmup=self.config.warmup,
save=False, # XXX: don't bother saving test data
plot=(self.rank == 0),
outdir=outdir,
env_info=env_info,
)
logger.info(f"{dataset=}")
if wandb is not None and ezpz.verify_wandb() and not WANDB_DISABLED:
try:
wandb.log(
{
"train_metrics": wandb.Table(
dataframe=dataset.to_dataframe()
)
}
)
except Exception:
logger.warning("Failed to log final dataset to wandb")
return dataset
calc_loss
Standalone cross-entropy loss function. Inputs are cast to float32 before
the loss is computed, avoiding numerical issues with reduced-precision dtypes.
build_model_and_optimizer
Moves the model to the appropriate device, creates an Adam optimizer,
and wraps the model in DDP when running multi-rank. Falls back to CPU
on MPS since MPS does not support torch.distributed collectives.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def build_model_and_optimizer(
    model: torch.nn.Module,
    dtype: Optional[str] = None,
) -> ModelOptimizerPair:
    """Prepare the model and optimiser for the requested backend.

    Moves ``model`` to the detected (or ``TORCH_DEVICE``-overridden) device,
    builds an Adam optimizer over its parameters, and -- when running with
    more than one rank -- wraps the model for distributed training.

    Args:
        model: The (unwrapped) network to prepare.
        dtype: Optional dtype string forwarded to ``wrap_model``.

    Returns:
        Tuple of (possibly DDP-wrapped model, Adam optimizer).
    """
    device_override = os.environ.get("TORCH_DEVICE")
    device_type = device_override or ezpz.get_torch_device()
    # MPS cannot participate in torch.distributed collectives, so fall
    # back to CPU for the distributed path.
    if isinstance(device_type, str) and device_type.startswith("mps"):
        logger.warning(
            "MPS does not support torch.distributed collectives; falling back to CPU"
        )
        device_type = "cpu"
    world_size = ezpz.get_world_size()
    local_rank = ezpz.get_local_rank()
    # Pin accelerator devices to this rank's local device index.
    if isinstance(device_type, str) and device_type in {"cuda", "xpu"}:
        device_type = f"{device_type}:{local_rank}"
    # Single device move; the previous revision moved the model up to three
    # times (including a redundant `.to(local_rank)` after the indexed move).
    model.to(device_type)
    logger.info(f"model=\n{model}")
    # DDP shares the underlying parameter storage, so building the
    # optimizer before wrapping remains valid afterwards.
    optimizer = torch.optim.Adam(model.parameters())
    if world_size > 1:
        # wrap_model already returns a DDP-wrapped module ("Wrapping model
        # with: ddp"); the previous revision wrapped the result in DDP a
        # second time, which registers duplicate gradient-reduction hooks.
        model = ezpz.distributed.wrap_model(
            model=model, use_fsdp=False, dtype=dtype
        )
    return model, optimizer
train (module-level)
Orchestrates the full training run: builds the SequentialLinearNet
model, wraps it with build_model_and_optimizer, constructs a Trainer,
runs the training loop, finalizes history, and logs timings.
@ezpz.timeitlogit(rank=ezpz.get_rank())
def train(
    config: TrainConfig, profiler: Optional[torch.profiler.profile] = None
) -> Trainer:
    """Instantiate the model/optimiser and run the training loop.

    Builds a ``SequentialLinearNet`` from ``config``, prepares it with
    ``build_model_and_optimizer``, drives ``Trainer.train``, and -- on
    rank 0 only -- finalizes the metric history and logs per-phase
    wall-clock timings (optionally to Weights & Biases).

    Args:
        config: Fully-resolved training configuration.
        profiler: Optional active ``torch.profiler.profile`` to advance.

    Returns:
        The constructed ``Trainer`` after the loop completes.
    """
    from ezpz.models.minimal import SequentialLinearNet
    from ezpz.utils import model_summary
    timings = {}
    t0m = time.perf_counter()
    model = SequentialLinearNet(
        input_dim=config.input_size,
        output_dim=config.output_size,
        sizes=config.layer_sizes,
    )
    logger.info(
        f"Model size: {sum(p.numel() for p in model.parameters())} parameters"
    )
    # Best-effort rich summary; fall back to the plain module repr on failure.
    try:
        logger.info(f"\n{model_summary(model)}")
    except Exception as e:
        logger.warning(
            f"Failed to summarize model: {e}, using default summary"
        )
        logger.info(model)
    t1m = time.perf_counter()
    dt_model = t1m - t0m
    logger.info(f"Took: {dt_model} seconds to build model")
    model, optimizer = build_model_and_optimizer(
        model, dtype=config.dtype
    )
    t2m = time.perf_counter()
    # NOTE(review): measured as (now - t1m) rather than (t2m - t1m); the
    # difference is microseconds but t2m would be the cleaner endpoint.
    dt_optimizer = time.perf_counter() - t1m
    logger.info(f"Took: {dt_optimizer:.2f} seconds to build optimizer")
    trainer = Trainer(config=config, model=model, optimizer=optimizer)
    t1tr = time.perf_counter()
    # dt_trainer is walrus-bound here and reused in the timings dict below.
    logger.info(
        f"Took: {(dt_trainer := t1tr - t2m):.2f} seconds to build trainer"
    )
    jstr = json.dumps(asdict(config), indent=2, sort_keys=True, default=str)
    logger.info(f"config:\n{jstr}")
    t1s = time.perf_counter()
    # Time from process start (module-level START_TIME) to loop entry.
    logger.info(
        f"Took: {(dt_train_start := t1s - START_TIME):.2f} to get here."
    )
    # -------------------------------------------
    # Main training loop
    t0t = time.perf_counter()
    _ = trainer.train(profiler=profiler)
    t1t = time.perf_counter()
    logger.info(
        f"Took: {(dt_train_duration := t1t - t0t):.2f} seconds to finish training"
    )
    rank = ezpz.get_rank()
    history: ezpz.history.History = trainer.history
    dataset = None
    # Record timings and return trainer
    if rank == 0:
        # Only rank 0 aggregates, optionally saves, and plots the history.
        dataset = history.finalize(
            run_name="ezpz.examples.test",
            dataset_fname="train",
            warmup=config.warmup,
            save=(rank == 0 and config.save_datasets),
            plot=(rank == 0),
            outdir=config.outdir,
            env_info=trainer._gather_environment_snapshot(),
        )
        logger.info(f"{dataset=}")
        timings = {
            "timings/model": dt_model,
            "timings/optimizer": dt_optimizer,
            "timings/trainer": dt_trainer,
            "timings/training_start": dt_train_start,
            "timings/train_duration": dt_train_duration,
            "timings/end-to-end": time.perf_counter() - START_TIME,
        }
        if wandb is not None and ezpz.verify_wandb() and not WANDB_DISABLED:
            try:
                wandb.log(
                    {
                        (f"timings/{k}" if not k.startswith("timings/") else k): v
                        for k, v in timings.items()
                    },
                    commit=False,
                )
            except Exception as e:
                logger.exception(e)
                logger.warning("Unable to 'wandb.log(timings)', skipping!")
        # NOTE(review): this branch does not re-check `wandb is not None`;
        # if wandb is None the AttributeError is swallowed by the except.
        if dataset is not None:
            try:
                wandb.log(
                    {
                        "train_metrics": wandb.Table(
                            dataframe=dataset.to_dataframe()
                        )
                    }
                )
            except Exception as e:
                logger.exception(e)
                logger.warning("Failed to log final dataset to wandb")
    # Keep ranks in sync before returning from the run.
    if ezpz.get_world_size() > 1:
        ezpz.barrier()
    return trainer
main
Entry point for python -m ezpz.examples.test. Parses CLI args, calls
setup_torch (with optional tensor/pipeline/context parallelism),
enters the profiling context, and kicks off train().
@ezpz.timeitlogit(rank=ezpz.get_rank())
def main() -> Trainer:
    """Entry point used by ``python -m ezpz.examples.test``.

    Parses CLI args, initializes distributed training (with optional
    tensor / pipeline / context parallelism), runs :func:`train` inside
    the configured profiling context, and logs per-phase timings.

    Returns:
        The ``Trainer`` produced by :func:`train`.
    """
    t0 = time.perf_counter()
    args = parse_args()
    config = get_config_from_args(args)
    timings = {}
    _ = ezpz.distributed.setup_torch(
        tensor_parallel_size=config.tp,
        pipeline_parallel_size=config.pp,
        context_parallel_size=config.cp,
    )
    t_setup = time.perf_counter()
    logger.info(f"Took: {(t_setup - t0):.2f} seconds to setup torch")
    with config.ctx as c:
        trainer = train(config, profiler=c)
    t_train = time.perf_counter()
    if trainer.config.backend.lower() in ["ds", "deepspeed"]:
        # Best-effort: summarize DeepSpeed comms if the package is present.
        try:
            import deepspeed.comm
            deepspeed.comm.log_summary()
        except ImportError as e:
            logger.exception(e)
            logger.exception(
                "Deepspeed not available. "
                "Install via `python3 -m pip install deepspeed`"
            )
            logger.info("Continuing without deepspeed summary...")
    logger.info(f"Took: {time.perf_counter() - START_TIME:.2f} seconds")
    t1 = time.perf_counter()
    timings = {
        "main/setup_torch": (t_setup - t0),
        "main/train": (t_train - t_setup),
        "main/total": (t1 - t0),
    }
    # Fix: pass a default to getattr so a stub/partial wandb module cannot
    # raise AttributeError here (matches the getattr usage elsewhere in
    # this file, e.g. the Trainer setup path).
    if wandb is not None and (run := getattr(wandb, "run", None)) is not None:
        try:
            wandb.log(
                {
                    (f"timings/{k}" if not k.startswith("timings/") else k): v
                    for k, v in timings.items()
                }
            )
        except Exception:
            logger.warning("Failed to log timings to wandb")
        logger.info(f"wandb.run=[{run.name}]({run.url})")
    return trainer
__main__ guard
Calls main(), tears down the distributed process group, and exits.
Helpโ๏ธ
--help
$ python3 -m ezpz.examples.test --help
usage: test.py [-h] [--warmup WARMUP] [--tp TP] [--pp PP] [--deepspeed_config DEEPSPEED_CONFIG] [--cp CP] [--backend BACKEND]
[--pyinstrument-profiler] [-p] [--rank-zero-only] [--pytorch-profiler-wait PYTORCH_PROFILER_WAIT]
[--pytorch-profiler-warmup PYTORCH_PROFILER_WARMUP] [--pytorch-profiler-active PYTORCH_PROFILER_ACTIVE]
[--pytorch-profiler-repeat PYTORCH_PROFILER_REPEAT] [--profile-memory] [--record-shapes] [--with-stack]
[--with-flops] [--with-modules] [--acc-events] [--train-iters TRAIN_ITERS] [--log-freq LOG_FREQ]
[--print-freq PRINT_FREQ] [--batch-size BATCH_SIZE] [--input-size INPUT_SIZE] [--output-size OUTPUT_SIZE]
[--layer-sizes LAYER_SIZES] [--dtype DTYPE] [--dataset DATASET] [--dataset-root DATASET_ROOT]
[--num-workers NUM_WORKERS] [--no-distributed-history]
ezpz test: A simple PyTorch distributed smoke test Trains a simple MLP on MNIST dataset using DDP. NOTE: `ezpz test` is a lightweight
wrapper around: `ezpz launch python3 -m ezpz.examples.test`
options:
-h, --help show this help message and exit
--warmup WARMUP Warmup iterations
--tp TP Tensor parallel size
--pp PP Pipeline length
--deepspeed_config DEEPSPEED_CONFIG
Deepspeed config file
--cp CP Context parallel size
--backend BACKEND Backend (DDP, DeepSpeed, etc.)
--pyinstrument-profiler
Profile the training loop
-p, --profile Use PyTorch profiler
--rank-zero-only Run profiler only on rank 0
--pytorch-profiler-wait PYTORCH_PROFILER_WAIT
Wait time before starting the PyTorch profiler
--pytorch-profiler-warmup PYTORCH_PROFILER_WARMUP
Warmup iterations for the PyTorch profiler
--pytorch-profiler-active PYTORCH_PROFILER_ACTIVE
Active iterations for the PyTorch profiler
--pytorch-profiler-repeat PYTORCH_PROFILER_REPEAT
Repeat iterations for the PyTorch profiler
--profile-memory Profile memory usage
--record-shapes Record shapes in the profiler
--with-stack Include stack traces in the profiler
--with-flops Include FLOPs in the profiler
--with-modules Include module information in the profiler
--acc-events Accumulate events in the profiler
--train-iters TRAIN_ITERS, --train_iters TRAIN_ITERS
Number of training iterations
--log-freq LOG_FREQ, --log_freq LOG_FREQ
Logging frequency
--print-freq PRINT_FREQ, --print_freq PRINT_FREQ
Printing frequency
--batch-size BATCH_SIZE
Batch size
--input-size INPUT_SIZE
Input size
--output-size OUTPUT_SIZE
Output size
--layer-sizes LAYER_SIZES
Comma-separated list of layer sizes
--dtype DTYPE Data type (fp16, float16, bfloat16, bf16, float32, etc.)
--dataset DATASET Dataset to use for training (e.g., mnist).
--dataset-root DATASET_ROOT
Directory to cache dataset downloads.
--num-workers NUM_WORKERS
Number of dataloader workers to use.
--no-distributed-history
Disable distributed history aggregation
Outputโ๏ธ
Output
$ ezpz test
[2025-12-31 12:42:16,253799][I][ezpz/examples/test:132:__post_init__] Outputs will be saved to /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216
[2025-12-31 12:42:16,255418][I][ezpz/dist:1501:setup_torch_distributed] Using device=mps with backend=gloo
[2025-12-31 12:42:16,269406][I][ezpz/dist:1366:setup_torch_DDP] Caught MASTER_PORT=64609 from environment!
[2025-12-31 12:42:16,269926][I][ezpz/dist:1382:setup_torch_DDP] Using torch.distributed.init_process_group with
- master_addr='Sams-MacBook-Pro-2.local'
- master_port='64609'
- world_size=2
- rank=0
- local_rank=0
- timeout=datetime.timedelta(seconds=3600)
- backend='gloo'
[2025-12-31 12:42:16,270684][I][ezpz/dist:1014:init_process_group] Calling torch.distributed.init_process_group_with: rank=0 world_size=2 backend=gloo
[2025-12-31 12:42:16,357662][I][ezpz/dist:1727:setup_torch] Using device='mps' with backend='gloo' + 'gloo' for distributed training.
[2025-12-31 12:42:16,384815][I][ezpz/dist:1774:setup_torch] ['Sams-MacBook-Pro-2.local'][device='mps'][node=0/0][rank=1/1][local_rank=1/1]
[2025-12-31 12:42:16,424260][W][ezpz/dist:544:print_dist_setup] Using [2 / 2] available "mps" devices !!
[2025-12-31 12:42:16,424719][I][ezpz/dist:1774:setup_torch] ['Sams-MacBook-Pro-2.local'][device='mps'][node=0/0][rank=0/1][local_rank=0/1]
[2025-12-31 12:42:16,425119][I][ezpz/examples/test:678:main] Took: 0.18 seconds to setup torch
[2025-12-31 12:42:16,434549][I][ezpz/examples/test:461:train] Model size: 567434 parameters
[2025-12-31 12:42:16,435638][I][ezpz/examples/test:465:train]
=================================================================
Layer (type:depth-idx) Param #
=================================================================
SequentialLinearNet --
โโSequential: 1-1 567,434
=================================================================
Total params: 567,434
Trainable params: 567,434
Non-trainable params: 0
=================================================================
[2025-12-31 12:42:16,436420][I][ezpz/examples/test:473:train] Took: 0.009592041955329478 seconds to build model
[2025-12-31 12:42:16,436687][W][ezpz/examples/test:590:build_model_and_optimizer] MPS does not support torch.distributed collectives; falling back to CPU
[2025-12-31 12:42:16,437061][I][ezpz/examples/test:601:build_model_and_optimizer] model=
SequentialLinearNet(
(layers): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=128, bias=True)
(5): ReLU()
(6): Linear(in_features=128, out_features=10, bias=True)
)
)
[2025-12-31 12:42:17,072997][I][ezpz/dist:685:wrap_model] Wrapping model with: ddp
[2025-12-31 12:42:17,087695][I][ezpz/examples/test:479:train] Took: 0.65 seconds to build optimizer
[2025-12-31 12:42:17,330686][I][ezpz/history:220:__init__] Using History with distributed_history=True
[2025-12-31 12:42:17,420667][I][ezpz/dist:2039:setup_wandb] Setting up wandb from rank=0
[2025-12-31 12:42:17,421071][I][ezpz/dist:2040:setup_wandb] Using WB_PROJECT=ezpz.examples.test
wandb: Currently logged in as: foremans (aurora_gpt) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run de0ra7dh
wandb: Tracking run with wandb version 0.23.1
wandb: Run data is saved locally in /Users/samforeman/vibes/saforem2/ezpz/wandb/run-20251231_124217-de0ra7dh
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run glad-fire-6862
wandb: View project at https://wandb.ai/aurora_gpt/ezpz.examples.test
wandb: View run at https://wandb.ai/aurora_gpt/ezpz.examples.test/runs/de0ra7dh
[2025-12-31 12:42:19,114502][I][ezpz/dist:2069:setup_wandb] wandb.run=[glad-fire-6862](https://wandb.ai/aurora_gpt/ezpz.examples.test/runs/de0ra7dh)
[2025-12-31 12:42:19,195945][I][ezpz/dist:2112:setup_wandb] Running on machine='localhost'
[2025-12-31 12:42:19,575441][I][ezpz/examples/test:482:train] Took: 2.49 seconds to build trainer
[2025-12-31 12:42:19,576180][I][ezpz/examples/test:486:train] config:
{
"acc_events": false,
"backend": "DDP",
"batch_size": 128,
"cp": 1,
"dataset": "mnist",
"dataset_root": "/Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples/test/datasets/mnist",
"dtype": "bf16",
"input_size": 784,
"layer_sizes": [
512,
256,
128
],
"log_freq": 1,
"no_distributed_history": false,
"num_workers": 0,
"output_size": 10,
"pp": 1,
"print_freq": 10,
"profile_memory": true,
"pyinstrument_profiler": false,
"pytorch_profiler": false,
"pytorch_profiler_active": 3,
"pytorch_profiler_repeat": 5,
"pytorch_profiler_wait": 1,
"pytorch_profiler_warmup": 2,
"rank_zero_only": false,
"record_shapes": true,
"tp": 1,
"train_iters": 200,
"warmup": 5,
"with_flops": true,
"with_modules": true,
"with_stack": true
}
[2025-12-31 12:42:19,577931][I][ezpz/examples/test:488:train] Took: 4.05 to get here.
[2025-12-31 12:42:20,031182][I][ezpz/examples/test:369:train] Warmup complete at step 5
[2025-12-31 12:42:20,084940][I][ezpz/examples/test:325:train_step] iter=10 loss=1.156702 accuracy=0.617188 dtf=0.006740 dtb=0.002168 loss/mean=1.162131 loss/max=1.167561 loss/min=1.156702 loss/std=0.005437 accuracy/mean=0.679688 accuracy/max=0.742188 accuracy/min=0.617188 accuracy/std=0.062500 dtf/mean=0.006928 dtf/max=0.007116 dtf/min=0.006740 dtf/std=0.000188 dtb/mean=0.002139 dtb/max=0.002168 dtb/min=0.002110 dtb/std=0.000029
[2025-12-31 12:42:20,239358][I][ezpz/examples/test:325:train_step] iter=20 loss=0.689231 accuracy=0.773438 dtf=0.007689 dtb=0.004232 loss/mean=0.729648 loss/max=0.770065 loss/min=0.689231 loss/std=0.040417 accuracy/mean=0.777344 accuracy/max=0.781250 accuracy/min=0.773438 accuracy/std=0.003906 dtf/mean=0.007831 dtf/max=0.007973 dtf/min=0.007689 dtf/std=0.000142 dtb/mean=0.004175 dtb/max=0.004232 dtb/min=0.004118 dtb/std=0.000057
[2025-12-31 12:42:20,390147][I][ezpz/examples/test:325:train_step] iter=30 loss=0.530459 accuracy=0.851562 dtf=0.006860 dtb=0.004432 loss/mean=0.479719 loss/max=0.530459 loss/min=0.428980 loss/std=0.050740 accuracy/mean=0.867188 accuracy/max=0.882812 accuracy/min=0.851562 accuracy/std=0.015625 dtf/mean=0.006832 dtf/max=0.006860 dtf/min=0.006804 dtf/std=0.000028 dtb/mean=0.004483 dtb/max=0.004534 dtb/min=0.004432 dtb/std=0.000051
[2025-12-31 12:42:20,619267][I][ezpz/examples/test:325:train_step] iter=40 loss=0.340644 accuracy=0.898438 dtf=0.010063 dtb=0.001984 loss/mean=0.357486 loss/max=0.374328 loss/min=0.340644 loss/std=0.016842 accuracy/mean=0.886719 accuracy/max=0.898438 accuracy/min=0.875000 accuracy/std=0.011719 dtf/mean=0.010164 dtf/max=0.010265 dtf/min=0.010063 dtf/std=0.000101 dtb/mean=0.001927 dtb/max=0.001984 dtb/min=0.001870 dtb/std=0.000057
[2025-12-31 12:42:20,774938][I][ezpz/examples/test:325:train_step] iter=50 loss=0.361376 accuracy=0.882812 dtf=0.008326 dtb=0.005981 loss/mean=0.337681 loss/max=0.361376 loss/min=0.313987 loss/std=0.023694 accuracy/mean=0.890625 accuracy/max=0.898438 accuracy/min=0.882812 accuracy/std=0.007812 dtf/mean=0.008406 dtf/max=0.008487 dtf/min=0.008326 dtf/std=0.000081 dtb/mean=0.006001 dtb/max=0.006022 dtb/min=0.005981 dtb/std=0.000021
[2025-12-31 12:42:20,944339][I][ezpz/examples/test:325:train_step] iter=60 loss=0.377222 accuracy=0.906250 dtf=0.006605 dtb=0.001605 loss/mean=0.328784 loss/max=0.377222 loss/min=0.280345 loss/std=0.048439 accuracy/mean=0.906250 accuracy/max=0.906250 accuracy/min=0.906250 accuracy/std=0.000000 dtf/mean=0.006642 dtf/max=0.006680 dtf/min=0.006605 dtf/std=0.000037 dtb/mean=0.001830 dtb/max=0.002055 dtb/min=0.001605 dtb/std=0.000225
[2025-12-31 12:42:21,088867][I][ezpz/examples/test:325:train_step] iter=70 loss=0.576832 accuracy=0.851562 dtf=0.006629 dtb=0.001574 loss/mean=0.491650 loss/max=0.576832 loss/min=0.406469 loss/std=0.085181 accuracy/mean=0.871094 accuracy/max=0.890625 accuracy/min=0.851562 accuracy/std=0.019531 dtf/mean=0.006417 dtf/max=0.006629 dtf/min=0.006206 dtf/std=0.000212 dtb/mean=0.001499 dtb/max=0.001574 dtb/min=0.001424 dtb/std=0.000075
[2025-12-31 12:42:21,253870][I][ezpz/examples/test:325:train_step] iter=80 loss=0.190064 accuracy=0.945312 dtf=0.010226 dtb=0.002672 loss/mean=0.248800 loss/max=0.307535 loss/min=0.190064 loss/std=0.058736 accuracy/mean=0.929688 accuracy/max=0.945312 accuracy/min=0.914062 accuracy/std=0.015625 dtf/mean=0.010215 dtf/max=0.010226 dtf/min=0.010203 dtf/std=0.000011 dtb/mean=0.004028 dtb/max=0.005383 dtb/min=0.002672 dtb/std=0.001355
[2025-12-31 12:42:21,421837][I][ezpz/examples/test:325:train_step] iter=90 loss=0.347430 accuracy=0.906250 dtf=0.007348 dtb=0.005348 loss/mean=0.338818 loss/max=0.347430 loss/min=0.330205 loss/std=0.008612 accuracy/mean=0.910156 accuracy/max=0.914062 accuracy/min=0.906250 accuracy/std=0.003906 dtf/mean=0.007332 dtf/max=0.007348 dtf/min=0.007316 dtf/std=0.000016 dtb/mean=0.005451 dtb/max=0.005554 dtb/min=0.005348 dtb/std=0.000103
[2025-12-31 12:42:21,583712][I][ezpz/examples/test:325:train_step] iter=100 loss=0.205180 accuracy=0.937500 dtf=0.006650 dtb=0.001697 loss/mean=0.186145 loss/max=0.205180 loss/min=0.167109 loss/std=0.019036 accuracy/mean=0.945312 accuracy/max=0.953125 accuracy/min=0.937500 accuracy/std=0.007812 dtf/mean=0.006642 dtf/max=0.006650 dtf/min=0.006634 dtf/std=0.000008 dtb/mean=0.001716 dtb/max=0.001736 dtb/min=0.001697 dtb/std=0.000019
[2025-12-31 12:42:21,747500][I][ezpz/examples/test:325:train_step] iter=110 loss=0.280337 accuracy=0.890625 dtf=0.007608 dtb=0.001616 loss/mean=0.403753 loss/max=0.527169 loss/min=0.280337 loss/std=0.123416 accuracy/mean=0.871094 accuracy/max=0.890625 accuracy/min=0.851562 accuracy/std=0.019531 dtf/mean=0.007645 dtf/max=0.007683 dtf/min=0.007608 dtf/std=0.000038 dtb/mean=0.001591 dtb/max=0.001616 dtb/min=0.001565 dtb/std=0.000026
[2025-12-31 12:42:21,911124][I][ezpz/examples/test:325:train_step] iter=120 loss=0.193751 accuracy=0.945312 dtf=0.007935 dtb=0.003730 loss/mean=0.222549 loss/max=0.251347 loss/min=0.193751 loss/std=0.028798 accuracy/mean=0.941406 accuracy/max=0.945312 accuracy/min=0.937500 accuracy/std=0.003906 dtf/mean=0.007733 dtf/max=0.007935 dtf/min=0.007531 dtf/std=0.000202 dtb/mean=0.003998 dtb/max=0.004267 dtb/min=0.003730 dtb/std=0.000269
[2025-12-31 12:42:22,062758][I][ezpz/examples/test:325:train_step] iter=130 loss=0.141411 accuracy=0.968750 dtf=0.008744 dtb=0.001594 loss/mean=0.129989 loss/max=0.141411 loss/min=0.118568 loss/std=0.011421 accuracy/mean=0.972656 accuracy/max=0.976562 accuracy/min=0.968750 accuracy/std=0.003906 dtf/mean=0.008772 dtf/max=0.008801 dtf/min=0.008744 dtf/std=0.000028 dtb/mean=0.001590 dtb/max=0.001594 dtb/min=0.001585 dtb/std=0.000005
[2025-12-31 12:42:22,209100][I][ezpz/examples/test:325:train_step] iter=140 loss=0.211549 accuracy=0.921875 dtf=0.010127 dtb=0.001660 loss/mean=0.199538 loss/max=0.211549 loss/min=0.187527 loss/std=0.012011 accuracy/mean=0.929688 accuracy/max=0.937500 accuracy/min=0.921875 accuracy/std=0.007812 dtf/mean=0.010193 dtf/max=0.010259 dtf/min=0.010127 dtf/std=0.000066 dtb/mean=0.001661 dtb/max=0.001661 dtb/min=0.001660 dtb/std=0.000000
[2025-12-31 12:42:22,373980][I][ezpz/examples/test:325:train_step] iter=150 loss=0.388431 accuracy=0.898438 dtf=0.007742 dtb=0.004872 loss/mean=0.371302 loss/max=0.388431 loss/min=0.354173 loss/std=0.017129 accuracy/mean=0.890625 accuracy/max=0.898438 accuracy/min=0.882812 accuracy/std=0.007812 dtf/mean=0.007812 dtf/max=0.007882 dtf/min=0.007742 dtf/std=0.000070 dtb/mean=0.004859 dtb/max=0.004872 dtb/min=0.004847 dtb/std=0.000012
[2025-12-31 12:42:22,532635][I][ezpz/examples/test:325:train_step] iter=160 loss=0.194098 accuracy=0.945312 dtf=0.009664 dtb=0.002261 loss/mean=0.237500 loss/max=0.280903 loss/min=0.194098 loss/std=0.043402 accuracy/mean=0.925781 accuracy/max=0.945312 accuracy/min=0.906250 accuracy/std=0.019531 dtf/mean=0.009647 dtf/max=0.009664 dtf/min=0.009630 dtf/std=0.000017 dtb/mean=0.002264 dtb/max=0.002267 dtb/min=0.002261 dtb/std=0.000003
[2025-12-31 12:42:22,698409][I][ezpz/examples/test:325:train_step] iter=170 loss=0.310664 accuracy=0.859375 dtf=0.008939 dtb=0.001697 loss/mean=0.293060 loss/max=0.310664 loss/min=0.275457 loss/std=0.017604 accuracy/mean=0.886719 accuracy/max=0.914062 accuracy/min=0.859375 accuracy/std=0.027344 dtf/mean=0.008993 dtf/max=0.009047 dtf/min=0.008939 dtf/std=0.000054 dtb/mean=0.001674 dtb/max=0.001697 dtb/min=0.001652 dtb/std=0.000022
[2025-12-31 12:42:22,867578][I][ezpz/examples/test:325:train_step] iter=180 loss=0.144313 accuracy=0.953125 dtf=0.008492 dtb=0.002695 loss/mean=0.154619 loss/max=0.164924 loss/min=0.144313 loss/std=0.010305 accuracy/mean=0.949219 accuracy/max=0.953125 accuracy/min=0.945312 accuracy/std=0.003906 dtf/mean=0.008534 dtf/max=0.008576 dtf/min=0.008492 dtf/std=0.000042 dtb/mean=0.002694 dtb/max=0.002695 dtb/min=0.002692 dtb/std=0.000002
[2025-12-31 12:42:23,009535][I][ezpz/examples/test:325:train_step] iter=190 loss=0.143928 accuracy=0.968750 dtf=0.008190 dtb=0.001721 loss/mean=0.186032 loss/max=0.228136 loss/min=0.143928 loss/std=0.042104 accuracy/mean=0.949219 accuracy/max=0.968750 accuracy/min=0.929688 accuracy/std=0.019531 dtf/mean=0.008230 dtf/max=0.008270 dtf/min=0.008190 dtf/std=0.000040 dtb/mean=0.001679 dtb/max=0.001721 dtb/min=0.001637 dtb/std=0.000042
[2025-12-31 12:42:23,797136][I][ezpz/history:2385:finalize] Saving plots to /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples/test/2025-12-31-124216/plots/mplot (matplotlib) and /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples/test/2025-12-31-124216/plots/tplot (tplot)
accuracy accuracy/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.992โค โ โ โ โโโโ โโ โโโโโโโโโโโโโโโโ0.969โค -----------------------------------------------โ
0.930โค โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโ0.734โค------ - - โ
0.867โค โโโโโโโโ โ โ โโ โ โ โ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.742โค โโโโโ โ โ 1.0 49.2 97.5 145.8 194.0
0.680โคโโโ โ โaccuracy/min iter
0.617โคโโ โ accuracy/std
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.0 49.2 97.5 145.8 194.0 0.066โค *** * * * ** * * โ
accuracy iter 0.022โค*****************************************************โ
accuracy/mean โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 1.0 49.2 97.5 145.8 194.0
0.973โค ยท ยท ยทยทยท ยท ยท ยทยทยทยท ยทยทยทยทยทยทยทยทโaccuracy/std iter
0.915โค ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยท ยทยทยท ยทยทยทยทยทยทโ accuracy/max
0.857โค ยทยทยทยทยทยทยท ยทยทยท ยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.741โค ยทยทยทยทยท โ0.992โค + ++++++++++++++++++++++++++++++++++++++++++++โ
0.683โคยทยทยท โ0.870โค +++++++++++++ +++++ ++ + + + โ
0.625โคยทยท โ0.686โค++ โ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
accuracy/mean iter accuracy/max iter
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/accuracy.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.992โค ++ accuracy/max โ โ +โ โ
โ -- accuracy/min โโ +โ โ โโ +โ +โ+โ+โ +โโ โ โโโโ + โ
โ ยทยท accuracy/mean + โ+ โ โโ โ โ โ +โโโโ+โโโโโ+โโโโ โโโ โโโโยทโโโโโโโโโโโโโโโโโยทโโยทโโโ
0.930โค โโ accuracy โ โโยท โโยทโโโ โโ+ โ โโโโโโโ+โ+ โโโโโโยทโโโโโโโโโโโ+โยทโโโโโโโโโโโโโยทยท--โโ-โโโ -โโโโโโ
โ โ+ โโโโ+โโโโโโโโโยทยทโโโโยทโโ โโโโโโ โโโโโโโโโโโโโโยทยท -โ - โ โโโ โโโ โโยทยทโ-โ -โโ- - โโ ยทโโ โโโโโโ
โ +โยท โยทโโโโโโยทโโ ยทโโโโยทโโ-โโโโโโ โ โ-- โ โโโโโโ -โ ยท โ โโ-โโ
0.867โค +ยทโโโโยทโโโยทโยทยทยท---โโ -ยท โ- โโโ- -- ยท โ โ โ
โ + + โโโโโโ-โยท ยทยทยท--- - - โ - โ
โ + + ++โโโโโโโ -ยท- - โ
0.805โค ++ โยท +โโโยทโโโ --- - โ
โ ++โโโโโโโโยท โ - โ
โ โ+ยทโโโโโยท-โ- โ
0.742โค โยทโโโโ -ยท โ- โ
โ โยทโโโโ - โ- โ
โโโโโโ- โ โ
0.680โคโโโ โ- โ
โโโโ โ
โโโโ โ
0.617โคโโโ โ
โโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/accuracy_summary.txt
accuracy/mean hist accuracy/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
80.0โค โโโโโโ โ85.0โค โโโโโโ โ
66.7โค โโโโโโ โ70.8โค โโโโโโ โ
53.3โค โโโโโโ โ56.7โค โโโโโโโโโโโ โ
40.0โค โโโโโโโโโโโโโโโโโ42.5โค โโโโโโโโโโโ โ
26.7โค โโโโโโโโโโโโโโโโโ28.3โค โโโโโโโโโโโโโโโโโ
13.3โค โโโโโ โโโโโโโโโโโโโโโโโโโโโโโ14.2โค โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.61 0.70 0.80 0.89 0.99 0.61 0.71 0.81 0.91 1.01
accuracy/min hist accuracy/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
77.0โค โโโโโโ โ61.0โคโโโโโ โ
64.2โค โโโโโโ โ50.8โคโโโโโโโโโโโ โ
51.3โค โโโโโโ โ40.7โคโโโโโโโโโโโโโโโโ โ
38.5โค โโโโโโโโโโโ โ30.5โคโโโโโโโโโโโโโโโโ โ
25.7โค โโโโโโโโโโโโโโโโโ20.3โคโโโโโโโโโโโโโโโโ โ
12.8โค โโโโโ โโโโโโโโโโโโโโโโโโโโโโโ10.2โคโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.60 0.70 0.79 0.89 0.98 -0.003 0.015 0.033 0.051 0.069
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/accuracy_hist.txt
dtb dtb/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.00626โค โ โ โ โ โ โ โ0.00606โค ------- -- ---- - - -- --- ------ - --- -โ
0.00547โค โโ โโ โ โ โโ โโโโ โ โ โโ โโ โ โโ0.00297โค--- --------------------------------- -------------โ
0.00469โค โโโ โโโโ โ โโ โโ โโโโ โโ โ โ โโโโโโ โ โโโ โโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโ
0.00311โค โโโ โ โโ โ โโโโโ โโโโโโโโ โโโ โโโ โโ โโโโโ โ โโ 1.0 49.2 97.5 145.8 194.0
0.00233โค โโโโโ โโโโโโโโโโโโโโโโโโโ โโโ โโโ โโโโโโโโโโ โโdtb/min iter
0.00154โคโโโโโโโ โโโโโโโโโโโโโโโโโ โโโโโโโโโโโ โโโโโโโโโโโโโโ dtb/std
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.0 49.2 97.5 145.8 194.0 0.00187โค * * * * ** **** * * โ
dtb iter 0.00062โค***************************************************โ
dtb/mean โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 1.0 49.2 97.5 145.8 194.0
0.00616โค ยท ยท โdtb/std iter
0.00538โค ยทยท ยทยท ยทยท ยทยท ยท ยท ยท ยทยท ยท ยทโ dtb/max
0.00461โค ยทยทยทยทยทยทยทยทยทยท ยทยทยทยท ยทยท ยท ยทยท ยทยทยท ยทยทยทยทยทยท ยท ยทยทยท ยทโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.00305โค ยท ยทยทยทยทยทยทยทยท ยทยทยทยทยท ยทยท ยทยทยทยทยท ยทยทยท ยทยทยท ยทยทยทยทยทยทยทยท ยท ยทโ0.00626โค + + + ++ ++ ++ + + + + + +โ
0.00228โค ยทยท ยทยทยท ยทยทยทยทยทยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยทยทยทยท ยทยท ยทยทยทยทยทยทยทยทยทยท ยทโ0.00469โค +++++++ ++ +++++ ++ +++++ +++ ++++++++++++ + +โ
0.00150โคยทยทยท ยทยทยท ยทยทยทยทยทยทยทยทยทยทยท ยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยทยทยท โ0.00233โค+++ +++ ++++++++++++++++++++++++++++ +++++++++++++โ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
dtb/mean iter dtb/max iter
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtb.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.00626โค ++ dtb/max โ โ
โ -- dtb/min โโ โ โ โ
โ ยทยท dtb/mean โ โโ โโ โ โ โโ โ
0.00545โค โโ dtb โโ โโ โโ + โ โ โโ โโ + โโ
โ โโ โโโ โโ โโโ + โ โโ โ โ โ โโ โโ ++ + ยทโโ
โ โโโโ โโโยท โ โโ โโโ +โโ โโโโ โ โโ โโ โโโ ++ +โ ยทโโ
0.00465โค โโโโ + โโโยท โโ โโ โโโ +โโ โโโโ โโโ โโ + โโ โ โ ++ ยทโโ ยทโโ
โ โโโโ +โ โโยทโยท โโ โโ โโโโโ +โโ โโโโ +โโโ โโ++ โโ โ โ +++ ยทโโ ยทโโ
โ โโ โ+โ โ ยทโโ+ โโ โโ โโโโโ โ โ ยทโโ โโโโ ++โโโ โโ+โ โโโ โ โ โ ++โ โยทโโ ยทโโ
0.00384โค โโ โ+โ โ ยทยทโยท โโ โโ โโโโโ โโ โ ยทโโ โโโโ ++โโโ โโโโ โโโ โโ โ โ +ยทโ + โยทโโ ยท ยทโโ
โ โโ โ+โ โ -ยทโยท โโ โโ โโโโโ โโ โ ยทโโ โโโโ ++โโโโ โ โโ โโโโโโ โโโ +ยทโ โโ โยทโโ โโ ยทโโ
โ โโ โ+โ โ --โยท โโ โโ โโโโโ โโ โ ยทโโ โโโโ +ยทโโยทโ โ โโ โโโโโโ โโโ ยทยทโ โโ โยทโโ โโ ยทโโ
0.00304โค โโ โโโ โ --โยท โโ โโ โโโโโ โโ โ ยทโโ โโโโ โยทโโยทโ โ โโ โโโโ- โโโ ยทยทโ โโ โยทโโ โโ ยทโโ
โ โโ โโโ โ -- โ โโ โโ โโโโโ โโโโโ โโโ โโโโโโยทโโยทโ โ โโโ โโโโ โโโ โยทยทโ โโ โยทโโโ+ โโ ยทโโ
โ โโ โโโโ โ -- โ โโยท โโ โโโโโ โโโโโ +โโโ โโโโโโยทโโยทโ โ โโโ โโโโ โโโ โยทยทโ โโโโโโโโโ+โโ ยทโโ
0.00223โค โ โโ โโ-โ โ - โ+โ โ โโโ โโโโโ โโโโโ โโโโโ โโโโโโโยทโโยทโ +โ โโโโ โโโโ โโโ โโยทโโยทโโโโ โโโ+โโ ยทโโ
โ โโโโโ โโ-โ โ โโโ โ โ-โโโโโโโยทโโโโโโโโ -โ+โโโโโโโโโ--โโโยทโ+ยทโ โโโโยท + โโโ โ โโโโโโยทโโ โโโโ โโโยทโโ ยท โโโ
โโโ โ โ โโ โโโ โโโ โโโ โโยท โโโโโโโ โโ- -โโ โโโโโโ- -โโโโโโโ ยทโโโโโโโโโโ โ โ -โโโ โโโ- โโโโโโโโโโยทโ โ
0.00142โค โ - โ โ โ โ โ โ โโโโ โ โ โ โ -โ โโ โ
โโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtb_summary.txt
dtb/mean hist dtb/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
95.0โคโโโโโ โ88.0โคโโโโโ โ
79.2โคโโโโโ โ73.3โคโโโโโ โ
63.3โคโโโโโ โ58.7โคโโโโโ โ
47.5โคโโโโโ โ44.0โคโโโโโ โ
31.7โคโโโโโโโโโโโ โ29.3โคโโโโโโโโโโโ โ
15.8โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ14.7โคโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.0013 0.0026 0.0038 0.0051 0.0064 0.0013 0.0026 0.0039 0.0052 0.0065
dtb/min hist dtb/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
105.0โคโโโโโ โ166.0โคโโโโโ โ
87.5โคโโโโโ โ138.3โคโโโโโ โ
70.0โคโโโโโ โ110.7โคโโโโโ โ
52.5โคโโโโโ โ 83.0โคโโโโโ โ
35.0โคโโโโโ โ 55.3โคโโโโโ โ
17.5โคโโโโโโโโโโโโโโโโ โโโโโโโโโโ โ 27.7โคโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.0012 0.0025 0.0037 0.0050 0.0063 -0.00008 0.00043 0.00094 0.00144 0.00195
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtb_hist.txt
dtf dtf/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.0122โค โ โ โ โ0.0122โค - -- - --- -- - - - - - - - -- --- ---โ
0.0112โค โโ โโ โ โ โ โ โโโ0.0080โค-------------------- -------------- ----------------โ
0.0101โค โโ โ โ โโ โโโ โโโโ โโโ โ โ โโโโ โโ โโโ โโโโโโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.0080โค โโโโโโโ โโโโโโโโโ โโโโโโโโโโโโโโโโ โโ โโโโโโโโโโโโโโ 1.0 49.2 97.5 145.8 194.0
0.0070โคโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโโโโ โ โโโโโโโdtf/min iter
0.0059โคโโโ โโ โโโโโโโโโ โโโโโโ โ โโโ โโโ โโ โ โโโโโโ dtf/std
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.0 49.2 97.5 145.8 194.0 0.0329โค * โ
dtf iter 0.0110โค****************************************************โ
dtf/mean โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 1.0 49.2 97.5 145.8 194.0
0.0400โค ยท โdtf/std iter
0.0343โค ยท โ dtf/max
0.0287โค ยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.0173โค ยท โ0.073โค + โ
0.0116โค ยท ยท ยทยท ยท ยทยทยท ยทยท ยท ยทยทยท ยท ยท ยท ยท ยทยท ยทยท ยท ยทยทยทโ0.051โค ++ โ
0.0059โคยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ0.017โค+++++++++++++++++++++++++++++++++++++++++++++++++++++โ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
dtf/mean iter dtf/max iter
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtf.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.073โค ++ dtf/max + โ
โ -- dtf/min + โ
โ ยทยท dtf/mean + โ
0.062โค โโ dtf + โ
โ + โ
โ + โ
0.051โค + โ
โ + โ
โ + โ
0.039โค ยท โ
โ ยท โ
โ ยท โ
0.028โค ยท โ
โ ยท โ
โ ยท โ
0.017โค ยท โ
โ ยท ยท ยท + โ
โ ยทโยทยทโยทโ โยทโ ยทยท โโยทยทโ โ ยท โ โโ ยทยทยท โโโโ โโยท+ โโโโ โยทโโโโโ + โ โ โโยท โโโ + ยท ยทยทยทโโยทโ ยทโโโโยทโโ ยทโ โ ยท โยทโโโ
0.006โคโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโ โโโโโโโโ โโโโโโโโ โโโโโโโ โ โโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโ โโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtf_summary.txt
dtf/mean hist dtf/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
157.0โคโโโโโ โ193.0โคโโโโโ โ
130.8โคโโโโโ โ160.8โคโโโโโ โ
104.7โคโโโโโ โ128.7โคโโโโโ โ
78.5โคโโโโโ โ 96.5โคโโโโโ โ
52.3โคโโโโโ โ 64.3โคโโโโโ โ
26.2โคโโโโโโโโโโโ โ 32.2โคโโโโโ โ
0.0โคโโโโโโโโโโโ โโโโโโ 0.0โคโโโโโ โโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.004 0.014 0.023 0.032 0.042 0.003 0.021 0.039 0.058 0.076
dtf/min hist dtf/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
48โคโโโโโ โ193.0โคโโโโโ โ
40โคโโโโโ โโโโโ โ160.8โคโโโโโ โ
32โคโโโโโ โโโโโ โ128.7โคโโโโโ โ
24โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโ โ 96.5โคโโโโโ โ
16โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ 64.3โคโโโโโ โ
8โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ 32.2โคโโโโโ โ
0โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโ 0.0โคโโโโโ โโโโโโ
โโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.0056 0.0074 0.0091 0.0108 0.0125 -0.0015 0.0075 0.0164 0.0254 0.0343
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/dtf_hist.txt
loss loss/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.69โคโ โ1.69โค--- โ
1.42โคโ โ0.61โค -----------------------------------------------------โ
1.15โค โโ โ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
0.61โค โโโโโ โ 1.0 49.2 97.5 145.8 194.0
0.34โค โโโโโโโโโโโโโ โโโโโโโโโโโโ โโ โโ โ โ โ โ โloss/min iter
0.07โค โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ loss/std
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.0 49.2 97.5 145.8 194.0 0.146โค ** * * * ** ** * * * * ** * * * โ
loss iter 0.049โค*****************************************************โ
loss/mean โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 1.0 49.2 97.5 145.8 194.0
1.72โคยท โloss/std iter
1.45โค ยท โ loss/max
1.18โค ยทยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.65โค ยทยทยทยทยท โ1.74โค++ โ
0.38โค ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยท ยท ยท ยท ยท ยท ยท โ1.20โค +++++++++ ++ + + โ
0.11โค ยทยท ยท ยทยท ยทยทยทยทยท ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ0.39โค ++++++++++++++++++++++++++++++++++++++++++++++++โ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
loss/mean iter loss/max iter
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/loss.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.74โค ++ loss/max โ
โ -- loss/min โ
โ ยทยท loss/mean โ
1.46โค โโ loss โ
โ โ โ
โ โ โ
1.18โค โโ โ
โ โโ โ
โ โโ+ โ
0.91โค -โโ+ โ
โ -โโโ + โโ โ
โ โโ โ +โโ โ
0.63โค โโโโโโโโ+ + โ
โ -โ-ยทโโโโโโโโโ โ+ ++ โ + โ
โ - -โโ โยทโโโโโโ โโยทยทยทยทยท ยท โ +โ +โ โ โ + โ +ยท โโ โ
0.35โค โโ โโโโโโโโโโโโโโโโยทโโ โโโโโโโยทโ+โโ+ยทโโ โโ โโโ โโ + โ โ โ+ + โ +โ โโ โ
โ โโ โ โโโ- โโโโโโ โโ โโโโโโโโโโโโโโโโโโโโโโยท ยทยทยท+โโยทโ+โโโโโ+โโโโยทยทยทโโโโโโ++โยทโโ โโโ +ยทโโยท โ
โ โโ โ -- โโโ โโ โโ-โโโโโโโโโโโโโโโ-โ โโ โโโโโโโโโ โโโโโโโโโโโยทโโโโโโโ
0.07โค โ โโ โ โ โ โ โโโโ โ
โโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/loss_summary.txt
loss/mean hist loss/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
95.0โคโโโโโ โ84โคโโโโโ โ
79.2โคโโโโโ โ70โคโโโโโ โโโโโ โ
63.3โคโโโโโโโโโโโ โ56โคโโโโโ โโโโโ โ
47.5โคโโโโโโโโโโโ โ42โคโโโโโ โโโโโ โ
31.7โคโโโโโโโโโโโ โ28โคโโโโโ โโโโโ โ
15.8โคโโโโโโโโโโโโโโโโโโโโโโ โ14โคโโโโโ โโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโ 0โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโ
โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโ
0.04 0.48 0.91 1.35 1.79 0.05 0.49 0.93 1.37 1.81
loss/min hist loss/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
84โคโโโโโ โ51.0โคโโโโโ โ
70โคโโโโโ โโโโโ โ42.5โคโโโโโโโโโโโ โ
56โคโโโโโ โโโโโ โ34.0โคโโโโโโโโโโโโโโโโโโโโโโ โ
42โคโโโโโ โโโโโ โ25.5โคโโโโโโโโโโโโโโโโโโโโโโ โ
28โคโโโโโ โโโโโ โ17.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
14โคโโโโโ โโโโโโโโโโโโโโโโ โ 8.5โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
0โคโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโ โโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโ โโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโโโโโโโโโโโโโฌโ
-0.00 0.44 0.88 1.32 1.76 -0.006 0.034 0.073 0.113 0.153
text saved in /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/plots/tplot/loss_hist.txt
[2025-12-31 12:42:27,454023][I][ezpz/history:2433:finalize] Saving history report to /Users/samforeman/vibes/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-124216/report.md
[2025-12-31 12:42:27,457202][I][ezpz/examples/test:348:finalize] dataset=<xarray.Dataset> Size: 39kB
Dimensions: (draw: 194)
Coordinates:
* draw (draw) int64 2kB 0 1 2 3 4 5 6 ... 188 189 190 191 192 193
Data variables: (12/25)
iter (draw) int64 2kB 6 7 8 9 10 11 12 ... 194 195 196 197 198 199
loss (draw) float32 776B 1.689 1.526 1.385 ... 0.2018 0.1583
accuracy (draw) float32 776B 0.7031 0.625 0.625 ... 0.8906 0.9453
dtf (draw) float64 2kB 0.006963 0.006409 ... 0.007307 0.01037
dtb (draw) float64 2kB 0.001635 0.001716 ... 0.002136 0.005472
iter_mean (draw) float64 2kB 6.0 7.0 8.0 9.0 ... 197.0 198.0 199.0
... ...
dtf_min (draw) float64 2kB 0.006873 0.006409 ... 0.007263 0.01029
dtf_std (draw) float64 2kB 4.481e-05 0.0001108 ... 4.349e-05
dtb_mean (draw) float64 2kB 0.001713 0.001719 ... 0.002266 0.005477
dtb_max (draw) float64 2kB 0.001791 0.001722 ... 0.002395 0.005482
dtb_min (draw) float64 2kB 0.001635 0.001716 ... 0.002136 0.005472
dtb_std (draw) float64 2kB 7.823e-05 3.127e-06 ... 5.046e-06
[2025-12-31 12:42:28,064428][I][ezpz/examples/test:500:train] Took: 8.49 seconds to finish training
[2025-12-31 12:42:28,065364][I][ezpz/examples/test:695:main] Took: 12.54 seconds
wandb:
wandb: ๐ View run glad-fire-6862 at:
wandb: Find logs at: wandb/run-20251231_124217-de0ra7dh/logs
[2025-12-23-162222] Execution time: 19s sec
Output on Sunspot
#[aurora_frameworks-2025.2.0](ezpz-aurora_frameworks-2025.2.0)
#[/t/d/f/p/s/ezpz][dev][2s]
#[12/31/25 @ 11:30:29][x1921c0s7b0n0]
; ezpz test
[2025-12-31 11:30:42,775692][I][ezpz/launch:396:launch] ----[๐ ezpz.launch][started][2025-12-31-113042]----
[2025-12-31 11:30:43,644334][I][ezpz/launch:416:launch] Job ID: 12458339
[2025-12-31 11:30:43,645146][I][ezpz/launch:417:launch] nodelist: ['x1921c0s3b0n0', 'x1921c0s7b0n0']
[2025-12-31 11:30:43,645554][I][ezpz/launch:418:launch] hostfile: /var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov
[2025-12-31 11:30:43,646228][I][ezpz/pbs:264:get_pbs_launch_cmd] โ
Using [24/24] GPUs [2 hosts] x [12 GPU/host]
[2025-12-31 11:30:43,647063][I][ezpz/launch:367:build_executable] Building command to execute by piecing together:
[2025-12-31 11:30:43,647473][I][ezpz/launch:368:build_executable] (1.) launch_cmd: mpiexec --envall --np=24 --ppn=12 --hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov --no-vni --cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96
[2025-12-31 11:30:43,648172][I][ezpz/launch:369:build_executable] (2.) cmd_to_launch: /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/venvs/sunspot/ezpz-aurora_frameworks-2025.2.0/bin/python3 -m ezpz.examples.test
[2025-12-31 11:30:43,648999][I][ezpz/launch:433:launch] Took: 0.87 seconds to build command.
[2025-12-31 11:30:43,649375][I][ezpz/launch:436:launch] Executing:
mpiexec
--envall
--np=24
--ppn=12
--hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov
--no-vni
--cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96
/lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/venvs/sunspot/ezpz-aurora_frameworks-2025.2.0/bin/python3
-m
ezpz.examples.test
[2025-12-31 11:30:43,650731][I][ezpz/launch:443:launch] Execution started @ 2025-12-31-113043...
[2025-12-31 11:30:43,651220][I][ezpz/launch:139:run_command] Running command:
mpiexec --envall --np=24 --ppn=12 --hostfile=/var/spool/pbs/aux/12458339.sunspot-pbs-0001.head.cm.sunspot.alcf.anl.gov --no-vni --cpu-bind=verbose,list:2-4:10-12:18-20:26-28:34-36:42-44:54-56:62-64:70-72:78-80:86-88:94-96 /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/venvs/sunspot/ezpz-aurora_frameworks-2025.2.0/bin/python3 -m ezpz.examples.test
cpubind:list x1921c0s7b0n0 pid 82824 rank 12 0: mask 0x1c
cpubind:list x1921c0s7b0n0 pid 82825 rank 13 1: mask 0x1c00
cpubind:list x1921c0s7b0n0 pid 82826 rank 14 2: mask 0x1c0000
cpubind:list x1921c0s7b0n0 pid 82827 rank 15 3: mask 0x1c000000
cpubind:list x1921c0s7b0n0 pid 82828 rank 16 4: mask 0x1c00000000
cpubind:list x1921c0s7b0n0 pid 82829 rank 17 5: mask 0x1c0000000000
cpubind:list x1921c0s7b0n0 pid 82830 rank 18 6: mask 0x1c0000000000000
cpubind:list x1921c0s7b0n0 pid 82831 rank 19 7: mask 0x1c000000000000000
cpubind:list x1921c0s7b0n0 pid 82832 rank 20 8: mask 0x1c00000000000000000
cpubind:list x1921c0s7b0n0 pid 82833 rank 21 9: mask 0x1c0000000000000000000
cpubind:list x1921c0s7b0n0 pid 82834 rank 22 10: mask 0x1c000000000000000000000
cpubind:list x1921c0s7b0n0 pid 82835 rank 23 11: mask 0x1c00000000000000000000000
cpubind:list x1921c0s3b0n0 pid 92001 rank 0 0: mask 0x1c
cpubind:list x1921c0s3b0n0 pid 92002 rank 1 1: mask 0x1c00
cpubind:list x1921c0s3b0n0 pid 92003 rank 2 2: mask 0x1c0000
cpubind:list x1921c0s3b0n0 pid 92004 rank 3 3: mask 0x1c000000
cpubind:list x1921c0s3b0n0 pid 92005 rank 4 4: mask 0x1c00000000
cpubind:list x1921c0s3b0n0 pid 92006 rank 5 5: mask 0x1c0000000000
cpubind:list x1921c0s3b0n0 pid 92007 rank 6 6: mask 0x1c0000000000000
cpubind:list x1921c0s3b0n0 pid 92008 rank 7 7: mask 0x1c000000000000000
cpubind:list x1921c0s3b0n0 pid 92009 rank 8 8: mask 0x1c00000000000000000
cpubind:list x1921c0s3b0n0 pid 92010 rank 9 9: mask 0x1c0000000000000000000
cpubind:list x1921c0s3b0n0 pid 92011 rank 10 10: mask 0x1c000000000000000000000
cpubind:list x1921c0s3b0n0 pid 92012 rank 11 11: mask 0x1c00000000000000000000000
[2025-12-31 11:30:49,869292][I][ezpz/examples/test:132:__post_init__] Outputs will be saved to /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049
[2025-12-31 11:30:49,871638][I][ezpz/dist:1501:setup_torch_distributed] Using torch_{device,backend}= {xpu, xccl}
[2025-12-31 11:30:49,872308][I][ezpz/dist:1366:setup_torch_DDP] Caught MASTER_PORT=32935 from environment!
[2025-12-31 11:30:49,872846][I][ezpz/dist:1382:setup_torch_DDP] Using torch.distributed.init_process_group with
- master_addr='x1921c0s3b0n0'
- master_port='32935'
- world_size=24
- rank=0
- local_rank=0
- timeout=datetime.timedelta(seconds=3600)
- backend='xccl'
[2025-12-31 11:30:49,873745][I][ezpz/dist:1014:init_process_group] Calling torch.distributed.init_process_group_with: rank=0 world_size=24 backend=xccl
[2025-12-31 11:30:50,606235][I][ezpz/dist:1727:setup_torch] Using device='xpu' with backend='xccl' + 'xccl' for distributed training.
[2025-12-31 11:30:50,607054][W][ezpz/dist:544:print_dist_setup] Using [24 / 24] available "xpu" devices !!
[2025-12-31 11:30:50,607503][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=00/23][local_rank=00/11]
[2025-12-31 11:30:50,606736][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=10/23][local_rank=10/11]
[2025-12-31 11:30:50,606796][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=01/23][local_rank=01/11]
[2025-12-31 11:30:50,606796][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=02/23][local_rank=02/11]
[2025-12-31 11:30:50,606807][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=03/23][local_rank=03/11]
[2025-12-31 11:30:50,606817][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=04/23][local_rank=04/11]
[2025-12-31 11:30:50,606806][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=05/23][local_rank=05/11]
[2025-12-31 11:30:50,606747][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=06/23][local_rank=06/11]
[2025-12-31 11:30:50,606787][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=07/23][local_rank=07/11]
[2025-12-31 11:30:50,606737][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=0/1][rank=08/23][local_rank=08/11]
[2025-12-31 11:30:50,606783][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=09/23][local_rank=09/11]
[2025-12-31 11:30:50,606793][I][ezpz/dist:1774:setup_torch] ['x1921c0s3b0n0'][device='xpu'][node=1/1][rank=11/23][local_rank=11/11]
[2025-12-31 11:30:50,610395][I][ezpz/examples/test:678:main] Took: 0.76 seconds to setup torch
[2025-12-31 11:30:50,606862][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=12/23][local_rank=00/11]
[2025-12-31 11:30:50,606858][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=16/23][local_rank=04/11]
[2025-12-31 11:30:50,606933][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=13/23][local_rank=01/11]
[2025-12-31 11:30:50,606960][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=14/23][local_rank=02/11]
[2025-12-31 11:30:50,606980][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=15/23][local_rank=03/11]
[2025-12-31 11:30:50,606972][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=17/23][local_rank=05/11]
[2025-12-31 11:30:50,606946][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=18/23][local_rank=06/11]
[2025-12-31 11:30:50,606949][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=19/23][local_rank=07/11]
[2025-12-31 11:30:50,606925][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=20/23][local_rank=08/11]
[2025-12-31 11:30:50,606960][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=21/23][local_rank=09/11]
[2025-12-31 11:30:50,606970][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=0/1][rank=22/23][local_rank=10/11]
[2025-12-31 11:30:50,606949][I][ezpz/dist:1774:setup_torch] ['x1921c0s7b0n0'][device='xpu'][node=1/1][rank=23/23][local_rank=11/11]
[2025-12-31 11:30:50,617739][I][ezpz/examples/test:461:train] Model size: 567434 parameters
[2025-12-31 11:30:50,619069][I][ezpz/examples/test:465:train]
=================================================================
Layer (type:depth-idx) Param #
=================================================================
SequentialLinearNet --
โโSequential: 1-1 567,434
=================================================================
Total params: 567,434
Trainable params: 567,434
Non-trainable params: 0
=================================================================
[2025-12-31 11:30:50,620391][I][ezpz/examples/test:473:train] Took: 0.006357558071613312 seconds to build model
[2025-12-31 11:30:50,622656][I][ezpz/examples/test:601:build_model_and_optimizer] model=
SequentialLinearNet(
(layers): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=128, bias=True)
(5): ReLU()
(6): Linear(in_features=128, out_features=10, bias=True)
)
)
[2025-12-31 11:30:50,624614][I][ezpz/dist:685:wrap_model] Wrapping model with: ddp
2025:12:31-11:30:50:(92001) |CCL_WARN| value of CCL_OP_SYNC changed to be 1 (default:0)
2025:12:31-11:30:50:(92001) |CCL_WARN| value of CCL_PROCESS_LAUNCHER changed to be pmix (default:hydra)
[2025-12-31 11:31:03,431773][I][ezpz/examples/test:479:train] Took: 12.81 seconds to build optimizer
[2025-12-31 11:31:03,484655][I][ezpz/history:220:__init__] Using History with distributed_history=True
[2025-12-31 11:31:03,487406][I][ezpz/dist:2039:setup_wandb] Setting up wandb from rank=0
[2025-12-31 11:31:03,487916][I][ezpz/dist:2040:setup_wandb] Using WB_PROJECT=ezpz.examples.test
wandb: Currently logged in as: foremans (aurora_gpt) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.23.1
wandb: Run data is saved locally in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/wandb/run-20251231_113103-cppqal9m
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run polar-surf-6861
wandb: View project at https://wandb.ai/aurora_gpt/ezpz.examples.test
wandb: View run at https://wandb.ai/aurora_gpt/ezpz.examples.test/runs/cppqal9m
[2025-12-31 11:31:05,498721][I][ezpz/dist:2069:setup_wandb] wandb.run=[polar-surf-6861](https://wandb.ai/aurora_gpt/ezpz.examples.test/runs/cppqal9m)
[2025-12-31 11:31:05,504592][I][ezpz/dist:2112:setup_wandb] Running on machine='SunSpot'
[2025-12-31 11:31:05,852704][I][ezpz/examples/test:482:train] Took: 2.42 seconds to build trainer
[2025-12-31 11:31:05,853930][I][ezpz/examples/test:486:train] config:
{
"acc_events": false,
"backend": "DDP",
"batch_size": 128,
"cp": 1,
"dataset": "mnist",
"dataset_root": "/lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/datasets/mnist",
"dtype": "bf16",
"input_size": 784,
"layer_sizes": [
512,
256,
128
],
"log_freq": 1,
"no_distributed_history": false,
"num_workers": 0,
"output_size": 10,
"pp": 1,
"print_freq": 10,
"profile_memory": true,
"pyinstrument_profiler": false,
"pytorch_profiler": false,
"pytorch_profiler_active": 3,
"pytorch_profiler_repeat": 5,
"pytorch_profiler_wait": 1,
"pytorch_profiler_warmup": 2,
"rank_zero_only": false,
"record_shapes": true,
"tp": 1,
"train_iters": 200,
"warmup": 5,
"with_flops": true,
"with_modules": true,
"with_stack": true
}
[2025-12-31 11:31:05,856304][I][ezpz/examples/test:488:train] Took: 19.73 to get here.
[2025-12-31 11:31:20,012644][I][ezpz/examples/test:369:train] Warmup complete at step 5
[2025-12-31 11:31:20,317949][I][ezpz/examples/test:325:train_step] iter=10 loss=1.046174 accuracy=0.726562 dtf=0.010201 dtb=0.001751 loss/mean=1.094917 loss/max=1.271076 loss/min=0.961706 loss/std=0.078834 accuracy/mean=0.704427 accuracy/max=0.765625 accuracy/min=0.593750 accuracy/std=0.042472 dtf/mean=0.010742 dtf/max=0.012226 dtf/min=0.010026 dtf/std=0.000653 dtb/mean=0.001594 dtb/max=0.001852 dtb/min=0.001306 dtb/std=0.000177
[2025-12-31 11:31:21,122273][I][ezpz/examples/test:325:train_step] iter=20 loss=0.931834 accuracy=0.779412 dtf=0.005889 dtb=0.178909 loss/mean=0.592798 loss/max=0.931834 loss/min=0.390783 loss/std=0.136802 accuracy/mean=0.817402 accuracy/max=0.897059 accuracy/min=0.691176 accuracy/std=0.050930 dtf/mean=0.006413 dtf/max=0.006798 dtf/min=0.005839 dtf/std=0.000321 dtb/mean=0.204767 dtb/max=0.234498 dtb/min=0.178215 dtb/std=0.020612
[2025-12-31 11:31:21,659906][I][ezpz/examples/test:325:train_step] iter=30 loss=0.500784 accuracy=0.851562 dtf=0.009988 dtb=0.001570 loss/mean=0.459434 loss/max=0.755573 loss/min=0.280539 loss/std=0.115654 accuracy/mean=0.861003 accuracy/max=0.937500 accuracy/min=0.773438 accuracy/std=0.038405 dtf/mean=0.010492 dtf/max=0.011835 dtf/min=0.009957 dtf/std=0.000546 dtb/mean=0.001607 dtb/max=0.001853 dtb/min=0.001314 dtb/std=0.000157
[2025-12-31 11:31:22,283836][I][ezpz/examples/test:325:train_step] iter=40 loss=0.478971 accuracy=0.867647 dtf=0.005750 dtb=0.001340 loss/mean=0.319425 loss/max=0.549011 loss/min=0.172847 loss/std=0.095734 accuracy/mean=0.903799 accuracy/max=0.970588 accuracy/min=0.823529 accuracy/std=0.040494 dtf/mean=0.006246 dtf/max=0.007002 dtf/min=0.005576 dtf/std=0.000431 dtb/mean=0.001377 dtb/max=0.001588 dtb/min=0.001080 dtb/std=0.000155
[2025-12-31 11:31:22,834823][I][ezpz/examples/test:325:train_step] iter=50 loss=0.349907 accuracy=0.875000 dtf=0.010030 dtb=0.001433 loss/mean=0.298854 loss/max=0.401245 loss/min=0.211164 loss/std=0.060323 accuracy/mean=0.910807 accuracy/max=0.953125 accuracy/min=0.867188 accuracy/std=0.024594 dtf/mean=0.010629 dtf/max=0.011482 dtf/min=0.009914 dtf/std=0.000466 dtb/mean=0.001609 dtb/max=0.001820 dtb/min=0.001307 dtb/std=0.000167
[2025-12-31 11:31:23,512006][I][ezpz/examples/test:325:train_step] iter=60 loss=0.313367 accuracy=0.926471 dtf=0.005733 dtb=0.001325 loss/mean=0.182718 loss/max=0.313367 loss/min=0.097909 loss/std=0.047081 accuracy/mean=0.952206 accuracy/max=0.985294 accuracy/min=0.911765 accuracy/std=0.018628 dtf/mean=0.006096 dtf/max=0.006441 dtf/min=0.005576 dtf/std=0.000266 dtb/mean=0.001393 dtb/max=0.001545 dtb/min=0.001141 dtb/std=0.000121
[2025-12-31 11:31:24,034381][I][ezpz/examples/test:325:train_step] iter=70 loss=0.262147 accuracy=0.929688 dtf=0.009596 dtb=0.001584 loss/mean=0.216635 loss/max=0.327601 loss/min=0.138058 loss/std=0.060615 accuracy/mean=0.938477 accuracy/max=0.968750 accuracy/min=0.890625 accuracy/std=0.025447 dtf/mean=0.009859 dtf/max=0.011898 dtf/min=0.009397 dtf/std=0.000751 dtb/mean=0.001568 dtb/max=0.001724 dtb/min=0.001312 dtb/std=0.000131
[2025-12-31 11:31:24,636021][I][ezpz/examples/test:325:train_step] iter=80 loss=0.156417 accuracy=0.955882 dtf=0.005665 dtb=0.001253 loss/mean=0.111892 loss/max=0.192739 loss/min=0.060052 loss/std=0.033597 accuracy/mean=0.971814 accuracy/max=1.000000 accuracy/min=0.941176 accuracy/std=0.012681 dtf/mean=0.006088 dtf/max=0.006498 dtf/min=0.005522 dtf/std=0.000325 dtb/mean=0.001360 dtb/max=0.001627 dtb/min=0.001064 dtb/std=0.000163
[2025-12-31 11:31:25,102420][I][ezpz/examples/test:325:train_step] iter=90 loss=0.195402 accuracy=0.937500 dtf=0.009428 dtb=0.001697 loss/mean=0.154431 loss/max=0.232959 loss/min=0.095484 loss/std=0.043264 accuracy/mean=0.954102 accuracy/max=0.976562 accuracy/min=0.921875 accuracy/std=0.017222 dtf/mean=0.010213 dtf/max=0.012565 dtf/min=0.009396 dtf/std=0.000995 dtb/mean=0.001599 dtb/max=0.001942 dtb/min=0.001136 dtb/std=0.000188
[2025-12-31 11:31:25,595109][I][ezpz/examples/test:325:train_step] iter=100 loss=0.079676 accuracy=1.000000 dtf=0.005687 dtb=0.001371 loss/mean=0.076284 loss/max=0.154268 loss/min=0.046152 loss/std=0.028206 accuracy/mean=0.987132 accuracy/max=1.000000 accuracy/min=0.955882 accuracy/std=0.012246 dtf/mean=0.006011 dtf/max=0.007985 dtf/min=0.005620 dtf/std=0.000624 dtb/mean=0.001366 dtb/max=0.001765 dtb/min=0.001072 dtb/std=0.000164
[2025-12-31 11:31:26,171868][I][ezpz/examples/test:325:train_step] iter=110 loss=0.154822 accuracy=0.960938 dtf=0.009538 dtb=0.001698 loss/mean=0.115471 loss/max=0.210122 loss/min=0.066416 loss/std=0.037737 accuracy/mean=0.969401 accuracy/max=0.984375 accuracy/min=0.937500 accuracy/std=0.013327 dtf/mean=0.009814 dtf/max=0.012341 dtf/min=0.009225 dtf/std=0.000823 dtb/mean=0.001554 dtb/max=0.001722 dtb/min=0.001096 dtb/std=0.000190
[2025-12-31 11:31:26,733522][I][ezpz/examples/test:325:train_step] iter=120 loss=0.053370 accuracy=0.985294 dtf=0.005611 dtb=0.001238 loss/mean=0.056127 loss/max=0.126032 loss/min=0.025829 loss/std=0.023853 accuracy/mean=0.990196 accuracy/max=1.000000 accuracy/min=0.955882 accuracy/std=0.013197 dtf/mean=0.006132 dtf/max=0.006825 dtf/min=0.005512 dtf/std=0.000383 dtb/mean=0.001366 dtb/max=0.001595 dtb/min=0.001062 dtb/std=0.000149
[2025-12-31 11:31:27,304428][I][ezpz/examples/test:325:train_step] iter=130 loss=0.106341 accuracy=0.976562 dtf=0.009662 dtb=0.001578 loss/mean=0.087535 loss/max=0.152122 loss/min=0.041659 loss/std=0.031179 accuracy/mean=0.979818 accuracy/max=1.000000 accuracy/min=0.953125 accuracy/std=0.010792 dtf/mean=0.010319 dtf/max=0.010932 dtf/min=0.009599 dtf/std=0.000411 dtb/mean=0.001625 dtb/max=0.001806 dtb/min=0.001306 dtb/std=0.000155
[2025-12-31 11:31:27,837406][I][ezpz/examples/test:325:train_step] iter=140 loss=0.038179 accuracy=1.000000 dtf=0.005624 dtb=0.001283 loss/mean=0.031276 loss/max=0.058039 loss/min=0.018561 loss/std=0.009414 accuracy/mean=1.000000 accuracy/max=1.000000 accuracy/min=1.000000 accuracy/std=0.000000 dtf/mean=0.006149 dtf/max=0.006750 dtf/min=0.005484 dtf/std=0.000363 dtb/mean=0.001352 dtb/max=0.001624 dtb/min=0.001071 dtb/std=0.000156
[2025-12-31 11:31:28,429546][I][ezpz/examples/test:325:train_step] iter=150 loss=0.075107 accuracy=0.976562 dtf=0.009395 dtb=0.001523 loss/mean=0.078655 loss/max=0.160285 loss/min=0.043498 loss/std=0.028539 accuracy/mean=0.978516 accuracy/max=1.000000 accuracy/min=0.945312 accuracy/std=0.014131 dtf/mean=0.010171 dtf/max=0.010841 dtf/min=0.009276 dtf/std=0.000518 dtb/mean=0.001609 dtb/max=0.001818 dtb/min=0.001068 dtb/std=0.000180
[2025-12-31 11:31:28,948358][I][ezpz/examples/test:325:train_step] iter=160 loss=0.039511 accuracy=0.985294 dtf=0.005642 dtb=0.001399 loss/mean=0.038084 loss/max=0.072799 loss/min=0.011212 loss/std=0.016665 accuracy/mean=0.993873 accuracy/max=1.000000 accuracy/min=0.970588 accuracy/std=0.008404 dtf/mean=0.006147 dtf/max=0.006805 dtf/min=0.005509 dtf/std=0.000402 dtb/mean=0.001383 dtb/max=0.001682 dtb/min=0.001094 dtb/std=0.000159
[2025-12-31 11:31:29,524806][I][ezpz/examples/test:325:train_step] iter=170 loss=0.090782 accuracy=0.968750 dtf=0.009549 dtb=0.001483 loss/mean=0.063093 loss/max=0.135812 loss/min=0.029736 loss/std=0.026772 accuracy/mean=0.984375 accuracy/max=1.000000 accuracy/min=0.960938 accuracy/std=0.011935 dtf/mean=0.010131 dtf/max=0.010809 dtf/min=0.009328 dtf/std=0.000468 dtb/mean=0.001589 dtb/max=0.001801 dtb/min=0.001083 dtb/std=0.000189
[2025-12-31 11:31:30,100256][I][ezpz/examples/test:325:train_step] iter=180 loss=0.028730 accuracy=1.000000 dtf=0.005630 dtb=0.001255 loss/mean=0.031807 loss/max=0.089583 loss/min=0.009972 loss/std=0.017174 accuracy/mean=0.995098 accuracy/max=1.000000 accuracy/min=0.970588 accuracy/std=0.008130 dtf/mean=0.006067 dtf/max=0.006434 dtf/min=0.005511 dtf/std=0.000292 dtb/mean=0.001388 dtb/max=0.001594 dtb/min=0.001066 dtb/std=0.000147
[2025-12-31 11:31:30,617119][I][ezpz/examples/test:325:train_step] iter=190 loss=0.044844 accuracy=0.984375 dtf=0.009522 dtb=0.001602 loss/mean=0.051969 loss/max=0.151458 loss/min=0.025844 loss/std=0.027686 accuracy/mean=0.985677 accuracy/max=1.000000 accuracy/min=0.953125 accuracy/std=0.011423 dtf/mean=0.010067 dtf/max=0.012187 dtf/min=0.009460 dtf/std=0.000770 dtb/mean=0.001618 dtb/max=0.001843 dtb/min=0.001297 dtb/std=0.000141
[2025-12-31 11:31:32,515303][I][ezpz/history:2385:finalize] Saving plots to /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/mplot (matplotlib) and /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot (tplot)
accuracy accuracy/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.000โค โโโโโโโโโโโโโโโโโโโโ1.000โค ---------------------โ
0.931โค โโโโโโโโโโโโโโโโโ โ โ โ โ0.826โค ------------- - - โ
โ โโโโโโโโโ โ โ0.651โค---- โ
0.862โค โโโโโ โ0.477โค- โ
0.793โค โโโโ โ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
0.724โคโโโ โ 1.0 49.2 97.5 145.8 194.0
โโโโ โaccuracy/min iter
0.655โคโโโ โ accuracy/std
0.586โคโโ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ0.067โค** โ
1.0 49.2 97.5 145.8 194.0 0.056โค**** โ
accuracy iter 0.034โค ************ * โ
accuracy/mean 0.022โค **************************โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ0.000โค * * * โ
1.000โค ยท ยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
0.928โค ยทยทยทยทยทยทยทยทยทยทยทยท ยทยทยทยท โ 1.0 49.2 97.5 145.8 194.0
โ ยทยทยทยทยท โaccuracy/std iter
0.856โค ยทยทยท โ accuracy/max
0.784โค ยทยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ยทยท โ1.000โค + +++++++++++++++++++++++++โ
0.712โค ยท โ0.952โค +++++++++ โ
0.640โคยท โ0.855โค +++ โ
โยท โ0.807โค + โ
0.568โคยท โ0.711โค+ โ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
accuracy/mean iter accuracy/max iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/accuracy.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.000โค ++ accuracy/max + ++ +โ++โ++โ++++โโ++โ++โ+โ++โ+ยทโ+โโ++โโ+โโโโโโโ
โ -- accuracy/min ++++++++++ยท++โโ+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโยทโโโโยทโ
โ ยทยท accuracy/mean ++ยท++โ โ โโโโโโยทโโโโโโ-โ-โโโโโ-โโโโ----โ---โ--โ--- --- โ
โ โโ accuracy +โโยทโโโโโโโโ โ-โโโโ------โ------ - ----- ---- - -- --โ
0.913โค +++ + โโโโโโโโโโโโโโ------ โ- - - - - - - โ
โ ++++ โ โโโโโโโ-โ-โ--- -- - - - โ
โ + โโ โโโโโโโโ------- - โ
โ +++ โโโโโโ โ-โ- - - โ
0.826โค +++ โโโ โ- --- โ
โ + + ยทโโโ-โ----- โ
โ +โ ยทโโโโ-- - - โ
โ +โยทยทโ - -- โ
0.738โค +โโโโ-- โ
โ+โโโโ - โ
โ+โโ โ- โ
โ โโ โ- โ
โโโโ โ- โ
0.651โคโโโ- โ
โโโโ- โ
โโโโ โ
โยท-โ โ
0.564โคยท- โ
โ - โ
โ - โ
โ - โ
0.477โค-- โ
โโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/accuracy_summary.txt
accuracy/mean hist accuracy/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
108โค โโโโโ132โค โโโโโ
90โค โโโโโ110โค โโโโโ
โ โโโโโ โ โโโโโ
72โค โโโโโ 88โค โโโโโ
54โค โโโโโ 66โค โโโโโ
โ โโโโโ โ โโโโโ
36โค โโโโโโโโ 44โค โโโโโ
18โค โโโโโโโโโโโโ 22โค โโโโโโโโ
โ โโโโโโโโโโโโโโโโโโโโโโ โ โโโโโโโโโโโโ
0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0โคโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโ
0.55 0.67 0.78 0.90 1.02 0.698 0.777 0.855 0.934 1.013
accuracy/min hist accuracy/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
85.0โค โโโโ โ73.0โค โโโ โ
โ โโโโ โ โ โโโ โ
70.8โค โโโโ โ60.8โค โโโ โ
56.7โค โโโโ โ48.7โค โโโโโโโ โ
โ โโโโ โ โ โโโโโโโ โ
42.5โค โโโโโโโโ36.5โค โโโโโโโ โ
โ โโโโโโโโ โ โโโโโโโโโโโ โ
28.3โค โโโโโโโโโโโ24.3โค โโโโโโโโโโโโโโ โ
14.2โค โโโโโโโโโโโโโโโ12.2โค โโโโโโโโโโโโโโ โ
โ โโโโ โโโโโโโโโโโโโโโโโโ โ โโโโโโโโโโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
0.45 0.60 0.74 0.88 1.02 -0.003 0.015 0.034 0.052 0.070
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/accuracy_hist.txt
dtb dtb/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.179โค โ โ0.178โค - โ
0.149โค โ โ0.119โค - โ
โ โ โ0.060โค - โ
0.120โค โ โ0.001โค---------------------------------โ
0.090โค โ โ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
0.060โค โ โ 1.0 49.2 97.5 145.8 194.0
โ โ โdtb/min iter
0.031โค โ โ dtb/std
0.001โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ0.0206โค * โ
1.0 49.2 97.5 145.8 194.0 0.0172โค * โ
dtb iter 0.0103โค * โ
dtb/mean 0.0069โค * โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ0.0001โค********************************โ
0.205โค ยท โ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโ
0.171โค ยท โ 1.0 49.2 97.5 145.8 194.0
โ ยท โdtb/std iter
0.137โค ยท โ dtb/max
0.103โค ยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ยท โ0.234โค + โ
0.069โค ยท โ0.196โค + โ
0.035โค ยท โ0.118โค + โ
โ ยท โ0.079โค + โ
0.001โคยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ0.002โค+++++++++++++++++++++++++++++++++โ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
dtb/mean iter dtb/max iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtb.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.234โค ++ dtb/max โ
โ -- dtb/min โ
โ ยทยท dtb/mean โ
โ โโ dtb โ
0.196โค ยท โ
โ ยท โ
โ โ โ
โ โ โ
0.157โค โ โ
โ โ โ
โ โ โ
โ โ โ
0.118โค โ โ
โ โ โ
โ โ โ
โ โ โ
โ โ โ
0.079โค โ โ
โ โ โ
โ โ โ
โ โ โ
0.040โค โ โ
โ โ โ
โ โ โ
โ โ โ
0.001โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtb_summary.txt
dtb/mean hist dtb/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
193.0โคโโโโ โ193.0โคโโโโ โ
160.8โคโโโโ โ160.8โคโโโโ โ
โโโโโ โ โโโโโ โ
128.7โคโโโโ โ128.7โคโโโโ โ
96.5โคโโโโ โ 96.5โคโโโโ โ
โโโโโ โ โโโโโ โ
64.3โคโโโโ โ 64.3โคโโโโ โ
32.2โคโโโโ โ 32.2โคโโโโ โ
โโโโโ โ โโโโโ โ
0.0โคโโโ โโโโโ 0.0โคโโโ โโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
-0.008 0.048 0.103 0.158 0.214 -0.009 0.055 0.118 0.181 0.245
dtb/min hist dtb/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
193.0โคโโโโ โ193.0โคโโโโ โ
โโโโโ โ โโโโโ โ
160.8โคโโโโ โ160.8โคโโโโ โ
128.7โคโโโโ โ128.7โคโโโโ โ
โโโโโ โ โโโโโ โ
96.5โคโโโโ โ 96.5โคโโโโ โ
โโโโโ โ โโโโโ โ
64.3โคโโโโ โ 64.3โคโโโโ โ
32.2โคโโโโ โ 32.2โคโโโโ โ
โโโโโ โ โโโโโ โ
0.0โคโโโ โโโโโ 0.0โคโโโ โโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโโ
-0.007 0.041 0.090 0.138 0.186 -0.0008 0.0048 0.0103 0.0159
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtb_hist.txt
dtf dtf/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.0122โค โ โ0.0115โค - - - - - - - - - โ
0.0111โค โ โ โ โโ โ โ โ โ0.0095โค--------------------------------โ
โโ โ โโ โ โโ โ โโ โ โโโโ โโโ0.0075โค - - - - - - -- - - โ
0.0100โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ0.0055โค - - - - - - - - - โ
0.0089โค โโ โ โโโ โ โ โ โโโโโ โ โโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโ
0.0078โค โ โ โโ โ โ โ โโ โ โ โ 1.0 49.2 97.5 145.8 194.0
โ โ โ โโ โ โ โ โโ โ โ โdtf/min iter
0.0067โค โ โ โโ โ โ โ โโ โ โ โ dtf/std
0.0056โค โ โ โโ โ โ โ โโ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโ0.00104โค * ** * * * **** * *โ
1.0 49.2 97.5 145.8 194.0 0.00091โค ** ****** ********************โ
dtf iter 0.00065โค*******************************โ
dtf/mean 0.00052โค*******************************โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ0.00027โค ** * ** * * * *** *** โ
0.0123โค ยท ยท โ โโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโโโโโโโโ
0.0113โค ยท ยท ยท ยท ยท ยท ยท ยท ยท โ 1.0 49.2 97.5 145.8
โยทยทยทยท ยทยทยท ยท ยทยทยทยทยท ยท ยทยท ยทยท ยท ยท โdtf/std iter
0.0102โคยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ dtf/max
0.0092โคยทยทยทยทยทยท ยทยทยทยทยท ยท ยทยท ยทยทยทยท ยท ยทยทยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ยท ยท ยท ยท ยท ยท ยทยท ยท ยท โ0.0144โค + + + + + + โ
0.0081โค ยท ยท ยท ยท ยท ยท ยทยท ยท ยท โ0.0131โค++++++++++++++++++++++++++++++++โ
0.0071โค ยท ยท ยท ยท ยท ยท ยทยท ยท ยท โ0.0104โค +++++++++++++++++++++++++++++++โ
โ ยท ยท ยท ยท ยท ยท ยทยท ยท ยท โ0.0091โค + + + + + + ++ + + โ
0.0060โค ยท ยท ยท ยท ยท ยท ยท ยท โ0.0064โค + + + + + + + + โ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโฌโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
dtf/mean iter dtf/max iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtf.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0.0144โค ++ dtf/max + + โ
โ -- dtf/min + + + + โ
โ ยทยท dtf/mean + + + ++ + โ
โ โโ dtf + + + ++ + + โ
0.0129โค ++ + + + + ++ + ++ + + ++ ++ + + โ
โ ++ + +++ + +++ ++++ +++ ++ ++ + ++ ++ +++ +++ ++ + + โ
โ ++++โยท+++++++++ +++++ +++++++++++ +++++++++++++++++++++++++++++++++++++โ
โ ++++โยท+++++++++++++++++++++++++++ ++++++++++++++++++++++ยท++++++++++++++โ
0.0115โค+ +++โยท++++++โ++++++++++++++++++++ ยท++++++โ+++++++ยท++++++โ++++++++++++++โ
โ +++โยท+++++โโ +++++ยท+++++++ยท++++++โ++++++โ+ ++++โโ++++++โ++++++ยทโ++++++โ
โโยท+++โยทยท+ + โโ ++++โ++ ++ยทโโ+ยท+ ยท+โ+ ++ +โ+ + +โโ +++++โ+ยท++++โโ+++++ โ
โโยท โยทยท โ โโยท ยท +โ ยท ยทยทยทโโยทยท ยทโโ + ยทโ+ โโ ยท++โยทโ + โโ++ ยท โโ
0.0100โคโโยทยทยทโยทยทยทโยทยทโโโยทโยทยทยทโยทยทยทยทยทยทโโโโยทยทโโโยทยทยทโยทโโยทยทยทยทยทยทโโยทยทยทยทยทยทโยทโยทโยทยทโโยทยทยทโโโโ
โโโโโโโโโโโยทยทโโโโโยทยทยทโโโโโโโโโโโโ โโโยท โโโโโยทยทยทโยทโโโยทยทโโโยทโโโโโ โโยทยทโโโโโ
โโ-โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ โ - -โยท โโ - โ โโ -- โ ---โ-- - โโ - โ - -โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
0.0085โค โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
0.0070โค โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โยท โโ โ โโ โ โ โโ โ โ
โ โ โ- โโ โ โโ โ โ โโ โ โ
0.0055โค - โ โ โ โ โ โ โ โ
โโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtf_summary.txt
dtf/mean hist dtf/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
152.0โค โโโ โ75.0โค โโโ โ
126.7โค โโโ โ62.5โค โโโโ โโโ โ
โ โโโ โ โ โโโโ โโโ โ
101.3โค โโโ โ50.0โค โโโโ โโโ โ
76.0โค โโโ โ37.5โค โโโโ โโโ โ
โ โโโ โ โ โโโโ โโโ โ
50.7โค โโโ โ25.0โค โโโโโโโโโโ โ
25.3โค โโโ โ12.5โค โโโโโโโโโโโโโโ โ
โโโโโ โโโโโโ โ โโโโโ โโโโโโโโโโโโโโโโโโ
0.0โคโโโ โโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโ โโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโโ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโโ
0.0057 0.0074 0.0092 0.0109 0.0061 0.0083 0.0104 0.0126
dtf/min hist dtf/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
156โค โโโโ โ31.0โค โโโ โ
โ โโโโ โ โ โโโ โ
130โค โโโโ โ25.8โค โโโ โโโ โ
104โค โโโโ โ20.7โค โโโโโโโ โโโโโโโ โ
โ โโโโ โ โ โโโโโโโโโโโโโโ โโโโโโโ โ
78โค โโโโ โ15.5โค โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ โโโโ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
52โค โโโโ โ10.3โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
26โค โโโโ โ 5.2โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโโโโ โโโโโโโ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
0โคโโโ โโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโโ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโโ
0.0052 0.0069 0.0085 0.0101 0.00023 0.00044 0.00065 0.00086
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/dtf_hist.txt
loss loss/min
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.75โคโ โ1.66โค- โ
1.46โคโ โ1.11โค-- โ
โโ โ0.56โค ----- โ
1.17โคโโ โ0.01โค ------------------------------โ
0.89โค โโ โ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
0.60โค โโโ โ 1.0 49.2 97.5 145.8 194.0
โ โโโโโโ โloss/min iter
0.31โค โโโโโโโโโโโโโ โ โ โ loss/std
0.02โค โ โโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ0.137โค * โ
1.0 49.2 97.5 145.8 194.0 0.116โค ****** โ
loss iter 0.073โค** ********** โ
loss/mean 0.052โค* **************************โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ0.009โค *** * ******** โ
1.74โคยท โ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
1.46โคยท โ 1.0 49.2 97.5 145.8 194.0
โยท โloss/std iter
1.17โค ยท โ loss/max
0.89โค ยท โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ยทยท โ1.83โค+ โ
0.60โค ยทยทยท โ1.53โค++ โ
0.32โค ยทยทยทยท โ0.94โค ++++ โ
โ ยทยทยทยทยทยทยทยทยทยทยท โ0.65โค +++++++++++ โ
0.03โค ยท ยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทยทโ0.06โค +++++++++++++++++++++++โ
โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0 1.0 49.2 97.5 145.8 194.0
loss/mean iter loss/max iter
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/loss.txt
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.83โค ++ loss/max โ
โ -- loss/min โ
โ ยทยท loss/mean โ
โ โโ loss โ
1.53โคโ+ โ
โโ+ โ
โโยท โ
โ โ โ
1.22โค โ+ โ
โ โ+ โ
โ โโ โ
โ โโ+ โ
0.92โค โ+ โ โ
โ โ โโ + โ
โ โโโโ+++ โ
โ -โโโ+ ++ โ
โ -โโโ ++ โ
0.62โค --ยทโโโ +++ + โ
โ --โโโโ ++++++ โ
โ ---โโโโโโโ+++โ + โ
โ ----โโโโโโโ โ+++โ++++ + โ
0.31โค ---- ยทยทโโโโโโโโ++โ++โ ++ + โ
โ - -----โโยทยทโโโโโโโโโ++โ+++โ+++++++โ + + + + + โ
โ - -----ยท---โโยทยทโโโโโโโโโโโโโ โโโ+โโโ++โโ++++++โ+++++โ ++++++โ
โ - ---โ--ยท----โ-โโโยทโโโโโโโโโโโโโโโโโโโโโโโโโโโโโยทโโยทยทยทโ
0.01โค - -โ---โ---โ--โ-โ-โ--โ-โโโโโโโโโโโโ
โโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโฌโ
1.0 49.2 97.5 145.8 194.0
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/loss_summary.txt
loss/mean hist loss/max hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
127.0โคโโโโ โ107.0โคโโโโ โ
105.8โคโโโโ โ 89.2โคโโโโ โ
โโโโโ โ โโโโโ โ
84.7โคโโโโ โ 71.3โคโโโโ โ
63.5โคโโโโ โ 53.5โคโโโโ โ
โโโโโ โ โโโโโ โ
42.3โคโโโโ โ 35.7โคโโโโโโโ โ
21.2โคโโโโโโโ โ 17.8โคโโโโโโโ โ
โโโโโโโโโโโโโโ โ โโโโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
-0.04 0.42 0.89 1.35 1.82 -0.02 0.46 0.94 1.43 1.91
loss/min hist loss/std hist
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
146.0โคโโโโ โ73.0โค โโโโ โ
โโโโโ โ โ โโโโ โ
121.7โคโโโโ โ60.8โค โโโโ โ
97.3โคโโโโ โ48.7โค โโโโ โ
โโโโโ โ โ โโโโโโโ โ
73.0โคโโโโ โ36.5โค โโโโโโโ โ
โโโโโ โ โ โโโโโโโ โ
48.7โคโโโโ โ24.3โค โโโโโโโโโโโ โ
24.3โคโโโโโโโ โ12.2โค โโโโโโโโโโโโโโโโโโโโโ โ
โโโโโโโโโโโ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 0.0โคโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ โโฌโโโโโโโโฌโโโโโโโโโฌโโโโโโโโฌโโโโโโโโฌโ
-0.06 0.39 0.83 1.28 1.73 0.004 0.038 0.073 0.108 0.142
text saved in /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/plots/tplot/loss_hist.txt
[2025-12-31 11:31:37,843701][I][ezpz/history:2433:finalize] Saving history report to /lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/outputs/ezpz.examples.test/2025-12-31-113049/report.md
[2025-12-31 11:31:37,850659][I][ezpz/examples/test:348:finalize] dataset=<xarray.Dataset> Size: 39kB
Dimensions: (draw: 194)
Coordinates:
* draw (draw) int64 2kB 0 1 2 3 4 5 6 ... 188 189 190 191 192 193
Data variables: (12/25)
iter (draw) int64 2kB 6 7 8 9 10 11 12 ... 194 195 196 197 198 199
loss (draw) float32 776B 1.749 1.581 1.343 ... 0.02264 0.03851
accuracy (draw) float32 776B 0.6094 0.625 0.6797 ... 0.9922 1.0 1.0
dtf (draw) float64 2kB 0.01076 0.009483 ... 0.009295 0.009377
dtb (draw) float64 2kB 0.001477 0.001752 ... 0.00169 0.001521
iter_mean (draw) float64 2kB 6.0 7.0 8.0 9.0 ... 197.0 198.0 199.0
... ...
dtf_min (draw) float64 2kB 0.00996 0.009416 ... 0.009295 0.009235
dtf_std (draw) float64 2kB 0.0004828 0.0005322 ... 0.0005547
dtb_mean (draw) float64 2kB 0.001618 0.001605 ... 0.001609 0.001615
dtb_max (draw) float64 2kB 0.001847 0.001759 ... 0.001745 0.001847
dtb_min (draw) float64 2kB 0.001336 0.00131 ... 0.001343 0.00108
dtb_std (draw) float64 2kB 0.0001653 0.0001423 ... 0.0001303 0.000167
[2025-12-31 11:31:38,752472][I][ezpz/examples/test:500:train] Took: 32.90 seconds to finish training
[2025-12-31 11:31:38,753362][I][ezpz/examples/test:695:main] Took: 52.63 seconds
wandb:
wandb: ๐ View run polar-surf-6861 at:
wandb: Find logs at: ../../../../../../lus/tegu/projects/datascience/foremans/projects/saforem2/ezpz/wandb/run-20251231_113103-cppqal9m/logs
[2025-12-31 11:31:40,781540][I][ezpz/launch:447:launch] ----[๐ ezpz.launch][stop][2025-12-31-113140]----
[2025-12-31 11:31:40,782309][I][ezpz/launch:448:launch] Execution finished with 0.
[2025-12-31 11:31:40,782738][I][ezpz/launch:449:launch] Executing finished in 57.13 seconds.
[2025-12-31 11:31:40,783099][I][ezpz/launch:450:launch] Took 57.13 seconds to run. Exiting.
took: 1m 5s