How to: Create and Use Scalers#

ModularML’s Scaler class provides a unified interface for applying preprocessing transforms to FeatureSet data. It wraps any scikit-learn-compatible transformer and integrates with fit_transform, undo history, and serialization.

This notebook covers:

Data and Setup
Built-in Scalers
The Scaler Wrapper
PerSampleZeroStart
PerSampleMinMaxScaler
SegmentedScaler
Negate and Absolute
Chaining Transforms
Creating a Custom Scaler

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import modularml as mml
from modularml import FeatureSet, Scaler
from modularml.scalers import (
    Absolute,
    Negate,
    PerSampleMinMaxScaler,
    PerSampleZeroStart,
    SegmentedScaler,
)

We’ll use synthetic HPPC (Hybrid Pulse Power Characterization) battery data throughout this notebook. Each sample simulates a standard HPPC pulse sequence:

OCV observation (10 s) — cell resting at open-circuit voltage
Charge pulse (10 s) — 1.2 A applied; ohmic jump then exponential rise
Rest after charge (40 s) — current removed; ohmic recovery then slow relaxation
Discharge pulse (10 s) — 1.2 A drawn; ohmic drop then exponential decay
Rest after discharge (40 s) — ohmic recovery then slow relaxation back to OCV

Cells span a range of state-of-health (SOH) values, degrading from 100% to ~50%.

from utils.hppc_data_gen import get_mock_hppc_data

# Generate 1000 synthetic HPPC samples: voltage traces, SOH values, and the
# cell / group identifiers that are later stored as tags.
voltage, soh, cell_ids, group_ids = get_mock_hppc_data(n_samples=1000)

# Quick sanity check of what was generated
print(f"Samples:          {voltage.shape[0]}")
print(f"Voltage shape:    {voltage.shape}")
# Column 0 is the first point of every trace, reported here as the OCV
print(f"OCV range:        [{voltage[:, 0].min():.2f}, {voltage[:, 0].max():.2f}] V")
print(f"SOH range:        [{soh.min():.1f}, {soh.max():.1f}] %")
print(f"Voltage overall:  [{voltage.min():.3f}, {voltage.max():.3f}] V")

Data and Setup#

# Build the FeatureSet from plain Python lists. The key arguments assign each
# column a role: "voltage" is the feature, "soh" the target, and the two ID
# columns are tags (used below for group-wise splitting).
fs = FeatureSet.from_dict(
    label="HPPCData",
    data={
        "voltage": voltage.tolist(),
        "soh": soh.tolist(),
        "cell_id": cell_ids.tolist(),
        "group_id": group_ids.tolist(),
    },
    feature_keys="voltage",
    target_keys="soh",
    tag_keys=["cell_id", "group_id"],
)
print(fs)
print(f"Feature shapes: {fs.get_feature_shapes()}")

Split by cell group to prevent data leakage between train / val / test.

# 60 / 20 / 20 random split with a fixed seed. Grouping by "group_id" is what
# prevents leakage: presumably all samples of a group land in the same split
# (the per-split group listing printed below lets you confirm this).
fs.split_random(
    ratios={"train": 0.6, "val": 0.2, "test": 0.2},
    group_by="group_id",
    seed=42,
)

# Report each split's size and the distinct group IDs it contains
for name, view in fs.splits.items():
    groups = view.get_tags(fmt="numpy", tags="group_id")
    print(f"  {name}: {len(view)} samples, groups: {np.unique(groups)}")

Define a reusable plotting helper. Each split gets its own panel; traces are colored by SOH (dark blue = high, light blue = low).

def plot_timeseries(
    fs: FeatureSet,
    columns: str | list[str],
    splits: list[str] | None = None,
    n_samples: int = 100,
    color_by: str = "targets.soh.raw",
    color_vbounds: tuple = (50, 100),
    xlabel: str = "Time (s)",
    ylabel: str = "Voltage (V)",
    clabel: str = "SOH (%)",
    marker: str = "-",
    seed: int = 13,
):
    """
    Plot time-series columns from a FeatureSet, one panel per split.

    Args:
        fs:            FeatureSet to visualise.
        columns:       Fully-qualified column name(s), e.g. ``"features.voltage.raw"``.
                       Multiple columns are flattened and horizontally stacked.
        splits:        Splits to include. Defaults to all registered splits.
                       A single split is supported.
        n_samples:     Number of traces to draw per panel. Traces are drawn
                       with replacement (``Generator.choice`` default), so a
                       split smaller than ``n_samples`` still works.
        color_by:      Fully-qualified scalar column used for the colormap.
        color_vbounds: ``(vmin, vmax)`` for the colormap.
        xlabel:        Axis x-label.
        ylabel:        Axis y-label.
        clabel:        Colorbar label.
        marker:        Matplotlib format string passed to ``Axes.plot``.
        seed:          RNG seed for reproducible sample selection.

    Returns:
        ``(fig, axes)`` where ``axes`` is a 1D array holding one Axes per
        plotted split, in train / val / test order.

    Raises:
        ValueError: If none of ``columns`` matches a key returned for a split.

    """

    def order_splits(values: list[str]) -> list[str]:
        # train -> val -> test first; any other split names are placed after
        priority = {"train": 0, "val": 1, "test": 2}
        return sorted(values, key=lambda x: priority.get(x, 99))

    rng = np.random.default_rng(seed)
    scm = plt.cm.ScalarMappable(
        cmap=plt.cm.Blues,
        norm=plt.Normalize(vmin=color_vbounds[0], vmax=color_vbounds[1]),
    )

    columns = columns if isinstance(columns, list) else [columns]
    split_names = order_splits(splits or fs.available_splits)

    # squeeze=False always yields a 2D Axes array, so indexing below also works
    # when only one split is plotted (the default would return a bare Axes).
    fig, axes_grid = plt.subplots(
        figsize=(7, 2.5),
        ncols=len(split_names),
        sharex=True,
        sharey=True,
        squeeze=False,
    )
    axes = axes_grid[0]

    for i, split_label in enumerate(split_names):
        split_view = fs.get_split(split_label)

        res = split_view.get_data(
            columns=columns,
            fmt="dict_numpy",
            include_domain_prefix=True,
            include_rep_suffix=True,
        )

        # Match user-supplied column specs to actual keys returned by get_data.
        # Assumes returned keys are dot-separated with the column name as the
        # second component (e.g. "features.voltage.raw"); the first key whose
        # name appears in the spec is used for that spec.
        ordered_keys = []
        for c in columns:
            parts = c.replace("*", "").split(".")
            for k in res:
                if k.split(".")[1] in parts:
                    ordered_keys.append(k)
                    break
        if not ordered_keys:
            msg = f"No columns matching {columns} found in split '{split_label}'."
            raise ValueError(msg)

        color_vals = split_view.get_data(columns=[color_by], fmt="np").reshape(-1)
        # Flatten every matched column to (n_samples_in_split, -1) and place
        # them side by side so each row is one continuous trace.
        flat_data = np.column_stack(
            [res[k].reshape(len(color_vals), -1) for k in ordered_keys],
        )

        sample_idxs = rng.choice(np.arange(len(color_vals)), size=n_samples)
        for idx in sample_idxs:
            axes[i].plot(flat_data[idx], marker, color=scm.to_rgba(color_vals[idx]))

        axes[i].set_title(split_label, fontsize=10)
        axes[i].set_xlabel(xlabel, fontsize=10)

    axes[0].set_ylabel(ylabel, fontsize=10)
    fig.tight_layout(pad=1)
    # Reserve the right margin for a dedicated colorbar axis
    fig.subplots_adjust(right=0.85)
    cbar_ax = fig.add_axes([0.87, 0.19, 0.02, 0.7])
    cbar = fig.colorbar(scm, cax=cbar_ax)
    cbar.set_label(clabel, fontsize=10)
    return fig, axes

# Baseline view: the untransformed ("raw") voltage representation, one panel per split
fig, axes = plot_timeseries(fs, columns="features.voltage.raw")
plt.suptitle("Raw HPPC voltage", y=1.02, fontsize=11)
plt.show()

Built-in Scalers#

All registered scalers are accessible via mml.supported_scalers. The registry contains both ModularML-native and scikit-learn scalers.

mml.supported_scalers

The ModularML-native (non-sklearn) scalers are:

Scaler	What it does
`PerSampleZeroStart`	Shifts each sample so its first value equals zero
`PerSampleMinMaxScaler`	Scales each sample independently to a target range (default [0, 1])
`SegmentedScaler`	Applies independent scalers to contiguous feature sub-regions
`Negate`	Multiplies all values by −1
`Absolute`	Replaces each value with its absolute value

The Scaler Wrapper#

Scaler is a thin adapter that gives any sklearn-compatible transformer a consistent ModularML interface. It can be constructed three ways:

Scaler("MinMaxScaler")          # by registry name (case-insensitive)
Scaler(MinMaxScaler)            # by class
Scaler(MinMaxScaler(clip=True)) # by instance

You can also pass a string, class, or instance directly to FeatureSet.fit_transform — it will be wrapped automatically.

Key methods:

Method	Description
`fit(X)`	Learn parameters from data
`transform(X)`	Apply the fitted transform
`fit_transform(X)`	Fit and transform in one step
`inverse_transform(X)`	Reverse the transform (if supported)
`clone_unfitted()`	Return a fresh copy with the same config but no learned state

from sklearn.preprocessing import MinMaxScaler

# Three equivalent constructors: by registry name, by class, by instance
s1 = Scaler("MinMaxScaler")
s2 = Scaler(MinMaxScaler)
s3 = Scaler(MinMaxScaler())

print(f"Name:             {s1.scaler_name}")
# NOTE: `_is_fit` is underscore-prefixed (non-public); it is read here only to
# illustrate the fitted / unfitted state.
print(f"Is fit (before):  {s1._is_fit}")

# Small random matrix (10 samples x 5 features) with a fixed seed
X = np.random.default_rng(0).normal(size=(10, 5))
s1.fit(X)
print(f"Is fit (after):   {s1._is_fit}")

X_scaled = s1.transform(X)
print(f"Scaled range:     [{X_scaled.min():.3f}, {X_scaled.max():.3f}]")

# Clone: same config, no learned state
s1_clone = s1.clone_unfitted()
print(f"Clone is fit:     {s1_clone._is_fit}")

PerSampleZeroStart#

What it does: Subtracts the first value of each sample from every element in that sample:

$$x_i^\prime = x_i - x_i[0]$$

Why it’s useful for HPPC data: The absolute OCV varies with SOC and cell-to-cell spread (here 2.0–3.6 V). Subtracting the initial value removes this offset so all traces start at zero and only the delta-V response to the current pulse is retained. Models trained on zero-started data learn the electrochemical dynamics rather than the SOC level.

Fitting behaviour: Statistics are computed per-sample at transform time (no global statistics), so fit_to_split does not affect the result. Specifying it is still good practice.

# Reset to the raw representation before applying a new transform
fs.undo_all_transforms()

# The scaler is passed as a class; fit_transform wraps it automatically
fs.fit_transform(
    scaler=PerSampleZeroStart,
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

# Fetch both representations of the training split for comparison
v_raw = fs["train"].get_features(fmt="numpy", features="voltage", rep="raw")
v_zs = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")

# Column 0 holds each trace's first value: it varies in the raw data and
# should be (numerically) zero after the transform.
print(
    f"Raw first-value range:       [{v_raw[:, 0].min():.3f}, {v_raw[:, 0].max():.3f}] V"
)
print(f"ZeroStart first-value |max|: {np.abs(v_zs[:, 0]).max():.3e} (all ~0)")

fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleZeroStart", y=1.02, fontsize=11)
plt.show()

PerSampleMinMaxScaler#

What it does: Scales each sample independently so its values span feature_range (default [0, 1]):

$$x_i^\prime = \frac{x_i - \min(x_i)}{\max(x_i) - \min(x_i)}$$

Contrast with sklearn’s MinMaxScaler: sklearn’s version computes min and max across the training set — one scalar per feature dimension. PerSampleMinMaxScaler uses per-sample statistics, making it invariant to the absolute voltage level and amplitude differences between cells and SOC levels.

Fitting behaviour: Like PerSampleZeroStart, statistics are recomputed per-sample at transform time.

# Reset to the raw representation before applying a new transform
fs.undo_all_transforms()

# Passed as an instance this time; default arguments are used
fs.fit_transform(
    scaler=PerSampleMinMaxScaler(),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

# Transformed training traces (kept for interactive inspection; not used below)
v_scaled = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleMinMaxScaler", y=1.02, fontsize=11)
plt.show()

SegmentedScaler#

What it does: Partitions the feature vector into contiguous segments and fits an independent scaler on each segment:

[ ─── segment 0 ─── | ─── segment 1 ─── | ─── ... ─── ]
  boundaries[0:2]     boundaries[1:3]

A cloned copy of the template scaler is fit independently on each slice of the training data.

Why it’s useful for HPPC data: The OCV, charge pulse, rest, discharge, and rest regions occupy very different voltage ranges. A single global scaler compresses or expands regions unevenly. SegmentedScaler applies an independent normalization to each protocol region, preserving its full dynamic range.

Boundaries must be a tuple of strictly increasing integers, starting at 0 and ending at the total feature length.

Fitting behaviour: The underlying scaler (e.g., MinMaxScaler) learns global statistics across training samples for each segment, so fit_to_split="train" is important here.

import itertools

# Visualise the HPPC segment layout on a representative trace
SEGMENT_LABELS = ["OCV", "Charge", "Rest 1", "Discharge", "Rest 2"]
SEGMENT_COLORS = ["#cce5f0", "#f0cccc", "#ccf0cc", "#f0e0cc", "#e0ccf0"]
# Cumulative phase lengths of the protocol (10 + 10 + 40 + 10 + 40 points,
# assuming one sample per second): each segment covers indices [start, end).
HPPC_BOUNDARIES = [0, 10, 20, 60, 70, 110]


fig, ax = plt.subplots(figsize=(8, 3))
# NOTE(review): `[0][0]` is expected to select one full trace; confirm the array
# returned here has a singleton axis after the sample axis, otherwise this
# yields a single scalar instead of a trace.
sample = fs.get_features(fmt="numpy", features="voltage", rep="raw")[0][0]
ax.plot(sample, "k-", lw=1.5)

# Shade each phase; the band stops half a step before the next phase's first
# point so the edge falls between the two neighbouring samples.
for (start, end), color, label in zip(
    itertools.pairwise(HPPC_BOUNDARIES),
    SEGMENT_COLORS,
    SEGMENT_LABELS,
    strict=True,
):
    ax.axvspan(start, end - 0.5, alpha=0.4, color=color)
    ax.text(
        (start + end) / 2, ax.get_ylim()[0], label, ha="center", va="bottom", fontsize=8
    )

ax.set_xlabel("Time (s)")
ax.set_ylabel("Voltage (V)")
ax.set_title(
    f"HPPC Protocol — SegmentedScaler boundaries: {HPPC_BOUNDARIES}", fontsize=10
)
plt.tight_layout()
plt.show()

# Reset to the raw representation before applying a new transform
fs.undo_all_transforms()

# Four segments delimited by consecutive boundary pairs; the first one spans
# both the OCV and charge-pulse regions of the protocol.
new_bounds = [0, 20, 60, 70, 110]
# PerSampleZeroStart is the per-segment template here, so each segment is
# presumably shifted to start at zero for every sample independently.
fs.fit_transform(
    scaler=SegmentedScaler(boundaries=new_bounds, scaler=PerSampleZeroStart),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

# Transformed training traces (kept for interactive inspection; not used below)
v_seg = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
# Point markers instead of lines, so jumps at segment boundaries are not joined
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed", marker=".")
plt.suptitle("After SegmentedScaler(PerSampleZeroStart)", y=1.03, fontsize=11)
plt.show()

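Note that the example above uses PerSampleZeroStart as the template scaler, so each segment is simply shifted to start at zero. When the template scaler learns training-set statistics instead (e.g., MinMaxScaler, as in the chaining example below), values in the val / test panels may slightly exceed [0, 1] because those splits contain cell groups not seen during training.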

Negate and Absolute#

These are simple element-wise transforms, most useful as building blocks in a transform chain.

Negate — multiplies every value by −1. Useful when a model or loss expects positive values (e.g., converting a discharge voltage drop to a positive deviation).
Absolute — replaces every value with its absolute value, recording the per-element sign mask so the transform can be inverted exactly.

Both have a no-op fit, so fit_to_split has no effect on the output.

# Reset to the raw representation before applying a new transform
fs.undo_all_transforms()

# Negate: multiplies every value by -1
fs.fit_transform(
    scaler=Negate(), domain="features", keys="voltage", fit_to_split="train"
)

v_raw = fs["train"].get_features(fmt="numpy", features="voltage", rep="raw")
v_neg = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("Negate")
print(f"  Raw range:     [{v_raw.min():.3f}, {v_raw.max():.3f}] V")
print(f"  Negated range: [{v_neg.min():.3f}, {v_neg.max():.3f}] V")

fs.undo_all_transforms()

# Absolute
# Apply ZeroStart first so voltage deviations straddle zero (+/-);
# Absolute then maps both directions onto non-negative values.
fs.fit_transform(
    scaler=PerSampleZeroStart, domain="features", keys="voltage", fit_to_split="train"
)
# Second call builds on the ZeroStart output (transforms are chained)
fs.fit_transform(
    scaler=Absolute(), domain="features", keys="voltage", fit_to_split="train"
)

v_abs = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("Absolute (after PerSampleZeroStart)")
print(f"  Min value: {v_abs.min():.6f}  (all non-negative)")

# Leave the FeatureSet in its raw state for the next section
fs.undo_all_transforms()

Chaining Transforms#

Each call to fit_transform builds on the current "transformed" representation, creating a chain that can be unwound one step at a time (undo_last_transform) or all at once (undo_all_transforms).

A natural preprocessing pipeline for HPPC voltage:

PerSampleZeroStart — removes the per-sample OCV offset so all traces start at 0
SegmentedScaler(MinMaxScaler) — independently normalises each protocol region to [0, 1]
MinMaxScaler on the SOH target — normalises the prediction target across training samples

# Start the chain from the raw representation
fs.undo_all_transforms()

# Step 1: Remove per-sample OCV offset
fs.fit_transform(
    scaler=PerSampleZeroStart,
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("Step 1 - PerSampleZeroStart", y=1.02, fontsize=11)
plt.show()

# Step 2: Normalise each HPPC segment independently
# (`new_bounds` is defined in the SegmentedScaler cell earlier in the notebook;
# MinMaxScaler statistics are learned from the "train" split only)
fs.fit_transform(
    scaler=SegmentedScaler(boundaries=new_bounds, scaler="MinMaxScaler"),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("Step 2 - SegmentedScaler(MinMaxScaler)", y=1.02, fontsize=11)
plt.show()

# Step 3: Scale SOH target across training samples
fs.fit_transform(
    scaler=MinMaxScaler(),
    domain="targets",
    keys="soh",
    fit_to_split="train",
)

# Compare raw vs scaled targets on the held-out test split
soh_raw = fs["test"].get_targets(fmt="numpy", targets="soh", rep="raw")
soh_scaled = fs["test"].get_targets(fmt="numpy", targets="soh", rep="transformed")
print(f"SOH raw range:    [{soh_raw.min():.1f}, {soh_raw.max():.1f}] %")
print(f"SOH scaled range: [{soh_scaled.min():.3f}, {soh_scaled.max():.3f}]")

Undo the last feature transform (SegmentedScaler) while keeping PerSampleZeroStart active.

# Pop only the most recent transform applied to the voltage feature
fs.undo_last_transform(domain="features", keys="voltage")

v = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("After undoing SegmentedScaler (PerSampleZeroStart still active):")
# Column 0 is each trace's first value; ~0 indicates ZeroStart is still applied
print(
    f"  First-value |max|: {np.abs(v[:, 0]).max():.2e}  (ZeroStart preserved, all ~0)"
)
print(
    f"  Overall range:     [{v.min():.3f}, {v.max():.3f}]  (no longer bounded to [0,1])"
)

# SOH target transform is unaffected (different domain)
soh_scaled = fs["test"].get_targets(fmt="numpy", targets="soh", rep="transformed")
print(
    f"  SOH scaled range:  [{soh_scaled.min():.3f}, {soh_scaled.max():.3f}]  (unchanged)"
)

fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.show()

Creating a Custom Scaler#

Any object that satisfies the scikit-learn estimator interface can be wrapped by Scaler and used directly in FeatureSet.fit_transform:

class MyScaler(BaseEstimator, TransformerMixin):
    # Interface sketch only; method bodies are intentionally elided.
    def fit(self, X, y=None): ...  # required
    def transform(self, X): ...  # required
    def inverse_transform(self, X): ...  # optional but enables undo

The hard requirements are fit and transform. Adding inverse_transform enables undo_last_transform and unscale_data_for_cols.

Example: `PerSampleStandardScaler`#

sklearn’s StandardScaler standardizes across samples using training-set statistics. The custom scaler below standardizes each sample independently — useful when every measurement has its own mean and variance that should be removed.

Note: Custom scalers cannot be serialized unless they are defined in a separate file, or registered to the mml.supported_scalers registry.

from sklearn.base import BaseEstimator, TransformerMixin


class PerSampleStandardScaler(BaseEstimator, TransformerMixin):
    """
    Standardize each sample to zero mean and unit variance.

    Unlike sklearn's ``StandardScaler``, statistics are computed per sample
    at transform time; no global state is learned from the training set.

    The per-sample mean and standard deviation of the most recent
    ``transform`` call are cached so that ``inverse_transform`` can restore
    that same batch. They are overwritten on every ``transform`` call.
    """

    def __init__(self):
        # Statistics of the last transformed batch, each of shape (n_samples, 1)
        self._sample_mean = None
        self._sample_std = None

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        # No global statistics to learn; all computation is deferred to transform
        return self

    def transform(self, X):
        """
        Standardize every row of ``X`` independently.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            Array of the same shape with zero mean and unit variance per row.
            Constant rows are mapped to all zeros.

        Raises:
            ValueError: If ``X`` is not two-dimensional.

        """
        # Convert first so lists and other array-likes can be validated too
        X = np.asarray(X)
        if X.ndim != 2:
            msg = f"Expected 2D array, got shape {X.shape}"
            raise ValueError(msg)
        mean = X.mean(axis=1, keepdims=True)
        std = X.std(axis=1, keepdims=True)
        # Guard against constant samples (std = 0)
        std = np.where(std == 0, 1.0, std)
        self._sample_mean = mean
        self._sample_std = std
        return (X - mean) / std

    def inverse_transform(self, X):
        """
        Undo the most recent ``transform`` call.

        Args:
            X: Array-like of shape ``(n_samples, n_features)`` with the same
               number of rows (in the same order) as the last transformed batch.

        Returns:
            Array restored to the original per-sample offset and scale.

        Raises:
            RuntimeError: If ``transform`` has not been called yet.
            ValueError: If ``X`` is not 2D or its row count differs from the
                cached statistics (which would otherwise broadcast silently).

        """
        if self._sample_mean is None:
            raise RuntimeError("Scaler has not been applied yet.")
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[0] != self._sample_mean.shape[0]:
            msg = (
                f"Expected 2D array with {self._sample_mean.shape[0]} rows "
                f"(matching the last transform), got shape {X.shape}"
            )
            raise ValueError(msg)
        return X * self._sample_std + self._sample_mean

# Reset to the raw representation before applying a new transform
fs.undo_all_transforms()

# First attempt, before the class is registered. A RuntimeError is caught and
# printed rather than aborting the notebook — presumably raised because a class
# defined inside the notebook is neither registered nor importable from a file.
try:
    fs.fit_transform(
        scaler=Scaler(PerSampleStandardScaler()),
        domain="features",
        keys="voltage",
        fit_to_split="train",
    )
except RuntimeError as e:
    print(e)

# We can register it to the builtin scaler registry
mml.scaler_registry.register(
    "PerSampleStandardScaler",
    PerSampleStandardScaler,
)

# And now we can use the scaler
fs.fit_transform(
    scaler=PerSampleStandardScaler(),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.show()

However, adding custom classes to the built-in registry only allows usage of that class in this same environment. Restarting the kernel would clear user-added items in the registry.

For more robust serialization, move the scaler class to its own Python file.

from utils.my_scaler import PerSampleStandardScaler as MyScaler

# Reset, then apply the copy of the scaler imported from its own module
fs.undo_all_transforms()
fs.fit_transform(
    scaler=MyScaler(),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)

# Transformed training traces (kept for interactive inspection; not used below)
v_std = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleStandardScaler (custom)", y=1.02, fontsize=11)
plt.show()

Summary#

Task	Code
List all scalers	`mml.supported_scalers`
Create scaler by name	`Scaler("MinMaxScaler")`
Create scaler by instance	`Scaler(MinMaxScaler())`
Apply a transform	`fs.fit_transform(scaler, domain, keys, fit_to_split)`
Undo last transform	`fs.undo_last_transform(domain, keys)`
Undo all transforms	`fs.undo_all_transforms()`
Per-sample OCV removal	`PerSampleZeroStart`
Per-sample normalisation	`PerSampleMinMaxScaler()`
Segment-wise normalisation	`SegmentedScaler(boundaries=(...), scaler="MinMaxScaler")`
Sign flip	`Negate()`
Absolute value	`Absolute()`
Custom scaler	Subclass `BaseEstimator, TransformerMixin`; implement `fit` + `transform`
Inverse-scale external data	`fs.unscale_data_for_cols(data, domain, columns)`