How to: Create and Use Scalers#
ModularML’s Scaler class provides a unified interface for applying preprocessing transforms to
FeatureSet data. It wraps any scikit-learn-compatible transformer and integrates with
fit_transform, undo history, and serialization.
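This notebook covers the built-in scalers, the Scaler wrapper, PerSampleZeroStart, PerSampleMinMaxScaler, SegmentedScaler, Negate and Absolute, chaining transforms, and creating a custom scaler.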
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import modularml as mml
from modularml import FeatureSet, Scaler
from modularml.scalers import (
Absolute,
Negate,
PerSampleMinMaxScaler,
PerSampleZeroStart,
SegmentedScaler,
)
We’ll use synthetic HPPC (Hybrid Pulse Power Characterization) battery data throughout this notebook. Each sample simulates a standard HPPC pulse sequence:
OCV observation (10 s) — cell resting at open-circuit voltage
Charge pulse (10 s) — 1.2 A applied; ohmic jump then exponential rise
Rest after charge (40 s) — current removed; ohmic recovery then slow relaxation
Discharge pulse (10 s) — 1.2 A drawn; ohmic drop then exponential decay
Rest after discharge (40 s) — ohmic recovery then slow relaxation back to OCV
Cells span a range of state-of-health (SOH) values, degrading from 100% to ~50%.
# Generate the synthetic HPPC dataset with the helper from the local utils package.
from utils.hppc_data_gen import get_mock_hppc_data
voltage, soh, cell_ids, group_ids = get_mock_hppc_data(n_samples=1000)
# Print a quick summary of the generated arrays.
print(f"Samples: {voltage.shape[0]}")
print(f"Voltage shape: {voltage.shape}")
# Column 0 of each trace is reported as the open-circuit voltage (OCV).
print(f"OCV range: [{voltage[:, 0].min():.2f}, {voltage[:, 0].max():.2f}] V")
print(f"SOH range: [{soh.min():.1f}, {soh.max():.1f}] %")
print(f"Voltage overall: [{voltage.min():.3f}, {voltage.max():.3f}] V")
Data and Setup#
# Build the FeatureSet used throughout the notebook: voltage traces are the
# features, SOH is the target, and the cell / group identifiers are kept as tags.
fs = FeatureSet.from_dict(
label="HPPCData",
data={
"voltage": voltage.tolist(),
"soh": soh.tolist(),
"cell_id": cell_ids.tolist(),
"group_id": group_ids.tolist(),
},
feature_keys="voltage",
target_keys="soh",
tag_keys=["cell_id", "group_id"],
)
print(fs)
print(f"Feature shapes: {fs.get_feature_shapes()}")
Split by cell group to prevent data leakage between train / val / test.
# Random 60/20/20 split, grouped on the "group_id" tag and seeded for reproducibility.
fs.split_random(
ratios={"train": 0.6, "val": 0.2, "test": 0.2},
group_by="group_id",
seed=42,
)
# Report the size of each split and which group ids it contains.
for name, view in fs.splits.items():
groups = view.get_tags(fmt="numpy", tags="group_id")
print(f" {name}: {len(view)} samples, groups: {np.unique(groups)}")
Define a reusable plotting helper. Each split gets its own panel; traces are colored by SOH (dark blue = high, light blue = low).
def plot_timeseries(
    fs: FeatureSet,
    columns: str | list[str],
    splits: list[str] | None = None,
    n_samples: int = 100,
    color_by: str = "targets.soh.raw",
    color_vbounds: tuple = (50, 100),
    xlabel: str = "Time (s)",
    ylabel: str = "Voltage (V)",
    clabel: str = "SOH (%)",
    marker="-",
    seed: int = 13,
):
    """
    Plot time-series columns from a FeatureSet, one panel per split.

    Args:
        fs: FeatureSet to visualise.
        columns: Fully-qualified column name(s), e.g. ``"features.voltage.raw"``.
            Multiple columns are flattened and horizontally stacked.
        splits: Splits to include. Defaults to all registered splits.
        n_samples: Maximum number of distinct traces to draw per panel. If a
            split holds fewer samples, all of them are drawn.
        color_by: Fully-qualified scalar column used for the colormap.
        color_vbounds: ``(vmin, vmax)`` for the colormap.
        xlabel: Axis x-label.
        ylabel: Axis y-label.
        clabel: Colorbar label.
        marker: Matplotlib format string passed to ``Axes.plot`` (e.g. ``"-"``
            or ``"."``).
        seed: RNG seed for reproducible sample selection.

    Returns:
        ``(fig, axes)``. ``axes`` is always a 1-D array with one Axes per
        split, including when only a single split is plotted.

    Raises:
        ValueError: If none of ``columns`` matches a key returned for a split.
    """

    def order_splits(values: list[str]) -> list[str]:
        # Conventional train/val/test order first; unknown split names last.
        priority = {"train": 0, "val": 1, "test": 2}
        return sorted(values, key=lambda x: priority.get(x, 99))

    rng = np.random.default_rng(seed)
    scm = plt.cm.ScalarMappable(
        cmap=plt.cm.Blues,
        norm=plt.Normalize(vmin=color_vbounds[0], vmax=color_vbounds[1]),
    )
    columns = columns if isinstance(columns, list) else [columns]
    split_names = order_splits(splits or fs.available_splits)

    fig, axes = plt.subplots(
        figsize=(7, 2.5),
        ncols=len(split_names),
        sharex=True,
        sharey=True,
    )
    # With a single column plt.subplots returns a bare Axes; normalise to an
    # array so the indexing below (and the return value) is uniform.
    axes = np.atleast_1d(axes)

    for i, split_label in enumerate(split_names):
        split_view = fs.get_split(split_label)
        res = split_view.get_data(
            columns=columns,
            fmt="dict_numpy",
            include_domain_prefix=True,
            include_rep_suffix=True,
        )

        # Match user-supplied column specs to actual keys returned by get_data.
        ordered_keys = []
        for c in columns:
            if c in res:
                # An exact fully-qualified match wins, so the requested
                # representation is never confused with a sibling one.
                ordered_keys.append(c)
                continue
            # Fallback: compare the dot-separated parts of the spec (wildcards
            # removed) against the name component of each returned key.
            parts = c.replace("*", "").split(".")
            for k in res:
                if any(p == k.split(".")[1] for p in parts):
                    ordered_keys.append(k)
                    break
        if not ordered_keys:
            msg = (
                f"None of the columns {columns} matched the data keys "
                f"{list(res)} of split '{split_label}'."
            )
            raise ValueError(msg)

        # NOTE(review): fmt="np" here vs. fmt="numpy" elsewhere in the notebook
        # — presumably aliases; confirm against the get_data documentation.
        color_vals = split_view.get_data(columns=[color_by], fmt="np").reshape(-1)
        flat_data = np.column_stack(
            [res[k].reshape(len(color_vals), -1) for k in ordered_keys],
        )

        # Sample without replacement so every drawn trace is distinct, and cap
        # the draw at the number of samples the split actually holds.
        n_available = len(color_vals)
        n_draw = min(n_samples, n_available)
        sample_idxs = rng.choice(n_available, size=n_draw, replace=False)
        for idx in sample_idxs:
            axes[i].plot(flat_data[idx], marker, color=scm.to_rgba(color_vals[idx]))

        axes[i].set_title(split_label, fontsize=10)
        axes[i].set_xlabel(xlabel, fontsize=10)

    # The y-axis is shared, so only the left-most panel is labelled.
    axes[0].set_ylabel(ylabel, fontsize=10)
    fig.tight_layout(pad=1)
    # Reserve space on the right for a dedicated colorbar axis.
    fig.subplots_adjust(right=0.85)
    cbar_ax = fig.add_axes([0.87, 0.19, 0.02, 0.7])
    cbar = fig.colorbar(scm, cax=cbar_ax)
    cbar.set_label(clabel, fontsize=10)
    return fig, axes
# Plot the untransformed ("raw") voltage traces, one panel per split.
fig, axes = plot_timeseries(fs, columns="features.voltage.raw")
plt.suptitle("Raw HPPC voltage", y=1.02, fontsize=11)
plt.show()
Built-in Scalers#
All registered scalers are accessible via mml.supported_scalers.
The registry contains both ModularML-native and scikit-learn scalers.
# Display the scaler registry (bare expression: the notebook renders its value).
mml.supported_scalers
The ModularML-native (non-sklearn) scalers are:
| Scaler | What it does |
|---|---|
| `PerSampleZeroStart` | Shifts each sample so its first value equals zero |
| `PerSampleMinMaxScaler` | Scales each sample independently to a target range (default [0, 1]) |
| `SegmentedScaler` | Applies independent scalers to contiguous feature sub-regions |
| `Negate` | Multiplies all values by −1 |
| `Absolute` | Replaces each value with its absolute value |
The Scaler Wrapper#
Scaler is a thin adapter that gives any sklearn-compatible transformer a consistent ModularML
interface. It can be constructed three ways:
# The three accepted ways to specify the wrapped transformer:
Scaler("MinMaxScaler") # by registry name (case-insensitive)
Scaler(MinMaxScaler) # by class
Scaler(MinMaxScaler(clip=True)) # by instance
You can also pass a string, class, or instance directly to FeatureSet.fit_transform — it
will be wrapped automatically.
Key methods:
| Method | Description |
|---|---|
| `fit` | Learn parameters from data |
| `transform` | Apply the fitted transform |
| `fit_transform` | Fit and transform in one step |
| `inverse_transform` | Reverse the transform (if supported) |
| `clone_unfitted` | Return a fresh copy with the same config but no learned state |
from sklearn.preprocessing import MinMaxScaler
# Three equivalent constructors
s1 = Scaler("MinMaxScaler")
s2 = Scaler(MinMaxScaler)
s3 = Scaler(MinMaxScaler())
print(f"Name: {s1.scaler_name}")
# NOTE(review): _is_fit is a private attribute; prefer a public accessor if
# Scaler exposes one.
print(f"Is fit (before): {s1._is_fit}")
# Small seeded random matrix (10 samples x 5 features) to fit on.
X = np.random.default_rng(0).normal(size=(10, 5))
s1.fit(X)
print(f"Is fit (after): {s1._is_fit}")
X_scaled = s1.transform(X)
print(f"Scaled range: [{X_scaled.min():.3f}, {X_scaled.max():.3f}]")
# Clone: same config, no learned state
s1_clone = s1.clone_unfitted()
print(f"Clone is fit: {s1_clone._is_fit}")
PerSampleZeroStart#
What it does: Subtracts the first value of each sample from every element in that sample:
$$x_i^\prime = x_i - x_i[0]$$
Why it’s useful for HPPC data: The absolute OCV varies with SOC and cell-to-cell spread (here 2.0–3.6 V). Subtracting the initial value removes this offset so all traces start at zero and only the delta-V response to the current pulse is retained. Models trained on zero-started data learn the electrochemical dynamics rather than the SOC level.
Fitting behaviour: Statistics are computed per-sample at transform time (no global
statistics), so fit_to_split does not affect the result. Specifying it is still good practice.
# Start from a clean state so this demo is independent of earlier cells.
fs.undo_all_transforms()
# The scaler is passed as a class here; other cells pass an instance or a name.
fs.fit_transform(
scaler=PerSampleZeroStart,
domain="features",
keys="voltage",
fit_to_split="train",
)
# Compare the raw and transformed representations of the training split.
# NOTE(review): [:, 0] is treated as each sample's first time step, i.e. the
# arrays are assumed to be (n_samples, n_timesteps) — TODO confirm.
v_raw = fs["train"].get_features(fmt="numpy", features="voltage", rep="raw")
v_zs = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print(
f"Raw first-value range: [{v_raw[:, 0].min():.3f}, {v_raw[:, 0].max():.3f}] V"
)
print(f"ZeroStart first-value |max|: {np.abs(v_zs[:, 0]).max():.3e} (all ~0)")
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleZeroStart", y=1.02, fontsize=11)
plt.show()
PerSampleMinMaxScaler#
What it does: Scales each sample independently so its values span feature_range (default [0, 1]):
$$x_i^\prime = \frac{x_i - \min(x_i)}{\max(x_i) - \min(x_i)}$$
Contrast with sklearn’s MinMaxScaler: sklearn’s version computes min and max across the
training set — one scalar per feature dimension. PerSampleMinMaxScaler uses per-sample
statistics, making it invariant to the absolute voltage level and amplitude differences between
cells and SOC levels.
Fitting behaviour: Like PerSampleZeroStart, statistics are recomputed per-sample at
transform time.
# Reset, then apply PerSampleMinMaxScaler with its default settings.
fs.undo_all_transforms()
fs.fit_transform(
scaler=PerSampleMinMaxScaler(),
domain="features",
keys="voltage",
fit_to_split="train",
)
# Transformed training data, kept for interactive inspection (not used below).
v_scaled = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleMinMaxScaler", y=1.02, fontsize=11)
plt.show()
SegmentedScaler#
What it does: Partitions the feature vector into contiguous segments and fits an independent scaler on each segment:
[ ─── segment 0 ─── | ─── segment 1 ─── | ─── ... ─── ]
boundaries[0] … boundaries[1]   boundaries[1] … boundaries[2]
A cloned copy of the template scaler is fit independently on each slice of the training data.
Why it’s useful for HPPC data: The OCV, charge pulse, rest, discharge, and rest regions
occupy very different voltage ranges. A single global scaler compresses or expands regions
unevenly. SegmentedScaler applies an independent normalization to each protocol region,
preserving its full dynamic range.
Boundaries must be a tuple of strictly increasing integers, starting at 0 and ending at the total feature length.
Fitting behaviour: The underlying scaler (e.g., MinMaxScaler) learns global statistics
across training samples for each segment, so fit_to_split="train" is important here.
import itertools

# Visualise the HPPC segment layout on a representative trace
SEGMENT_LABELS = ["OCV", "Charge", "Rest 1", "Discharge", "Rest 2"]
SEGMENT_COLORS = ["#cce5f0", "#f0cccc", "#ccf0cc", "#f0e0cc", "#e0ccf0"]
# NOTE(review): the protocol description (10 s OCV, 10 s charge, 40 s rest,
# 10 s discharge, 40 s rest) and the boundaries used for SegmentedScaler later
# in the notebook suggest 10 and 60 rather than 9 and 59 — confirm that these
# values are intentional.
HPPC_BOUNDARIES = [0, 9, 20, 59, 70, 110]

fig, ax = plt.subplots(figsize=(8, 3))
# Take the first sample and flatten it to a 1-D trace. This yields the full
# trace whether the array is (n_samples, n_timesteps) or carries an extra
# singleton feature axis, instead of collapsing to a single scalar.
sample = np.asarray(
    fs.get_features(fmt="numpy", features="voltage", rep="raw")
)[0].reshape(-1)
ax.plot(sample, "k-", lw=1.5)

# Shade each protocol segment and label it at the bottom of the axes.
# strict=True guards against a label/colour list that does not match the
# number of segments.
for (start, end), color, label in zip(
    itertools.pairwise(HPPC_BOUNDARIES),
    SEGMENT_COLORS,
    SEGMENT_LABELS,
    strict=True,
):
    # Stop half a step short of the next boundary so adjacent spans do not overlap.
    ax.axvspan(start, end - 0.5, alpha=0.4, color=color)
    ax.text(
        (start + end) / 2, ax.get_ylim()[0], label, ha="center", va="bottom", fontsize=8
    )

ax.set_xlabel("Time (s)")
ax.set_ylabel("Voltage (V)")
ax.set_title(
    f"HPPC Protocol — SegmentedScaler boundaries: {HPPC_BOUNDARIES}", fontsize=10
)
plt.tight_layout()
plt.show()
# Reset, then apply PerSampleZeroStart independently within each segment.
fs.undo_all_transforms()
# Five boundaries delimit four segments; new_bounds is reused by the chaining
# example later in the notebook.
new_bounds = [0, 20, 60, 70, 110]
fs.fit_transform(
scaler=SegmentedScaler(boundaries=new_bounds, scaler=PerSampleZeroStart),
domain="features",
keys="voltage",
fit_to_split="train",
)
# Transformed training data, kept for interactive inspection (not used below).
v_seg = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
# Point markers instead of lines for this plot.
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed", marker=".")
plt.suptitle("After SegmentedScaler(PerSampleZeroStart)", y=1.03, fontsize=11)
plt.show()
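Note that when the inner scaler learns its statistics from the training split (e.g., MinMaxScaler), values in the val / test panels may slightly exceed [0, 1] because those splits contain cell groups not seen during training. PerSampleZeroStart, used in the example above, computes its statistics per sample and imposes no such bound.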
Negate and Absolute#
These are simple element-wise transforms, most useful as building blocks in a transform chain.
Negate — multiplies every value by −1. Useful when a model or loss expects positive values (e.g., converting a discharge voltage drop to a positive deviation).
Absolute — replaces every value with its absolute value, recording the per-element sign mask so the transform can be inverted exactly.
Both have a no-op fit, so fit_to_split has no effect on the output.
# Reset so Negate is applied to the raw voltage.
fs.undo_all_transforms()
# Negate
fs.fit_transform(
scaler=Negate(), domain="features", keys="voltage", fit_to_split="train"
)
v_raw = fs["train"].get_features(fmt="numpy", features="voltage", rep="raw")
v_neg = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("Negate")
print(f" Raw range: [{v_raw.min():.3f}, {v_raw.max():.3f}] V")
print(f" Negated range: [{v_neg.min():.3f}, {v_neg.max():.3f}] V")
# Reset again so the Absolute demo does not build on the negated data.
fs.undo_all_transforms()
# Absolute
# Apply ZeroStart first so voltage deviations straddle zero (+/-);
# Absolute then maps both directions onto non-negative values.
fs.fit_transform(
scaler=PerSampleZeroStart, domain="features", keys="voltage", fit_to_split="train"
)
fs.fit_transform(
scaler=Absolute(), domain="features", keys="voltage", fit_to_split="train"
)
v_abs = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("Absolute (after PerSampleZeroStart)")
print(f" Min value: {v_abs.min():.6f} (all non-negative)")
# Leave the FeatureSet untransformed for the next section.
fs.undo_all_transforms()
Chaining Transforms#
Each call to fit_transform builds on the current "transformed" representation, creating a
chain that can be unwound one step at a time (undo_last_transform) or all at once
(undo_all_transforms).
A natural preprocessing pipeline for HPPC voltage:
PerSampleZeroStart — removes the per-sample OCV offset so all traces start at 0
SegmentedScaler(MinMaxScaler) — independently normalises each protocol region to [0, 1]
MinMaxScaler on the SOH target — normalises the prediction target across training samples
# Start the chain from the raw data.
fs.undo_all_transforms()
# Step 1: Remove per-sample OCV offset
fs.fit_transform(
scaler=PerSampleZeroStart,
domain="features",
keys="voltage",
fit_to_split="train",
)
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("Step 1 - PerSampleZeroStart", y=1.02, fontsize=11)
plt.show()
# Step 2: Normalise each HPPC segment independently
# (new_bounds comes from the SegmentedScaler cell earlier in the notebook;
# the inner scaler is given by registry name here.)
fs.fit_transform(
scaler=SegmentedScaler(boundaries=new_bounds, scaler="MinMaxScaler"),
domain="features",
keys="voltage",
fit_to_split="train",
)
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("Step 2 - SegmentedScaler(MinMaxScaler)", y=1.02, fontsize=11)
plt.show()
# Step 3: Scale SOH target across training samples
fs.fit_transform(
scaler=MinMaxScaler(),
domain="targets",
keys="soh",
fit_to_split="train",
)
# Inspect the target on the test split, which was not used for fitting.
soh_raw = fs["test"].get_targets(fmt="numpy", targets="soh", rep="raw")
soh_scaled = fs["test"].get_targets(fmt="numpy", targets="soh", rep="transformed")
print(f"SOH raw range: [{soh_raw.min():.1f}, {soh_raw.max():.1f}] %")
print(f"SOH scaled range: [{soh_scaled.min():.3f}, {soh_scaled.max():.3f}]")
Undo the last feature transform (SegmentedScaler) while keeping PerSampleZeroStart active.
# Pop only the most recent transform on the voltage feature.
fs.undo_last_transform(domain="features", keys="voltage")
# NOTE(review): v[:, 0] is treated as each sample's first time step, i.e. v is
# assumed to be (n_samples, n_timesteps) — TODO confirm.
v = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
print("After undoing SegmentedScaler (PerSampleZeroStart still active):")
print(
f" First-value |max|: {np.abs(v[:, 0]).max():.2e} (ZeroStart preserved, all ~0)"
)
print(
f" Overall range: [{v.min():.3f}, {v.max():.3f}] (no longer bounded to [0,1])"
)
# SOH target transform is unaffected (different domain)
soh_scaled = fs["test"].get_targets(fmt="numpy", targets="soh", rep="transformed")
print(
f" SOH scaled range: [{soh_scaled.min():.3f}, {soh_scaled.max():.3f}] (unchanged)"
)
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.show()
Creating a Custom Scaler#
Any object that satisfies the scikit-learn estimator interface can be wrapped by Scaler and
used directly in FeatureSet.fit_transform:
# Minimal skeleton of a custom scaler; method bodies are elided with `...`.
class MyScaler(BaseEstimator, TransformerMixin):
def fit(self, X, y=None): ...
def transform(self, X): ...
def inverse_transform(self, X): ... # optional but enables undo
The hard requirements are fit and transform. Adding inverse_transform enables
undo_last_transform and unscale_data_for_cols.
Example: PerSampleStandardScaler#
sklearn’s StandardScaler standardizes across samples using training-set statistics.
The custom scaler below standardizes each sample independently — useful when every
measurement has its own mean and variance that should be removed.
Note: Custom scalers cannot be serialized unless they are defined in a separate file,
or registered to the mml.supported_scalers registry.
from sklearn.base import BaseEstimator, TransformerMixin
class PerSampleStandardScaler(BaseEstimator, TransformerMixin):
    """
    Standardize each sample to zero mean and unit variance.

    Unlike sklearn's ``StandardScaler``, statistics are computed per sample
    at transform time; no global state is learned from the training set.

    The statistics of the most recent ``transform`` call are cached so the
    transform can be inverted. ``inverse_transform`` is therefore only
    meaningful for the batch that was transformed last (same rows, same order).
    """

    def __init__(self):
        # Per-sample statistics from the latest transform() call, shape (n_samples, 1).
        self._sample_mean = None
        self._sample_std = None

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there are no global statistics to learn."""
        # No global statistics to learn; all computation is deferred to transform
        return self

    def transform(self, X):
        """
        Standardize every row of ``X`` with its own mean and standard deviation.

        Args:
            X: 2D array-like of shape (n_samples, n_features).

        Returns:
            Array of the same shape with each row standardized. Constant rows
            map to zeros.

        Raises:
            ValueError: If ``X`` is not 2-dimensional.
        """
        # Convert first so that lists and other array-likes are accepted and
        # the dimensionality check below cannot fail on a missing `.ndim`.
        X = np.asarray(X)
        if X.ndim != 2:
            msg = f"Expected 2D array, got shape {X.shape}"
            raise ValueError(msg)

        sample_mean = X.mean(axis=1, keepdims=True)
        sample_std = X.std(axis=1, keepdims=True)
        # Guard against constant samples (std = 0)
        sample_std = np.where(sample_std == 0, 1.0, sample_std)

        # Cache the statistics so inverse_transform can undo this exact batch.
        self._sample_mean = sample_mean
        self._sample_std = sample_std
        return (X - sample_mean) / sample_std

    def inverse_transform(self, X):
        """
        Undo the most recent ``transform`` call.

        Args:
            X: Array-like holding the output of the latest ``transform`` call
                (same rows, same order).

        Returns:
            Array with the cached per-sample mean and standard deviation restored.

        Raises:
            RuntimeError: If ``transform`` has not been called yet.
        """
        if self._sample_mean is None:
            raise RuntimeError("Scaler has not been applied yet.")
        return np.asarray(X) * self._sample_std + self._sample_mean
fs.undo_all_transforms()

# First attempt: use the custom scaler without registering it. A RuntimeError
# is printed rather than allowed to abort the notebook.
try:
    fs.fit_transform(
        scaler=Scaler(PerSampleStandardScaler()),
        domain="features",
        keys="voltage",
        fit_to_split="train",
    )
except RuntimeError as e:
    print(e)

# We can register it to the built-in scaler registry
mml.scaler_registry.register(
    "PerSampleStandardScaler",
    PerSampleStandardScaler,
)

# Reset before applying again: if the attempt above succeeded instead of
# raising, the call below would otherwise stack a second standardisation on
# top of the first.
fs.undo_all_transforms()

# And now we can use the scaler
fs.fit_transform(
    scaler=PerSampleStandardScaler(),
    domain="features",
    keys="voltage",
    fit_to_split="train",
)
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.show()
However, adding custom classes to the built-in registry only allows usage of that class in this same environment. Restarting the kernel would clear user-added items in the registry.
For more robust serialization, move the scaler class to its own Python file.
# Import the same scaler from a standalone module; the alias avoids clashing
# with the class of the same name defined in this notebook.
from utils.my_scaler import PerSampleStandardScaler as MyScaler
fs.undo_all_transforms()
fs.fit_transform(
scaler=MyScaler(),
domain="features",
keys="voltage",
fit_to_split="train",
)
# Transformed training data, kept for interactive inspection (not used below).
v_std = fs["train"].get_features(fmt="numpy", features="voltage", rep="transformed")
fig, axes = plot_timeseries(fs, columns="features.voltage.transformed")
plt.suptitle("After PerSampleStandardScaler (custom)", y=1.02, fontsize=11)
plt.show()
Summary#
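| Task | Code |
|---|---|
| List all scalers | `mml.supported_scalers` |
| Create scaler by name | `Scaler("MinMaxScaler")` |
| Create scaler by instance | `Scaler(MinMaxScaler())` |
| Apply a transform | `fs.fit_transform(scaler=..., domain=..., keys=..., fit_to_split="train")` |
| Undo last transform | `fs.undo_last_transform(domain=..., keys=...)` |
| Undo all transforms | `fs.undo_all_transforms()` |
| Per-sample OCV removal | `PerSampleZeroStart` |
| Per-sample normalisation | `PerSampleMinMaxScaler` |
| Segment-wise normalisation | `SegmentedScaler(boundaries=..., scaler=...)` |
| Sign flip | `Negate` |
| Absolute value | `Absolute` |
| Custom scaler | Subclass `BaseEstimator`, `TransformerMixin` |
| Inverse-scale external data | `unscale_data_for_cols(...)` |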