Source code for modularml.core.data.featureset_view

"""Immutable row and column projection of a :class:`FeatureSet`."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import numpy as np
import pyarrow as pa

from modularml.core.data.sample_collection_mixin import SampleCollectionMixin
from modularml.core.data.schema_constants import (
    DOMAIN_FEATURES,
    DOMAIN_SAMPLE_UUIDS,
    DOMAIN_TAGS,
    DOMAIN_TARGETS,
)
from modularml.core.experiment.experiment_context import ExperimentContext
from modularml.core.io.protocols import Configurable
from modularml.core.splitting.split_mixin import SplitMixin
from modularml.utils.representation.summary import Summarizable

if TYPE_CHECKING:
    from numpy.typing import NDArray

    from modularml.core.data.featureset import FeatureSet
    from modularml.core.data.sample_collection import SampleCollection



[docs]
@dataclass
class FeatureSetView(SampleCollectionMixin, SplitMixin, Summarizable, Configurable):
    """
    Immutable row and column projection of a :class:`FeatureSet`.

    Intended for inspection, export, and analysis.

    Description:
        A view holds a reference to its parent :class:`FeatureSet` together
        with the row indices and column names it exposes. Once set, `source`,
        `indices`, and `columns` cannot be reassigned (enforced in
        `__setattr__`); `label` is the only attribute that may be changed
        afterwards. Note that this guards against reassignment only; the
        index array and column list objects themselves are not copied or
        locked. Instances are unhashable, and equality ignores `label`.

    Attributes:
        source (FeatureSet): The parent :class:`FeatureSet` this view projects.
        indices (NDArray[np.int64]): Row indices into the source. Negative
            entries are excluded by `valid_indices`.
        columns (list[str]): Column names included in this view.
        label (str | None): Optional label for this view.

    """

    # Frozen after first assignment (see `__setattr__`).
    source: FeatureSet
    indices: NDArray[np.int64]
    columns: list[str]
    # Free-form name; may be reassigned at any time.
    label: str | None = None


[docs]
    @classmethod
    def from_featureset(
        cls,
        fs: FeatureSet,
        *,
        rows: NDArray[np.int64] | None = None,
        columns: list[str] | None = None,
        label: str | None = None,
    ) -> FeatureSetView:
        """
        Create a :class:`FeatureSetView` from a :class:`FeatureSet`.

        Args:
            fs (FeatureSet): Source :class:`FeatureSet`.
            rows (NDArray[np.int64] | None): Row indices to include.
                Defaults to all rows.
            columns (list[str] | None): Column names to include.
                Defaults to all columns.
            label (str | None): Optional label for the view.

        Returns:
            FeatureSetView: A new view over the given :class:`FeatureSet`.

        """
        if rows is None:
            rows = np.arange(fs.n_samples)
        if columns is None:
            columns = fs.collection.get_all_keys(
                include_domain_prefix=True,
                include_rep_suffix=True,
            )
        return cls(
            source=fs,
            indices=np.asarray(rows, dtype=np.int64),
            columns=columns,
            label=label,
        )


    # ================================================
    # Properties & Dunders
    # ================================================
    def __repr__(self):
        return f"FeatureSetView(source='{self.source.label}', n_samples={self.n_samples}, label='{self.label}')"

    def __eq__(self, other):
        """Compare source and indices. Label is ignored."""
        if not isinstance(other, FeatureSetView):
            msg = f"Cannot compare equality between FeatureSetView and {type(other)}"
            raise TypeError(msg)

        return (self.source == other.source) and (self.indices == other.indices)

    # Instances are explicitly unhashable: equality is value-based (see
    # `__eq__`), so views cannot be used as dict keys or set members.
    __hash__ = None

    def __setattr__(self, name, value):
        # Overrriding __setattr__ to make all attributes frozen except label
        frozen_attrs = ["source", "indices", "columns"]

        # Check if attr is frozen and if it's already set (to allow init)
        if name in frozen_attrs and name in self.__dict__:
            msg = f"Cannot reassign frozen attribute '{name}'"
            raise AttributeError(msg)

        # Use default __setattr__ behavior
        super().__setattr__(name, value)

    @property
    def valid_indices(self) -> np.ndarray:
        """Indices >= 0 (used for data lookup)."""
        return self.indices[self.indices >= 0]

    # ================================================
    # SampleCollectionMixin
    # ================================================
    def _resolve_caller_attributes(
        self,
    ) -> tuple[SampleCollection, list[str] | None, np.ndarray | None]:
        return (
            self.source.collection,
            self.columns,
            self.indices,
            self.source.node_id,
        )

    # ================================================
    # Comparators
    # ================================================

[docs]
    def is_disjoint_with(self, other: FeatureSetView) -> bool:
        """
        Check if this view has no overlapping samples.

        Description:
            If both views share the same source :class:`FeatureSet`, comparison
            is based on indices. If they originate from different sources,
            comparison falls back to `DOMAIN_SAMPLE_UUIDS` to ensure identity
            consistency across saved or merged datasets.

        Args:
            other (FeatureSetView): The other view to compare against.

        Returns:
            bool: True if the views share no common samples.

        """
        if not isinstance(other, FeatureSetView):
            msg = f"Comparison only valid between FeatureSetViews, not {type(other)}"
            raise TypeError(msg)

        # Same collection (fast index-based check)
        if self.source is other.source:
            return (
                len(np.intersect1d(self.indices, other.indices, assume_unique=True))
                == 0
            )

        # Otherwise compare SAMPLE_IDs
        ids_self = {
            self.source.collection.table[DOMAIN_SAMPLE_UUIDS].to_pylist()[i]
            for i in self.indices
        }
        ids_other = {
            other.source.collection.table[DOMAIN_SAMPLE_UUIDS].to_pylist()[i]
            for i in other.indices
        }
        return ids_self.isdisjoint(ids_other)



[docs]
    def get_overlap_with(self, other: FeatureSetView) -> list[str]:
        """
        Get overlapping sample identifiers between two :class:`FeatureSetView` instances.

        Args:
            other (FeatureSetView): The other view to compare against.

        Returns:
            list[str]: A list of overlapping `DOMAIN_SAMPLE_UUIDS` values.

        """
        if not isinstance(other, FeatureSetView):
            msg = f"Comparison only valid between FeatureSetViews, not {type(other)}"
            raise TypeError(msg)

        # Same collection (fast index-based check)
        if self.source is other.source:
            overlap = np.intersect1d(self.indices, other.indices, assume_unique=True)
            return (
                self.source.collection.table[DOMAIN_SAMPLE_UUIDS]
                .take(pa.array(overlap))
                .to_pylist()
            )

        # Otherwise compare SAMPLE_IDs
        ids_self = {
            self.source.collection.table[DOMAIN_SAMPLE_UUIDS].to_pylist()[i]
            for i in self.indices
        }
        ids_other = {
            other.source.collection.table[DOMAIN_SAMPLE_UUIDS].to_pylist()[i]
            for i in other.indices
        }
        return list(ids_self.intersection(ids_other))


    # ================================================
    # Convenience Methods
    # ================================================

[docs]
    def expand_columns(self, label: str | None = None) -> FeatureSetView:
        """
        Create a new view without column filtering.

        Description:
            A new view is created with the same indices, but without
            any filters over which columns to include. The returned
            view will include all columns of the source :class:`FeatureSet`.

        Args:
            label (str | None): Optional label for the expanded view.
                Defaults to `"{self.label}_expanded"`.

        Returns:
            FeatureSetView: A new view with all columns included.

        """
        return FeatureSetView.from_featureset(
            fs=self.source,
            rows=self.indices,
            columns=None,
            label=label or f"{self.label}_expanded",
        )


    # ================================================
    # Configurable
    # ================================================

[docs]
    def get_config(self) -> dict[str, Any]:
        """
        Return configuration required to reconstruct this view.

        Returns:
            dict[str, Any]: View configuration.

        """
        return {
            "source": {
                "node_label": self.source.label,
                "node_id": self.source.node_id,
            },
            "indices": np.asarray(self.indices).tolist(),
            "columns": self.columns,
            "label": self.label,
        }



[docs]
    @classmethod
    def from_config(cls, config: dict[str, Any]) -> FeatureSetView:
        """
        Construct a view from configuration.

        Args:
            config (dict[str, Any]): View configuration.

        Returns:
            FeatureSetView: Reconstructed view.

        """
        from modularml.core.data.featureset import FeatureSet

        if not all(x in config for x in ["source", "indices", "columns", "label"]):
            raise ValueError("Invalid config for FeatureSetView.")

        # Re-link source using ExperimentContext
        exp_ctx = ExperimentContext.get_active()
        try:
            node = exp_ctx.get_node(node_id=config["source"]["node_id"])
        except KeyError as e:
            msg = (
                f"There are no registered nodes with id: '{config['source']['node_id']}'. "
                f"Ensure FeatureSet '{config['source']['node_label']}' exists in the current ExperimentContext."
            )
            raise RuntimeError(msg) from e
        if not isinstance(node, FeatureSet):
            msg = f"Node with ID ('{config['source']['node_id']}') is not a FeatureSet. Received: {type(node)}."
            raise TypeError(msg)

        # Create view
        return FeatureSetView(
            source=node,
            indices=np.asarray(config["indices"], dtype=np.int64),
            columns=config["columns"],
            label=config["label"],
        )


    # ================================================
    # Representation
    # ================================================
    def _summary_rows(self) -> list[tuple]:
        """
        Assemble the `(name, value)` rows describing this view.

        Returns:
            list[tuple]: Rows for `label`, `source`, `n_samples`, and
            `columns`. The `columns` value is itself a list of
            `(domain, keys)` pairs for features, targets, and tags, where
            `keys` is the stringified key list without domain prefix but
            with rep suffix.

        """
        key_options = {"include_domain_prefix": False, "include_rep_suffix": True}

        def _keys_as_text(getter) -> str:
            # Keys are shown without their domain prefix, since each row
            # already names the domain.
            return str(getter(**key_options))

        rows: list[tuple] = [
            ("label", self.label),
            ("source", self.source.label),
            ("n_samples", self.n_samples),
        ]

        columns_by_domain = [
            (DOMAIN_FEATURES, _keys_as_text(self.get_feature_keys)),
            (DOMAIN_TARGETS, _keys_as_text(self.get_target_keys)),
            (DOMAIN_TAGS, _keys_as_text(self.get_tag_keys)),
        ]
        rows.append(("columns", columns_by_domain))
        return rows