# Source code for skrough.dataprep

"""Data preparation functions.

The :mod:`skrough.dataprep` module delivers helper functions to prepare data to the form
required by other methods and algorithms.
"""

from __future__ import annotations

import logging
from typing import Literal, overload

import numpy as np
import pandas as pd

import skrough.typing as rght
from skrough.logs import log_start_end

DEFAULT_SHUFFLED_PREFIX = "shuffled_"


logger = logging.getLogger(__name__)


# Typing overload: when ``return_unique_values`` is ``False`` (the default), the result
# is the 2-tuple ``(factorized values, number of distinct values)``.
@overload
def prepare_factorized_vector(
    values: np.ndarray,
    return_unique_values: Literal[False] = False,
) -> tuple[np.ndarray, int]:
    ...


# Typing overload: when ``return_unique_values`` is ``True``, the result is the 3-tuple
# ``(factorized values, number of distinct values, distinct values)``.
@overload
def prepare_factorized_vector(
    values: np.ndarray,
    return_unique_values: Literal[True],
) -> tuple[np.ndarray, int, np.ndarray]:
    ...


# TODO: add handling also for pd.Series
@log_start_end(logger)
def prepare_factorized_vector(
    values: np.ndarray, return_unique_values: bool = False
) -> tuple[np.ndarray, int] | tuple[np.ndarray, int, np.ndarray]:
    """Factorize values.

    Prepare enumerated values along with a number of distinct values.

    Args:
        values: A 1d array to be factorized.

    Returns:
        Result is consisted of the following elements

        - factorized data returned in form of 1d array
        - feature domain size

    Examples:
        >>> ar = np.array([3, 4, 3, 3, 2])
        >>> prepare_factorized_vector(ar)
        (array([0, 1, 0, 0, 2]), 3)
    """
    # TODO: check if get_uniques_and_compacted can be used instead of pd.factorize
    factorized_values, uniques = pd.factorize(values, use_na_sentinel=False)
    count_distinct = len(uniques)
    if return_unique_values:
        return factorized_values, count_distinct, uniques
    return factorized_values, count_distinct


# TODO: add handling also for pd.DataFrame
@log_start_end(logger)
def prepare_factorized_array(
    data_x: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Factorize data table.

    Factorize each column of a data table independently and return the feature
    domain sizes (numbers of distinct values) of the columns.

    Args:
        data_x: A 2D dataset to be factorized.

    Returns:
        The result consists of the following elements

        - factorized data returned in the form of a 2D array
        - data feature domain sizes returned in the form of a 1d array, i.e., a
          single value (domain size) returned for each column

        For an empty dataset (no rows or no columns) an integer array of the input
        shape and an integer array of zeros (one per column) are returned.

    Examples:
        >>> ar = np.array([[5, 3],
        ...                [9, 3],
        ...                [5, 2]])
        >>> prepare_factorized_array(ar)
        (array([[0, 0],
                [1, 0],
                [0, 1]]),
        array([2, 2]))
    """
    if data_x.size == 0:
        # Nothing to factorize. Return integer-typed results so that the dtypes agree
        # with the non-empty path, where both the codes and the domain sizes are
        # integers (a plain ``np.zeros`` would yield float64 domain sizes).
        return (
            np.zeros(data_x.shape, dtype=np.intp),
            np.zeros(data_x.shape[1], dtype=int),
        )
    factorized = [
        prepare_factorized_vector(data_x[:, i]) for i in range(data_x.shape[1])
    ]
    # Transpose the per-column (codes, domain size) pairs into two parallel tuples.
    column_codes, column_counts = zip(*factorized)
    x: np.ndarray = np.column_stack(column_codes)
    x_counts = np.array(column_counts)
    return x, x_counts


@log_start_end(logger)
def prepare_factorized_data(
    df: pd.DataFrame,
    target_attr: str | int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """Factorize conditional and target attrs of a data frame.

    Split the data frame into the target column and the remaining (conditional)
    columns, factorize both parts and return them together with their feature domain
    sizes.

    Args:
        df: A dataset to be factorized.
        target_attr: Identifier of the target column in the input dataset.

    Returns:
        The result consists of the following elements

        - factorized conditional data returned in the form of a 2D array
        - conditional data feature domain sizes returned in the form of a 1d array,
          i.e., a single value (domain size) returned for each column
        - factorized target data returned in the form of a 1d array
        - target feature domain size

    Examples:
        >>> df = pd.DataFrame([[5, 3, 3],
        ...                    [9, 3, 1],
        ...                    [5, 2, 3]], columns=["a", "b", "dec"])
        >>> prepare_factorized_data(df, target_attr="dec")
        (array([[0, 0],
                [1, 0],
                [0, 1]]),
        array([2, 2]),
        array([0, 1, 0]),
        2)
    """
    target_values = df[target_attr].to_numpy()
    conditional_values = df.drop(columns=target_attr).to_numpy()
    # Conditional part first, then the target part; both results are flattened into
    # a single 4-element tuple.
    return (
        *prepare_factorized_array(conditional_values),
        *prepare_factorized_vector(target_values),
    )


# TODO: make target_attr optional, so one can shuffle just the conditional attrs
# without requiring the target attr to be present
@log_start_end(logger)
def add_shuffled_attrs(
    df: pd.DataFrame,
    target_attr: str | int,
    shuffled_attrs_prefix: str = DEFAULT_SHUFFLED_PREFIX,
    seed: rght.Seed = None,
) -> pd.DataFrame:
    """Add shuffled attrs.

    For each conditional attribute of the input dataset (i.e., every attribute except
    the distinguished target attribute) add a shuffled counterpart attribute. The
    shuffled counterpart holds the same values as the original attribute but permuted
    in random order. In other words, it has the same empirical distribution as the
    original attribute but is (possibly) uncorrelated with the target attribute.

    Args:
        df: Input dataset.
        target_attr: Identifier of the target column in the input dataset.
        shuffled_attrs_prefix: A prefix for shuffled attribute names.
        seed: Random seed. Defaults to :obj:`None`.

    Returns:
        A dataset consisting of the original conditional attributes, followed by
        their shuffled counterparts, followed by the target attribute.

    Examples:
        >>> df = pd.DataFrame([[5, 3, 3],
        ...                    [9, 3, 1],
        ...                    [5, 2, 3]], columns=["a", "b", "dec"])
        >>> add_shuffled_attrs(df, target_attr="dec", shuffled_attrs_prefix="s_", seed=0)
           a  b  s_a  s_b  dec
        0  5  3    5    2    3
        1  9  3    5    3    1
        2  5  2    9    3    3
    """
    generator = np.random.default_rng(seed)
    target_column = df[target_attr]
    conditional = df.drop(columns=target_attr)
    # Each column is permuted by its own call, so columns are reordered independently.
    shuffled = conditional.apply(generator.permutation)
    shuffled.columns = list(shuffled_attrs_prefix + shuffled.columns.astype(str))
    return pd.concat([conditional, shuffled, target_column], axis=1)