Source code for skrough.dataprep

"""Data preparation functions.

The :mod:`skrough.dataprep` module delivers helper functions to prepare data to the form
required by other methods and algorithms.
"""

from __future__ import annotations

import logging
from typing import Literal, overload

import numpy as np
import pandas as pd

import skrough.typing as rght
from skrough.logs import log_start_end

# Default value of the ``shuffled_attrs_prefix`` argument of ``add_shuffled_attrs``,
# i.e., the prefix prepended to the names of the shuffled counterpart columns.
DEFAULT_SHUFFLED_PREFIX = "shuffled_"


# Module-level logger; it is handed to the ``log_start_end`` decorator applied to the
# public functions of this module.
logger = logging.getLogger(__name__)


# Typing-only overload: when ``return_unique_values`` is ``False`` (the default) the
# declared result is a 2-tuple of an array and an int.
@overload
def prepare_factorized_vector(
    values: np.ndarray,
    return_unique_values: Literal[False] = False,
) -> tuple[np.ndarray, int]:
    ...


# Typing-only overload: when ``return_unique_values`` is ``True`` the declared result
# is a 3-tuple, i.e., an additional array is appended after the array and the int.
@overload
def prepare_factorized_vector(
    values: np.ndarray,
    return_unique_values: Literal[True],
) -> tuple[np.ndarray, int, np.ndarray]:
    ...


# TODO: add handling also for pd.Series
@log_start_end(logger)
def prepare_factorized_vector(
    values: np.ndarray, return_unique_values: bool = False
) -> tuple[np.ndarray, int] | tuple[np.ndarray, int, np.ndarray]:
    """Factorize a vector of values.

    Replace each value with an integer code and report the number of distinct
    values. Optionally, the distinct values themselves are returned as well.

    Args:
        values: A 1d array to be factorized.
        return_unique_values: Whether the distinct values should be appended to
            the result. Defaults to :obj:`False`.

    Returns:
        The result consists of the following elements

        - factorized data returned in the form of a 1d array
        - feature domain size, i.e., the number of distinct values
        - distinct values, included only when ``return_unique_values`` is
          :obj:`True`

    Examples:
        >>> ar = np.array([3, 4, 3, 3, 2])
        >>> prepare_factorized_vector(ar)
        (array([0, 1, 0, 0, 2]), 3)
    """
    # TODO: check if get_uniques_and_compacted can be used instead of pd.factorize
    # NOTE: per the pandas documentation, ``use_na_sentinel=False`` makes missing
    # values receive a regular code instead of the ``-1`` sentinel.
    codes, distinct_values = pd.factorize(values, use_na_sentinel=False)
    domain_size = len(distinct_values)
    if not return_unique_values:
        return codes, domain_size
    return codes, domain_size, distinct_values
# TODO: add handling also for pd.DataFrame
@log_start_end(logger)
def prepare_factorized_array(
    data_x: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Factorize data table.

    Factorize data table column by column and return statistics of feature domain
    sizes.

    Args:
        data_x: A dataset to be factorized, given as a 2D array.

    Returns:
        The result consists of the following elements

        - factorized data returned in the form of a 2D array (for an empty input,
          i.e., ``data_x.size == 0``, the input array itself is returned)
        - data feature domain sizes returned in the form of a 1d integer array,
          i.e., a single value (domain size) returned for each column

    Examples:
        >>> ar = np.array([[5, 3],
        ...                [9, 3],
        ...                [5, 2]])
        >>> prepare_factorized_array(ar)
        (array([[0, 0],
               [1, 0],
               [0, 1]]), array([2, 2]))
    """
    if data_x.size == 0:
        # ``np.zeros`` defaults to float64; request the default integer dtype so
        # that domain sizes have the same kind of dtype as on the regular path,
        # where they are built from Python ints.
        return data_x, np.zeros(data_x.shape[1], dtype=int)
    factorized = [
        prepare_factorized_vector(data_x[:, i]) for i in range(data_x.shape[1])
    ]
    # Split the per-column ``(codes, domain_size)`` pairs into two parallel tuples.
    res1, res2 = zip(*factorized)
    x: np.ndarray = np.column_stack(res1)
    x_counts = np.array(res2)
    return x, x_counts
@log_start_end(logger)
def prepare_factorized_data(
    df: pd.DataFrame,
    target_attr: str | int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """Factorize conditional and target attrs from a data frame.

    Split the data frame into the target column and the remaining (conditional)
    columns, factorize both parts and return them along with feature domain sizes.

    Args:
        df: A dataset to be factorized.
        target_attr: Identifier of the target column in the input dataset.

    Returns:
        The result consists of the following elements

        - factorized conditional data returned in the form of a 2D array
        - conditional data feature domain sizes returned in the form of a 1d
          array, i.e., a single value (domain size) returned for each column
        - factorized target data returned in the form of a 1d array
        - target feature domain size

    Examples:
        >>> df = pd.DataFrame([[5, 3, 3],
        ...                    [9, 3, 1],
        ...                    [5, 2, 3]], columns=["a", "b", "dec"])
        >>> prepare_factorized_data(df, target_attr="dec")
        (array([[0, 0],
               [1, 0],
               [0, 1]]), array([2, 2]), array([0, 1, 0]), 2)
    """
    target_column = df[target_attr]
    conditional_columns = df.drop(columns=target_attr)
    conditional_values = conditional_columns.to_numpy()
    x, x_counts = prepare_factorized_array(conditional_values)
    target_values = target_column.to_numpy()
    # pylint: disable-next=unbalanced-tuple-unpacking
    y, y_count = prepare_factorized_vector(target_values)
    return x, x_counts, y, y_count
# TODO: make target_attr optional, so that one can shuffle just the conditional attrs
# without the need for the target attr to be present
@log_start_end(logger)
def add_shuffled_attrs(
    df: pd.DataFrame,
    target_attr: str | int,
    shuffled_attrs_prefix: str = DEFAULT_SHUFFLED_PREFIX,
    seed: rght.Seed = None,
) -> pd.DataFrame:
    """Add shuffled attrs.

    Add a shuffled counterpart attribute for each conditional attribute (for all but
    one distinguished target attribute) of the input dataset. A shuffled (reordered)
    attribute for a given original attribute consists of the same values but
    permuted in random order. In other words, a shuffled attribute is an attribute
    of the same empirical distribution as the original one but (possibly)
    uncorrelated with the target attribute.

    Args:
        df: Input dataset.
        target_attr: Identifier of the target column in the input dataset.
        shuffled_attrs_prefix: A prefix for shuffled attribute names.
        seed: Random seed. Defaults to :obj:`None`.

    Returns:
        A dataset with columns in the following order: the original conditional
        attributes, their shuffled counterparts, and the target attribute.

    Examples:
        >>> df = pd.DataFrame([[5, 3, 3],
        ...                    [9, 3, 1],
        ...                    [5, 2, 3]], columns=["a", "b", "d"])
        >>> add_shuffled_attrs(df, target_attr="d", shuffled_attrs_prefix="s_", seed=0)
           a  b  s_a  s_b  d
        0  5  3    5    2  3
        1  9  3    5    3  1
        2  5  2    9    3  3
    """
    rng = np.random.default_rng(seed)
    data_y = df[target_attr]
    data_x = df.drop(columns=target_attr)
    # Each column is permuted by its own ``rng.permutation`` call.
    data_x_shuffled = data_x.apply(rng.permutation)
    # Column labels are cast to str first so that the prefix can also be applied to
    # non-string (e.g., integer) column identifiers.
    col_names = list(shuffled_attrs_prefix + data_x_shuffled.columns.astype(str))
    data_x_shuffled.columns = col_names
    return pd.concat([data_x, data_x_shuffled, data_y], axis=1)