Source code for skrough.ranks

from __future__ import annotations

from dataclasses import dataclass

import numpy as np
import pandas as pd
import scipy

from skrough.dataprep import DEFAULT_SHUFFLED_PREFIX

# Column names of the DataFrame built by compare_ranks.
COMPARE_RANKS_COL_ATTR_TYPE = "attr_type"
COMPARE_RANKS_COL_TOP_K = "top_k"
COMPARE_RANKS_COL_AVG_RANK = "avg_rank"

# Labels written by compare_ranks into the attribute-type column.
ATTR_TYPE_VALUE_ORIGINAL = "original"
ATTR_TYPE_VALUE_SHUFFLED = "shuffled"
# Label written into the top-k column for rows averaged over every attribute.
TOP_K_VALUE_ALL = "all"


@dataclass
class AttrRanks:
    """Ranks of attributes, kept separately for the two attribute groups.

    Instances are produced by ``get_attr_ranks``.
    """

    # Ranks of the attributes whose names do not start with the shuffled prefix.
    original: np.ndarray
    # Ranks of the attributes whose names start with the shuffled prefix.
    shuffled: np.ndarray
def get_attr_ranks(
    scores, attr_col, score_col, shuffled_prefix=DEFAULT_SHUFFLED_PREFIX
) -> AttrRanks:
    """Rank attributes by score and split the ranks into original/shuffled.

    Rows of ``scores`` are ordered by ``score_col`` from the highest to the
    lowest value and given 1-based ranks; rows with equal scores all receive
    the mean of the positions they occupy. An attribute counts as shuffled
    when its name in ``attr_col`` starts with ``shuffled_prefix``.

    Args:
        scores: A pandas DataFrame with one row per attribute.
        attr_col: Name of the column holding attribute names (strings).
        score_col: Name of the column holding attribute scores.
        shuffled_prefix: Name prefix that marks shuffled attributes.

    Returns:
        An ``AttrRanks`` whose ``original`` and ``shuffled`` arrays hold the
        ranks of the respective attributes, in non-decreasing rank order.
    """
    # Best score first; the fresh 0..n-1 index makes the positional ranks
    # built below line up with the score column used as the grouping key.
    scores = scores.sort_values([score_col], ascending=False).reset_index(drop=True)
    ranks = (
        pd.Series(np.arange(len(scores)) + 1)
        .groupby(scores[score_col])
        # The "mean" alias yields the same tie-averaged ranks as np.mean but
        # avoids the FutureWarning pandas emits for NumPy callables here.
        .transform("mean")
    )
    is_shuffled = scores[attr_col].str.startswith(shuffled_prefix)
    return AttrRanks(
        original=ranks[~is_shuffled].to_numpy(),
        shuffled=ranks[is_shuffled].to_numpy(),
    )
def compare_ranks(
    scores, attr_col, score_col, top_ks=None, shuffled_prefix=DEFAULT_SHUFFLED_PREFIX
):
    """Compare average ranks of original and shuffled attributes.

    Ranks come from ``get_attr_ranks``. The result always contains the
    average rank over all attributes of each group; for every requested
    ``top_k`` it additionally contains the average over the ``top_k``
    best-ranked attributes of each group.

    Args:
        scores: A pandas DataFrame with one row per attribute.
        attr_col: Name of the column holding attribute names.
        score_col: Name of the column holding attribute scores.
        top_ks: ``None``, a single integer (Python or NumPy) or an iterable
            of integers giving the prefix lengths to average over.
        shuffled_prefix: Name prefix that marks shuffled attributes.

    Returns:
        A pandas DataFrame with the attribute-type, top-k and average-rank
        columns. Rows come in original/shuffled pairs: first the pair for
        all attributes, then one pair per ``top_k`` in the given order. The
        top-k column holds ``TOP_K_VALUE_ALL`` or ``str(top_k)``.
    """
    attr_ranks = get_attr_ranks(
        scores=scores,
        attr_col=attr_col,
        score_col=score_col,
        shuffled_prefix=shuffled_prefix,
    )
    groups = (
        (ATTR_TYPE_VALUE_ORIGINAL, attr_ranks.original),
        (ATTR_TYPE_VALUE_SHUFFLED, attr_ranks.shuffled),
    )
    result = [
        [attr_type, TOP_K_VALUE_ALL, ranks.mean()] for attr_type, ranks in groups
    ]
    if top_ks is not None:
        # Accept NumPy integer scalars as well: they are not ``int``
        # instances and cannot be iterated over.
        if isinstance(top_ks, (int, np.integer)):
            top_ks = [top_ks]
        for top_k in top_ks:
            # The rank arrays are ordered best-first, so a prefix slice
            # selects the top_k best-ranked attributes of the group.
            result.extend(
                [attr_type, str(top_k), ranks[:top_k].mean()]
                for attr_type, ranks in groups
            )
    return pd.DataFrame(
        result,
        columns=[
            COMPARE_RANKS_COL_ATTR_TYPE,
            COMPARE_RANKS_COL_TOP_K,
            COMPARE_RANKS_COL_AVG_RANK,
        ],
    )
def compare_ranksum(
    scores, attr_col, score_col, shuffled_prefix=DEFAULT_SHUFFLED_PREFIX
):
    """Run a Wilcoxon rank-sum test on original vs. shuffled attribute scores.

    An attribute counts as shuffled when its name in ``attr_col`` starts
    with ``shuffled_prefix``.

    Args:
        scores: A pandas DataFrame with one row per attribute.
        attr_col: Name of the column holding attribute names (strings).
        score_col: Name of the column holding attribute scores.
        shuffled_prefix: Name prefix that marks shuffled attributes.

    Returns:
        The result of ``scipy.stats.ranksums`` called with the scores of the
        original attributes as the first sample and the scores of the
        shuffled attributes as the second.
    """
    # Imported here because a bare ``import scipy`` does not make the
    # ``scipy.stats`` submodule available on every SciPy version.
    from scipy.stats import ranksums

    # A literal prefix test: ``str.contains`` would interpret the prefix as a
    # regular expression and also match it in the middle of a name.
    shuffled_indicator = scores[attr_col].str.startswith(shuffled_prefix)
    scores_original = scores.loc[~shuffled_indicator, score_col]
    scores_shuffled = scores.loc[shuffled_indicator, score_col]
    return ranksums(scores_original, scores_shuffled)