Feature importance example#
import pprint
import more_itertools
import numpy as np
import pandas as pd
from skrough.dataprep import prepare_factorized_data
from skrough.disorder_measures import conflicts_count, entropy, gini_impurity
from skrough.disorder_score import get_disorder_score_for_data
from skrough.feature_importance import get_feature_importance
Dataset#
Let’s prepare a sample data set - “Play Golf Dataset”.
# Name of the decision (target) column; every other column is a conditional feature.
TARGET_COLUMN = "Play"

# "Play Golf" toy dataset: 14 observations of four categorical weather
# features followed by the decision whether to play.
df = pd.DataFrame(
    data=np.array(
        [
            ["sunny", "hot", "high", "weak", "no"],
            ["sunny", "hot", "high", "strong", "no"],
            ["overcast", "hot", "high", "weak", "yes"],
            ["rain", "mild", "high", "weak", "yes"],
            ["rain", "cool", "normal", "weak", "yes"],
            ["rain", "cool", "normal", "strong", "no"],
            ["overcast", "cool", "normal", "strong", "yes"],
            ["sunny", "mild", "high", "weak", "no"],
            ["sunny", "cool", "normal", "weak", "yes"],
            ["rain", "mild", "normal", "weak", "yes"],
            ["sunny", "mild", "normal", "strong", "yes"],
            ["overcast", "mild", "high", "strong", "yes"],
            ["overcast", "hot", "normal", "weak", "yes"],
            ["rain", "mild", "high", "strong", "no"],
        ],
        # Store the values as plain Python objects rather than fixed-width strings.
        dtype=object,
    ),
    columns=["Outlook", "Temperature", "Humidity", "Wind", "Play"],
)

# Bare expression: renders the frame when run as a notebook cell.
df
| | Outlook | Temperature | Humidity | Wind | Play |
|---|---|---|---|---|---|
| 0 | sunny | hot | high | weak | no |
| 1 | sunny | hot | high | strong | no |
| 2 | overcast | hot | high | weak | yes |
| 3 | rain | mild | high | weak | yes |
| 4 | rain | cool | normal | weak | yes |
| 5 | rain | cool | normal | strong | no |
| 6 | overcast | cool | normal | strong | yes |
| 7 | sunny | mild | high | weak | no |
| 8 | sunny | cool | normal | weak | yes |
| 9 | rain | mild | normal | weak | yes |
| 10 | sunny | mild | normal | strong | yes |
| 11 | overcast | mild | high | strong | yes |
| 12 | overcast | hot | normal | weak | yes |
| 13 | rain | mild | high | strong | no |
Prepare data#
Factorize dataset and obtain the sizes of feature domains.
# Factorize the frame: ``x``/``y`` hold the encoded conditional and target
# data, ``x_counts``/``y_count`` the corresponding feature domain sizes.
x, x_counts, y, y_count = prepare_factorized_data(df, TARGET_COLUMN)

# Names of the conditional features, in column order (target excluded).
column_names = np.array([name for name in df.columns if name != TARGET_COLUMN])

# One print call; the empty strings produce the blank separator lines.
print(
    "Conditional data:",
    x,
    "",
    "Conditional data feature domain sizes:",
    x_counts,
    "",
    "Target data:",
    y,
    "",
    "Target data feature domain size:",
    y_count,
    sep="\n",
)
Conditional data:
[[0 0 0 0]
[0 0 0 1]
[1 0 0 0]
[2 1 0 0]
[2 2 1 0]
[2 2 1 1]
[1 2 1 1]
[0 1 0 0]
[0 2 1 0]
[2 1 1 0]
[0 1 1 1]
[1 1 0 1]
[1 0 1 0]
[2 1 0 1]]
Conditional data feature domain sizes:
[3 3 2 2]
Target data:
[0 0 1 1 1 0 1 0 1 1 1 1 1 0]
Target data feature domain size:
2
Measure of disorder in the dataset - disorder score#
In the context of a given dataset, a disorder score is a quantity that characterizes a subset of features and, roughly speaking, measures the disorder of decisions within the equivalence classes induced by that subset of features.
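In most cases it is reasonable to assume that the disorder score function is monotonic with respect to the subset relation, i.e., for subsets of features $A \subseteq B$, the disorder score for $A$ should be greater than or equal to that for $B$: adding features refines the induced equivalence classes, so the disorder should not increase (compare the scores computed below).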
Attributes are given by their ordinal numbers.
Let’s try three standard approaches, i.e., conflicts_count, gini_impurity and
entropy.
# For each disorder measure, report the disorder score of progressively
# larger attribute subsets. Attributes are referenced by their ordinal
# numbers, which also index ``column_names``.
for disorder_function in [conflicts_count, entropy, gini_impurity]:
    print(disorder_function.__name__)
    for attrs in [[0], [0, 1], [0, 1, 3], [0, 1, 2, 3]]:
        print(
            f"disorder score for attrs {attrs}({column_names[attrs]}) = ",
            get_disorder_score_for_data(
                x=x,
                x_counts=x_counts,
                y=y,
                y_count=y_count,
                disorder_fun=disorder_function,
                attrs=attrs,
            ),
        )
    # Blank line after each measure's results.
    print()
# Final blank line closing the whole report.
print()
conflicts_count
disorder score for attrs [0](['Outlook']) = 12.0
disorder score for attrs [0, 1](['Outlook' 'Temperature']) = 4.0
disorder score for attrs [0, 1, 3](['Outlook' 'Temperature' 'Wind']) = 0.0
disorder score for attrs [0, 1, 2, 3](['Outlook' 'Temperature' 'Humidity' 'Wind']) = 0.0
entropy
disorder score for attrs [0](['Outlook']) = 0.6935361388961918
disorder score for attrs [0, 1](['Outlook' 'Temperature']) = 0.4824919644402477
disorder score for attrs [0, 1, 3](['Outlook' 'Temperature' 'Wind']) = 0.0
disorder score for attrs [0, 1, 2, 3](['Outlook' 'Temperature' 'Humidity' 'Wind']) = 0.0
gini_impurity
disorder score for attrs [0](['Outlook']) = 0.34285714285714286
disorder score for attrs [0, 1](['Outlook' 'Temperature']) = 0.23809523809523808
disorder score for attrs [0, 1, 3](['Outlook' 'Temperature' 'Wind']) = 0.0
disorder score for attrs [0, 1, 2, 3](['Outlook' 'Temperature' 'Humidity' 'Wind']) = 0.0
Assessing feature importance#
We can use the above disorder score functions for assessing the features, i.e., we can observe the disorder score change if a given feature is removed.
To follow a more realistic example, we can use an ensemble of feature subsets, i.e., a family of subsets of all attributes rather than just a single subset of features, and compute the total or average disorder score change over all appearances of the attribute in the ensemble elements.
# Three ensembles to evaluate. NOTE: despite the singular name, each element
# of this list is itself an ensemble, i.e. a list of attribute subsets:
# two hand-written families and the full powerset of the four attributes.
attr_subset_ensemble = [
    [[0, 2], [0, 3], [0], [2, 3], [1, 2, 3]],
    [[0], [0, 1], [1, 2]],
    [list(elem) for elem in more_itertools.powerset(range(4))],
]

# For every disorder measure, print the feature-importance result obtained
# for each ensemble.
for disorder_function in [conflicts_count, entropy, gini_impurity]:
    print(disorder_function.__name__)
    # ``attr_subset`` is one whole ensemble (a list of attribute subsets).
    for attr_subset in attr_subset_ensemble:
        print("feature importance for attribute subset ensemble: ")
        pprint.pprint(attr_subset, compact=True)
        print(
            get_feature_importance(
                x,
                x_counts,
                y,
                y_count,
                column_names,
                attr_subset,
                disorder_fun=disorder_function,
            )
        )
        # Blank line after each ensemble's result.
        print()
    # Blank line after each measure's block of results.
    print()
# Final blank line closing the whole report.
print()
conflicts_count
feature importance for attribute subset ensemble:
[[0, 2], [0, 3], [0], [2, 3], [1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 3.0 66.0 22.000000
1 Temperature 1.0 4.0 4.000000
2 Humidity 3.0 25.0 8.333333
3 Wind 3.0 24.0 8.000000
feature importance for attribute subset ensemble:
[[0], [0, 1], [1, 2]]
column count global_gain avg_global_gain
0 Outlook 2.0 44.0 22.0
1 Temperature 2.0 17.0 8.5
2 Humidity 1.0 6.0 6.0
3 Wind 0.0 0.0 0.0
feature importance for attribute subset ensemble:
[[], [0], [1], [2], [3], [0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3],
[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3], [0, 1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 8.0 103.0 12.875
1 Temperature 8.0 69.0 8.625
2 Humidity 8.0 63.0 7.875
3 Wind 8.0 65.0 8.125
entropy
feature importance for attribute subset ensemble:
[[0, 2], [0, 3], [0], [2, 3], [1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 3.0 1.248090 0.416030
1 Temperature 1.0 0.107841 0.107841
2 Humidity 3.0 0.728552 0.242851
3 Wind 3.0 0.605939 0.201980
feature importance for attribute subset ensemble:
[[0], [0, 1], [1, 2]]
column count global_gain avg_global_gain
0 Outlook 2.0 0.675321 0.337661
1 Temperature 2.0 0.285209 0.142604
2 Humidity 1.0 0.196778 0.196778
3 Wind 0.0 0.000000 0.000000
feature importance for attribute subset ensemble:
[[], [0], [1], [2], [3], [0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3],
[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3], [0, 1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 8.0 4.089121 0.511140
1 Temperature 8.0 0.974797 0.121850
2 Humidity 8.0 1.613578 0.201697
3 Wind 8.0 1.939781 0.242473
gini_impurity
feature importance for attribute subset ensemble:
[[0, 2], [0, 3], [0], [2, 3], [1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 3.0 0.578912 0.192971
1 Temperature 1.0 0.047619 0.047619
2 Humidity 3.0 0.342857 0.114286
3 Wind 3.0 0.269728 0.089909
feature importance for attribute subset ensemble:
[[0], [0, 1], [1, 2]]
column count global_gain avg_global_gain
0 Outlook 2.0 0.318707 0.159354
1 Temperature 2.0 0.126871 0.063435
2 Humidity 1.0 0.095238 0.095238
3 Wind 0.0 0.000000 0.000000
feature importance for attribute subset ensemble:
[[], [0], [1], [2], [3], [0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3],
[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3], [0, 1, 2, 3]]
column count global_gain avg_global_gain
0 Outlook 8.0 1.959864 0.244983
1 Temperature 8.0 0.455102 0.056888
2 Humidity 8.0 0.791837 0.098980
3 Wind 8.0 0.931293 0.116412