Rough Set check functions#
import numpy as np
import pandas as pd
from skrough.checks import (
check_if_approx_reduct,
check_if_bireduct,
check_if_consistent_table,
check_if_functional_dependency,
check_if_reduct,
)
from skrough.dataprep import prepare_factorized_data
from skrough.disorder_measures import entropy
Dataset#
Let’s prepare a sample data set - “Play Golf Dataset”.
# "Play Golf" decision table: four condition attributes followed by the
# decision attribute "Play" in the last column.
_columns = ["Outlook", "Temperature", "Humidity", "Wind", "Play"]
_records = [
    ["sunny", "hot", "high", "weak", "no"],
    ["sunny", "hot", "high", "strong", "no"],
    ["overcast", "hot", "high", "weak", "yes"],
    ["rain", "mild", "high", "weak", "yes"],
    ["rain", "cool", "normal", "weak", "yes"],
    ["rain", "cool", "normal", "strong", "no"],
    ["overcast", "cool", "normal", "strong", "yes"],
    ["sunny", "mild", "high", "weak", "no"],
    ["sunny", "cool", "normal", "weak", "yes"],
    ["rain", "mild", "normal", "weak", "yes"],
    ["sunny", "mild", "normal", "strong", "yes"],
    ["overcast", "mild", "high", "strong", "yes"],
    ["overcast", "hot", "normal", "weak", "yes"],
    ["rain", "mild", "high", "strong", "no"],
]
# Keep the values as generic Python objects, exactly as in the raw records.
_values = np.array(_records, dtype=object)
df = pd.DataFrame(_values, columns=_columns)

# Name of the decision column.
TARGET_COLUMN = "Play"
# Split `df` into the condition attributes and the decision column `TARGET_COLUMN`.
# NOTE(review): presumably `x`/`y` hold the factorized (integer-coded) values and
# `x_counts`/`y_count` the numbers of distinct values - confirm against skrough docs.
x, x_counts, y, y_count = prepare_factorized_data(df, TARGET_COLUMN)
Data table consistency#
Let’s check if the data table is consistent:
- check the whole table
- check using a given subset of attributes
# check whole table
check_if_consistent_table(x, y)
True
# check using only first two columns ("Outlook" and "Temperature")
check_if_consistent_table(x[:, 0:2], y)
False
Check functional dependency#
# check functional dependency on all objects and all attrs - both `objs` and `attrs`
# are left at their default (`None`)
check_if_functional_dependency(x, y)
True
# check on all objects (using default: `None`) and on attrs `0, 2, 3`
# ("Outlook", "Humidity", "Wind")
check_if_functional_dependency(x, y, attrs=[0, 2, 3])
True
# check on all objects (using default: `None`) and on attrs `0, 1`
# ("Outlook", "Temperature")
check_if_functional_dependency(x, y, attrs=[0, 1])
False
# check on objects `0, 2, 5` and on attrs `0, 1`; these three objects already differ
# on attr `0` ("Outlook": sunny, overcast, rain), so the dependency holds
check_if_functional_dependency(x, y, objs=[0, 2, 5], attrs=[0, 1])
True
Check reducts#
For “Play Golf Dataset” there are only two reducts:
- “Outlook”, “Temperature”, “Wind” - attrs == [0, 1, 3]
- “Outlook”, “Humidity”, “Wind” - attrs == [0, 2, 3]
# "Outlook", "Temperature", "Wind" - the other reduct besides attrs `0, 2, 3`
check_if_reduct(x, x_counts, y, y_count, attrs=[0, 1, 3])
True
# "Outlook", "Humidity", "Wind"
check_if_reduct(x, x_counts, y, y_count, attrs=[0, 2, 3])
True
# too few attributes ~ no functional dependency (attrs `0, 1` do not determine the
# decision - see the functional dependency check above)
check_if_reduct(x, x_counts, y, y_count, attrs=[0, 1])
False
# too many attributes ~ some of them can be removed
# (e.g. dropping attr `1` leaves the reduct `[0, 2, 3]`)
check_if_reduct(x, x_counts, y, y_count, attrs=[0, 1, 2, 3])
False
Check approximate reducts#
Check if a given subset of attributes is an approximate reduct with a given approximation level $\varepsilon$.
See that for the specified subset of attributes and lower values of $\varepsilon$ the answer is “no”. After reaching specific larger values, the subset becomes good enough to fulfill the approximation condition. However, increasing the $\varepsilon$ value even further, the subset starts to have redundant attributes (not needed to still fulfill the approximation condition) and therefore the whole subset cannot be further considered as an approximate reduct.
# Sweep the approximation level `epsilon` over 0.0, 0.1, ..., 0.9 for a fixed
# attribute subset (attrs `0` and `3`), using entropy as the disorder measure.
attrs = [0, 3]
for eps in np.arange(0, 1, step=0.1):
    is_approx_reduct = check_if_approx_reduct(
        x, x_counts, y, y_count, attrs=attrs, disorder_fun=entropy, epsilon=eps
    )
    # `:.2` prints two significant digits, hiding float accumulation noise in `eps`
    print(f"is approximate reduct {attrs=} for {eps=:.2} == {is_approx_reduct}")
is approximate reduct attrs=[0, 3] for eps=0.0 == False
is approximate reduct attrs=[0, 3] for eps=0.1 == False
is approximate reduct attrs=[0, 3] for eps=0.2 == False
is approximate reduct attrs=[0, 3] for eps=0.3 == False
is approximate reduct attrs=[0, 3] for eps=0.4 == True
is approximate reduct attrs=[0, 3] for eps=0.5 == True
is approximate reduct attrs=[0, 3] for eps=0.6 == True
is approximate reduct attrs=[0, 3] for eps=0.7 == True
is approximate reduct attrs=[0, 3] for eps=0.8 == False
is approximate reduct attrs=[0, 3] for eps=0.9 == False
Check bireducts#
Check if a given pair of objects and attributes subsets constitutes a decision bireduct.
# show the table ordered by "Temperature", then "Humidity" (presumably to make the
# object subsets used in the checks below easier to inspect); `sort_values` returns
# a sorted copy and leaves `df` itself unchanged
df.sort_values(["Temperature", "Humidity"])
| | Outlook | Temperature | Humidity | Wind | Play |
|---|---|---|---|---|---|
| 4 | rain | cool | normal | weak | yes |
| 5 | rain | cool | normal | strong | no |
| 6 | overcast | cool | normal | strong | yes |
| 8 | sunny | cool | normal | weak | yes |
| 0 | sunny | hot | high | weak | no |
| 1 | sunny | hot | high | strong | no |
| 2 | overcast | hot | high | weak | yes |
| 12 | overcast | hot | normal | weak | yes |
| 3 | rain | mild | high | weak | yes |
| 7 | sunny | mild | high | weak | no |
| 11 | overcast | mild | high | strong | yes |
| 13 | rain | mild | high | strong | no |
| 9 | rain | mild | normal | weak | yes |
| 10 | sunny | mild | normal | strong | yes |
# on these objects attr `0` ("Outlook") alone determines the decision:
# sunny -> no, overcast -> yes, rain -> no
check_if_bireduct(
x, x_counts, y, y_count, objs=[0, 1, 2, 5, 6, 7, 11, 12, 13], attrs=[0]
)
True
# objects `0` and `1` have the same decision ("no"), so presumably attr `0` is
# redundant here (and the object subset could be extended)
check_if_bireduct(x, x_counts, y, y_count, objs=[0, 1], attrs=[0])
False
# all of these objects have the decision "no", so presumably attr `1` is redundant here
check_if_bireduct(x, x_counts, y, y_count, objs=[0, 1, 5, 7, 13], attrs=[1])
False
# too few objects - the object subset can still be extended (compare with the next check)
check_if_bireduct(x, x_counts, y, y_count, objs=[7, 9, 10, 12, 13], attrs=[1, 2])
False
# the same attrs `1, 2` with objects `2` and `5` added to the previous subset
check_if_bireduct(x, x_counts, y, y_count, objs=[2, 5, 7, 9, 10, 12, 13], attrs=[1, 2])
True
# all objects + attrs `0, 2, 3` (one of the reducts checked earlier)
check_if_bireduct(
x,
x_counts,
y,
y_count,
objs=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
attrs=[0, 2, 3],
)
True
# all objects + all attrs - not a bireduct because some attrs are redundant
# (attrs `0, 2, 3` alone already suffice for all objects - see the previous check)
check_if_bireduct(
x,
x_counts,
y,
y_count,
objs=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
attrs=[0, 1, 2, 3],
)
False