Multi-Stage processing#
# import pprint
import numpy as np
import pandas as pd
from skrough.algorithms import hooks
from skrough.algorithms.key_names import (
CONFIG_DISORDER_FUN,
CONFIG_EPSILON,
CONFIG_SELECT_ATTRS_DISORDER_SCORE_BASED_MAX_COUNT,
INPUT_DATA_X,
INPUT_DATA_Y,
)
from skrough.algorithms.meta import describe, processing, stage
from skrough.checks import check_if_approx_reduct
from skrough.dataprep import prepare_factorized_data
from skrough.disorder_measures import entropy
from skrough.structs.attrs_subset import AttrsSubset
# from attrs import asdict
Dataset#
Let’s prepare a sample data set - “Play Golf Dataset”.
# Name of the decision (target) column of the sample table.
TARGET_COLUMN = "Play"

# "Play Golf" sample table: one row per observation, holding four conditional
# attributes followed by the decision value.
df = pd.DataFrame(
    data=np.array(
        [
            ["sunny", "hot", "high", "weak", "no"],
            ["sunny", "hot", "high", "strong", "no"],
            ["overcast", "hot", "high", "weak", "yes"],
            ["rain", "mild", "high", "weak", "yes"],
            ["rain", "cool", "normal", "weak", "yes"],
            ["rain", "cool", "normal", "strong", "no"],
            ["overcast", "cool", "normal", "strong", "yes"],
            ["sunny", "mild", "high", "weak", "no"],
            ["sunny", "cool", "normal", "weak", "yes"],
            ["rain", "mild", "normal", "weak", "yes"],
            ["sunny", "mild", "normal", "strong", "yes"],
            ["overcast", "mild", "high", "strong", "yes"],
            ["overcast", "hot", "normal", "weak", "yes"],
            ["rain", "mild", "high", "strong", "no"],
        ],
        dtype=object,  # keep the raw labels as generic Python objects
    ),
    columns=["Outlook", "Temperature", "Humidity", "Wind", "Play"],
)

# Split the table into conditional data and the decision column.
# NOTE(review): presumably x / y are the factorized conditional attributes and
# decision values, and x_counts / y_count the numbers of their distinct
# values - confirm against skrough.dataprep.prepare_factorized_data.
x, x_counts, y, y_count = prepare_factorized_data(df, TARGET_COLUMN)
Approximate decision superreduct#
Let’s prepare a processing procedure to search for approximate decision superreduct.
Note that, despite the ProcessingMultiStage name, we create the processing with
only one stage, cf. grow_stage below.
A greedy heuristic algorithm is implemented in the below example. Its brief description is as follows:
initialization steps:
factorize the input data
initialize internal structures - group index and result subset of attributes
compute the approximation threshold, based on the data and the input approximation level $\varepsilon$
perform processing defined in stages (here just one processing stage):
grow_stage:
define stop criterion - reaching the approximation threshold
iteratively, until stop criterion
use all remaining attrs as pre-candidates
pass all pre-candidates as candidates
use greedy heuristic to choose the best attribute - maximizing the disorder score gain
update internal structures
finalize the processing - prepare the actual return value
# The single "grow" stage of the greedy search: attributes are added one at a
# time until the approximation threshold is reached.
grow_stage = stage.Stage.from_hooks(
    # Stop criterion: the approximation threshold has been reached.
    stop_hooks=[hooks.stop_hooks.stop_hook_approx_threshold],
    # Pre-candidates: every attribute that is still remaining.
    pre_candidates_hooks=[
        hooks.pre_candidates_hooks.pre_candidates_hook_remaining_attrs
    ],
    # Candidates: all pre-candidates are passed through unchanged.
    candidates_hooks=[
        hooks.common.process_elements.process_elements_hook_pass_everything
    ],
    # Selection: greedy choice based on the disorder score.
    select_hooks=[hooks.select_hooks.select_hook_attrs_disorder_score_based],
    # Inner loop that updates the internal structures.
    # NOTE(review): judging by the hook names, the first selected attribute is
    # added and the inner loop ends once no elements are left - confirm
    # against the skrough hooks documentation.
    inner_stop_hooks=hooks.inner_stop_hooks.inner_stop_hook_empty,
    inner_process_hooks=hooks.inner_process_hooks.inner_process_hook_add_first_attr,
    # Hook slots left unset in this example.
    init_hooks=None,
    filter_hooks=None,
    inner_init_hooks=None,
    finalize_hooks=None,
)
# Complete processing procedure: initialization hooks, the single grow stage
# and the function that turns the final state into the returned value.
get_approx_reduct = processing.ProcessingMultiStage.from_hooks(
    # The init hooks are listed in the order given in the algorithm outline.
    init_multi_stage_hooks=[
        # factorize the input data (x and y)
        hooks.init_hooks.init_hook_factorize_data_x_y,
        # start from a uniform group index (all objects in one group)
        hooks.init_hooks.init_hook_single_group_index,
        # start with an empty list of result attributes
        hooks.init_hooks.init_hook_result_attrs_empty,
        # compute the approximation threshold from the data and epsilon
        hooks.init_hooks.init_hook_epsilon_approx_threshold,
    ],
    stages=[grow_stage],
    # The result is returned as an attributes subset.
    prepare_result_fun=hooks.prepare_result_hooks.prepare_result_hook_attrs_subset,
    # No extra finalization step is used in this example.
    finalize_hooks=None,
)
Processing procedure inspection#
There are ways to inspect the prepared processing procedures, either for checking or debugging purposes.
A structured representation can be obtained and further processed:
# description_graph = describe.describe(get_approx_reduct)
# print(pprint.pformat(asdict(description_graph))[:1500], "...")
One can inspect “config”/“input”/“values” keys used within a processing procedure and its descendant (nested) subprocedures:
# List the "config", "input" and "values" key names used by the processing
# procedure and its nested subprocedures - one printed line per key kind.
print(f"config-keys: {describe.inspect_config_keys(get_approx_reduct)}")
print(f"input-keys: {describe.inspect_input_data_keys(get_approx_reduct)}")
print(f"values-keys: {describe.inspect_values_keys(get_approx_reduct)}")
config-keys: ['config_disorder_fun']
input-keys: ['input_data_x', 'input_data_y']
values-keys: ['values_result_attrs', 'values_y_count', 'values_disorder_score_approx_threshold', 'values_group_index', 'values_y', 'values_x_counts', 'values_x']
A visual representation using the sklearn framework/templates:
get_approx_reduct
ProcessingMultiStage(init_multi_stage_agg=UpdateStateHooksAggregate(normalized_hooks=[<function init_hook_factorize_data_x_y at 0x7f666711bee0>, <function init_hook_single_group_index at 0x7f66670a7160>, <function init_hook_result_attrs_empty at 0x7f66670a73a0>, <function init_hook_epsilon_approx_threshold at 0x7f66670a74c0>]), init_agg=UpdateStateHooksAggregate(normalized_hooks=[]), stages=[Stage(stop_agg=StopHooksAggregate(normalized_hooks=[<function stop_hook_approx_threshold at 0x7f66670b0430>]), init_agg=UpdateStateHooksAggregate(normalized_hooks=[]), pre_candidates_agg=ProduceElementsHooksAggregate(normalized_hooks=[<function pre_candidates_hook_remaining_attrs at 0x7f66670a7d30>]), candidates_agg=ProcessElementsHooksAggregate(normalized_hooks=[<function process_elements_hook_pass_everything at 0x7f66670b0c10>]), select_agg=ProcessElementsHooksAggregate(normalized_hooks=[<function select_hook_attrs_disorder_score_based at 0x7f66670b01f0>]), filter_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[]), inner_init_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[]), inner_stop_agg=InnerStopHooksAggregate(normalized_hooks=[<function inner_stop_hook_empty at 0x7f66670a7af0>]), inner_process_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[<function inner_process_hook_add_first_attr at 0x7f66670a7940>]), finalize_agg=UpdateStateHooksAggregate(normalized_hooks=[]))], finalize_agg=UpdateStateHooksAggregate(normalized_hooks=[]), prepare_result_fun=<function prepare_result_hook_attrs_subset at 0x7f66670a7f70>)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ProcessingMultiStage(init_multi_stage_agg=UpdateStateHooksAggregate(normalized_hooks=[<function init_hook_factorize_data_x_y at 0x7f666711bee0>, <function init_hook_single_group_index at 0x7f66670a7160>, <function init_hook_result_attrs_empty at 0x7f66670a73a0>, <function init_hook_epsilon_approx_threshold at 0x7f66670a74c0>]), init_agg=UpdateStateHooksAggregate(normalized_hooks=[]), stages=[Stage(stop_agg=StopHooksAggregate(normalized_hooks=[<function stop_hook_approx_threshold at 0x7f66670b0430>]), init_agg=UpdateStateHooksAggregate(normalized_hooks=[]), pre_candidates_agg=ProduceElementsHooksAggregate(normalized_hooks=[<function pre_candidates_hook_remaining_attrs at 0x7f66670a7d30>]), candidates_agg=ProcessElementsHooksAggregate(normalized_hooks=[<function process_elements_hook_pass_everything at 0x7f66670b0c10>]), select_agg=ProcessElementsHooksAggregate(normalized_hooks=[<function select_hook_attrs_disorder_score_based at 0x7f66670b01f0>]), filter_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[]), inner_init_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[]), inner_stop_agg=InnerStopHooksAggregate(normalized_hooks=[<function inner_stop_hook_empty at 0x7f66670a7af0>]), inner_process_agg=ChainProcessElementsHooksAggregate(normalized_hooks=[<function inner_process_hook_add_first_attr at 0x7f66670a7940>]), finalize_agg=UpdateStateHooksAggregate(normalized_hooks=[]))], finalize_agg=UpdateStateHooksAggregate(normalized_hooks=[]), prepare_result_fun=<function prepare_result_hook_attrs_subset at 0x7f66670a7f70>)
Init hook function to factorize the input data. Factorize an input data table representing conditional features/attributes and decision values for later computations. It is assumed that the input data array and decision values are available in :attr:`state.input_data` under :const:`~skrough.algorithms.key_names.INPUT_DATA_X` and :const:`~skrough.algorithms.key_names.INPUT_DATA_Y` keys, respectively. The :func:`skrough.dataprep.prepare_factorized_array` function is used to process the input data table and the corresponding results are stored in :attr:`state.values` under :const:`~skrough.algorithms.key_names.VALUES_X` and :const:`~skrough.algorithms.key_names.VALUES_X_COUNTS` keys. The :func:`skrough.dataprep.prepare_factorized_vector` function is used to process the decision values and the corresponding results are stored in :attr:`state.values` under :const:`~skrough.algorithms.key_names.VALUES_Y` and :const:`~skrough.algorithms.key_names.VALUES_Y_COUNT` keys.
Init hook function to initialize a uniform group index structure. It is assumed that the appropriate data set consisting of objects (typically rows of some tabular representation) is available in :attr:`state.values` under the :const:`~skrough.algorithms.key_names.VALUES_X` key. The function initializes a uniform group index, i.e., a group index that assigns each of the objects under consideration to the same group. The group index will be stored in :attr:`state.values` under the :const:`~skrough.algorithms.key_names.VALUES_GROUP_INDEX` key.
Init hook function to initialize an empty attributes locations collection. The function initializes an empty attributes locations list and stores it in :attr:`state.values` under the :const:`~skrough.algorithms.key_names.VALUES_RESULT_ATTRS` key. The initialized list is intended to be used as integer-location based indexing sequence of attributes, i.e., 0-based values that index attributes from the considered data set.
Check if the defined disorder score approximation threshold was reached.
The function checks if the defined level of disorder score approximation is reached.
The function is intended for use in cases of disorder score minimizing processes and
therefore it checks if the disorder score value computed (cf.
:func:`~skrough.structs.group_index.GroupIndex.get_disorder_score` and
:mod:`~skrough.disorder_score` module) for the current group index falls below the
defined level of disorder score approximation.
The function uses the following config and intermediate mappings stored in the
``state`` argument and appropriate keys to access the actual values:
- config values (:attr:`skrough.structs.state.ProcessingState.config` mapping):
- disorder measure function (cf.
:mod:`~skrough.disorder_measures.disorder_measures`) to be used in disorder
score computation - accessed using
:const:`~skrough.algorithms.key_names.CONFIG_DISORDER_FUN` key
- intermediate values (:attr:`skrough.structs.state.ProcessingState.values`
mapping)
- disorder score approximation threshold - accessed using
:const:`~skrough.algorithms.key_names.VALUES_DISORDER_SCORE_APPROX_THRESHOLD`
key
- group index to be used in disorder score computation - accessed using
:const:`~skrough.algorithms.key_names.VALUES_GROUP_INDEX` key
- factorized values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y` key
- number of distinct values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y_COUNT` key
Process elements hook returning the original input ``elements`` without change.
Check if the defined disorder score approximation threshold was reached.
The function checks if the defined level of disorder score approximation is reached.
The function is intended for use in cases of disorder score minimizing processes and
therefore it checks if the disorder score value computed (cf.
:func:`~skrough.structs.group_index.GroupIndex.get_disorder_score` and
:mod:`~skrough.disorder_score` module) for the current group index falls below the
defined level of disorder score approximation.
The function uses the following config and intermediate mappings stored in the
``state`` argument and appropriate keys to access the actual values:
- config values (:attr:`skrough.structs.state.ProcessingState.config` mapping):
- disorder measure function (cf.
:mod:`~skrough.disorder_measures.disorder_measures`) to be used in disorder
score computation - accessed using
:const:`~skrough.algorithms.key_names.CONFIG_DISORDER_FUN` key
- intermediate values (:attr:`skrough.structs.state.ProcessingState.values`
mapping)
- disorder score approximation threshold - accessed using
:const:`~skrough.algorithms.key_names.VALUES_DISORDER_SCORE_APPROX_THRESHOLD`
key
- group index to be used in disorder score computation - accessed using
:const:`~skrough.algorithms.key_names.VALUES_GROUP_INDEX` key
- factorized values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y` key
- number of distinct values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y_COUNT` key
Check if the defined disorder score approximation threshold was reached.
The function checks if the defined level of disorder score approximation is reached.
The function is intended for use in cases of disorder score minimizing processes and
therefore it checks if the disorder score value computed (cf.
:func:`~skrough.structs.group_index.GroupIndex.get_disorder_score` and
:mod:`~skrough.disorder_score` module) for the current group index falls below the
defined level of disorder score approximation.
The function uses the following config and intermediate mappings stored in the
``state`` argument and appropriate keys to access the actual values:
- config values (:attr:`skrough.structs.state.ProcessingState.config` mapping):
- disorder measure function (cf.
:mod:`~skrough.disorder_measures.disorder_measures`) to be used in disorder
score computation - accessed using
:const:`~skrough.algorithms.key_names.CONFIG_DISORDER_FUN` key
- intermediate values (:attr:`skrough.structs.state.ProcessingState.values`
mapping)
- disorder score approximation threshold - accessed using
:const:`~skrough.algorithms.key_names.VALUES_DISORDER_SCORE_APPROX_THRESHOLD`
key
- group index to be used in disorder score computation - accessed using
:const:`~skrough.algorithms.key_names.VALUES_GROUP_INDEX` key
- factorized values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y` key
- number of distinct values of the target attribute - accessed using
:const:`~skrough.algorithms.key_names.VALUES_Y_COUNT` key
Invoke the prepared procedure#
Prepare appropriate config values and input data.
# Disorder measure and approximation level (epsilon) used in this example.
disorder_measure = entropy
eps = 0.4

# Config values passed to the processing procedure.
config = {
    CONFIG_DISORDER_FUN: disorder_measure,
    CONFIG_EPSILON: eps,
    # NOTE(review): presumably limits how many attributes the disorder-score
    # based select hook returns per iteration - confirm against skrough docs.
    CONFIG_SELECT_ATTRS_DISORDER_SCORE_BASED_MAX_COUNT: 1,
}

# Input data: conditional data and decision values prepared earlier.
input_data = {INPUT_DATA_X: x, INPUT_DATA_Y: y}
Sometimes it may be convenient to check if the given config and input data contain necessary keys, appropriate for the processing element/algorithm. Currently, the feature is limited to the presence of the appropriate key names (declared for the processing element and its descendant subelements).
# Input data deliberately lacking the decision values, used to demonstrate a
# failed compatibility check.
insufficient_input_data = {INPUT_DATA_X: x}

# 1) Complete config and input data.
print(
    describe.check_compatibility(
        get_approx_reduct,
        config=config,
        input_data=input_data,
    )
)
print("---")

# 2) Incomplete input data - only the overall verdict is reported.
print(
    describe.check_compatibility(
        get_approx_reduct,
        config=config,
        input_data=insufficient_input_data,
    )
)
print("---")

# 3) Incomplete input data with verbose=True - the verdict is accompanied by
#    details about the missing keys.
print(
    describe.check_compatibility(
        get_approx_reduct,
        config=config,
        input_data=insufficient_input_data,
        verbose=True,
    )
)
True
---
False
---
(False, {'missing_input_data_keys': ['input_data_y']})
Invoke the prepared procedure (processing element) and get the result.
# Run the prepared procedure with the config and input data defined above.
result: AttrsSubset = get_approx_reduct(config=config, input_data=input_data)
# Bare expression so that the notebook displays the obtained attributes subset.
result
AttrsSubset(attrs=[0, 2])
Check if the obtained result is an approximate decision superreduct - as expected, given that the computing procedure was designed appropriately.
# Verify the returned attributes against the same disorder measure and epsilon
# that were used by the processing procedure.
check_if_approx_reduct(
    x, x_counts,  # conditional data prepared earlier
    y, y_count,  # decision data prepared earlier
    attrs=result.attrs,
    disorder_fun=disorder_measure,
    epsilon=eps,
    # A superreduct is expected here, so the attribute-reduction part of the
    # check is switched off.
    check_attrs_reduction=False,
)
True