我正在用 sklearn 开发决策树(分类器),它工作得很好:我可以可视化这棵树,也可以预测类别。但我想在 pandas DataFrame 中新增一列,记录每个样本在树中得到预测结果所经过的路径。也就是说,我想把得到该结果所用到的所有规则串联起来,例如:白色=False,黑色=False,重量=1,价格=5。请问您有什么思路吗?
根据这里的示例,您可以创建应用规则的解释。
- `estimator.decision_path` 给出样本为得到预测结果而经过的节点;
- `is_leaves` 是一个布尔数组,记录每个节点是叶子/终端节点(`True`)还是分支/决策节点(`False`);
- 遍历 `node_indicator`,以获取已访问节点的 id;
- 对每个访问过的决策节点,取出其阈值(`threshold`)和对应的特征(`feature`)。
最后,将该函数应用于数据帧,就完成了。
def get_decision_path(estimator, feature_names, sample, precision=2, is_leaves=None):
    """Describe the decision rules a fitted tree applies to one sample.

    Args:
        estimator: Fitted tree estimator exposing ``tree_.feature``,
            ``tree_.threshold`` and ``decision_path``.
        feature_names: Sequence of feature names, indexed by feature id.
        sample: One sample as a 1-D sequence of feature values (list,
            NumPy array or pandas Series). Values are read by position.
        precision: Number of decimals the thresholds are rounded to.
        is_leaves: Optional boolean array marking leaf nodes, as returned by
            ``get_leaves``. It is computed when omitted; pass it in when
            explaining many samples to avoid recomputing it every call.

    Returns:
        A string of the form ``"<name>: <value> <sign> <threshold>"`` for
        every decision node visited, joined by ``"; "``. The string is empty
        when the first visited node is already a leaf.
    """
    if is_leaves is None:
        is_leaves = get_leaves(estimator)

    # Copy the sample into a plain list so every lookup below is positional.
    # Indexing a pandas Series with an integer is label-based, which picks
    # the wrong value (or raises KeyError) when the labels are not 0..n-1.
    values = list(sample)

    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    # decision_path expects a 2-D input; only row 0 of the result is used.
    node_indicator = estimator.decision_path([values])
    start, stop = node_indicator.indptr[0], node_indicator.indptr[1]
    node_index = node_indicator.indices[start:stop]

    rules = []
    for node_id in node_index:
        # A leaf carries no test, so the description ends here.
        if is_leaves[node_id]:
            break
        feature_id = feature[node_id]
        value = values[feature_id]
        threshold_sign = "<=" if value <= threshold[node_id] else ">"
        rules.append('{}: {} {} {}'.format(feature_names[feature_id],
                                           value,
                                           threshold_sign,
                                           round(threshold[node_id], precision)))
    return '; '.join(rules)
def get_leaves(estimator):
    """Return a boolean array flagging which nodes of a fitted tree are leaves.

    A node is treated as a leaf when its left and right child ids are equal;
    a split node always has two distinct children.

    Args:
        estimator: Fitted tree estimator exposing ``tree_.children_left``
            and ``tree_.children_right``.

    Returns:
        Boolean NumPy array with one entry per node, ``True`` for leaves.
    """
    tree = estimator.tree_
    children_left = np.asarray(tree.children_left)
    children_right = np.asarray(tree.children_right)
    # Element-wise comparison replaces an explicit traversal from the root;
    # this assumes every node is reachable from the root.
    return children_left == children_right
示例:
# Explain the prediction for a single hand-written iris measurement.
iris_feature_names = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
example_sample = [6.6, 3.0, 4.4, 1.4]
print(get_decision_path(estimator, iris_feature_names, example_sample))
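输出(各条规则以"; "分隔;具体阈值取决于训练得到的树),形如:`petal width (cm): 1.4 > 0.8; petal length (cm): 4.4 <= 4.95; petal width (cm): 1.4 <= 1.65`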
完整代码
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn import tree
import pydotplus
from IPython.core.display import HTML, display
def get_decision_path(estimator, feature_names, sample, precision=2, is_leaves=None):
    """Describe the decision rules a fitted tree applies to one sample.

    Args:
        estimator: Fitted tree estimator exposing ``tree_.feature``,
            ``tree_.threshold`` and ``decision_path``.
        feature_names: Sequence of feature names, indexed by feature id.
        sample: One sample as a 1-D sequence of feature values (list,
            NumPy array or pandas Series). Values are read by position.
        precision: Number of decimals the thresholds are rounded to.
        is_leaves: Optional boolean array marking leaf nodes, as returned by
            ``get_leaves``. It is computed when omitted; pass it in when
            explaining many samples to avoid recomputing it every call.

    Returns:
        A string of the form ``"<name>: <value> <sign> <threshold>"`` for
        every decision node visited, joined by ``"; "``. The string is empty
        when the first visited node is already a leaf.
    """
    if is_leaves is None:
        is_leaves = get_leaves(estimator)

    # Copy the sample into a plain list so every lookup below is positional.
    # Indexing a pandas Series with an integer is label-based, which picks
    # the wrong value (or raises KeyError) when the labels are not 0..n-1.
    values = list(sample)

    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold

    # decision_path expects a 2-D input; only row 0 of the result is used.
    node_indicator = estimator.decision_path([values])
    start, stop = node_indicator.indptr[0], node_indicator.indptr[1]
    node_index = node_indicator.indices[start:stop]

    rules = []
    for node_id in node_index:
        # A leaf carries no test, so the description ends here.
        if is_leaves[node_id]:
            break
        feature_id = feature[node_id]
        value = values[feature_id]
        threshold_sign = "<=" if value <= threshold[node_id] else ">"
        rules.append('{}: {} {} {}'.format(feature_names[feature_id],
                                           value,
                                           threshold_sign,
                                           round(threshold[node_id], precision)))
    return '; '.join(rules)
def get_leaves(estimator):
    """Return a boolean array flagging which nodes of a fitted tree are leaves.

    A node is treated as a leaf when its left and right child ids are equal;
    a split node always has two distinct children.

    Args:
        estimator: Fitted tree estimator exposing ``tree_.children_left``
            and ``tree_.children_right``.

    Returns:
        Boolean NumPy array with one entry per node, ``True`` for leaves.
    """
    tree = estimator.tree_
    children_left = np.asarray(tree.children_left)
    children_right = np.asarray(tree.children_right)
    # Element-wise comparison replaces an explicit traversal from the root;
    # this assumes every node is reachable from the root.
    return children_left == children_right
# prepare data: load iris into a data frame, then split features from target
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
X = df.iloc[:, 0:4].to_numpy()
y = df.iloc[:, 4].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# create decision tree (kept small so the explanations stay short)
estimator = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0)
estimator.fit(X_train, y_train)

# visualize decision tree as an inline SVG
dot_data = tree.export_graphviz(estimator, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
svg = graph.create_svg()
display(HTML(svg.decode('utf-8')))

# add explanation to data frame
# The leaf mask is computed once and reused for every row.
is_leaves = get_leaves(estimator)
feature_columns = df.columns[0:4]
# Iterate the NumPy feature matrix rather than df.apply(axis=1): rows of X
# are plain positional arrays, whereas apply() hands over label-indexed
# Series, for which integer indexing is label-based.
df['explanation'] = [
    get_decision_path(estimator, feature_columns, sample, is_leaves=is_leaves)
    for sample in X
]
# Bare expression: shows five random rows when run as the last notebook cell.
df.sample(5, axis=0, random_state=42)