I have the code below, but I don't understand how to interpret the tree output from the RandomForestClassifier: specifically, how the gini value was calculated given the samples, and how the totals in the 'value' lists can be higher than the initial sample count of 3.
I am comparing this output to a DecisionTreeClassifier, which I can understand and interpret.
Any help is appreciated, thanks!
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.externals.six import StringIO
import pydot
# Data: the four XOR points and their class labels.
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
Y = np.array([0, 1, 1, 0])


def _write_tree_pdf(estimator, pdf_path):
    """Render one fitted decision tree to ``pdf_path`` through Graphviz.

    ``estimator`` is a fitted tree accepted by ``tree.export_graphviz``
    (the stand-alone classifier or a member of ``clf.estimators_``).
    """
    dot_data = StringIO()
    tree.export_graphviz(estimator, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # Newer pydot releases return a list of graphs, older ones a single
    # graph object; accept both so write_pdf is always called on a graph.
    if isinstance(graph, list):
        graph = graph[0]
    graph.write_pdf(pdf_path)


# Create object classifiers
clf = RandomForestClassifier()
clf_tree = tree.DecisionTreeClassifier()

# Fit data
clf_tree.fit(X, Y)
clf.fit(X, Y)

# Save data
_write_tree_pdf(clf_tree, "orig_tree.pdf")

# NOTE: per the scikit-learn docs each forest member is fit on a bootstrap
# resample by default, so its node statistics need not match the single
# tree trained on the full data set.
for i_tree, tree_in_forest in enumerate(clf.estimators_):
    _write_tree_pdf(tree_in_forest, 'tree_' + str(i_tree) + '.pdf')
The decision tree: https://i.sstatic.net/XZ7vU.png
A tree from the RandomForestClassifier: https://i.sstatic.net/Bb5t9.png