decision_tree.py
"""
Implementation of a basic regression decision tree.
Input data set: the input data set must be one-dimensional with continuous labels.
Output: the decision tree maps a real number input to a real number output.
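Example usage (an illustrative sketch; the fitted values follow from the
greedy split search implemented in DecisionTree.train below):
>>> tree = DecisionTree(depth=2, min_leaf_size=2)
>>> tree.train(np.array([1.0, 2.0, 3.0, 4.0]), np.array([1.0, 1.0, 3.0, 3.0]))
>>> float(tree.predict(1.5))
1.0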
"""
import numpy as np


class DecisionTree:
    def __init__(self, depth=5, min_leaf_size=5):
        self.depth = depth  # maximum depth of the subtree rooted at this node
        self.decision_boundary = 0  # x value at which this node splits
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size  # minimum number of samples per leaf
        self.prediction = None  # set only on leaf nodes

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is
            used to estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...     test_prediction)))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...     test_prediction)))
        True
        """
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")
        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one-dimensional numpy array
        @param y: a one-dimensional numpy array.
            The contents of y are the labels for the corresponding x values
        train() does not have a return value
        Examples:
        1. Try to train when x & y have the same length and are one-dimensional
           (no errors)
        >>> dt = DecisionTree()
        >>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))
        2. Try to train when x is two-dimensional
        >>> dt = DecisionTree()
        >>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
        Traceback (most recent call last):
        ...
        ValueError: Input data set must be one-dimensional
        3. Try to train when x and y are not of the same length
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
        Traceback (most recent call last):
        ...
        ValueError: x and y have different lengths
        4. Try to train when x & y are of the same length but different dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
        Traceback (most recent call last):
        ...
        ValueError: Data set labels must be one-dimensional
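        5. Train a tree that performs a single split (an illustrative sketch;
           the boundary value follows from the greedy split search below)
        >>> dt = DecisionTree(depth=2, min_leaf_size=2)
        >>> dt.train(np.array([1.0, 2.0, 3.0, 4.0]), np.array([1.0, 1.0, 3.0, 3.0]))
        >>> float(dt.decision_boundary)
        3.0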
        This section is to check that the inputs conform to our dimensionality
        constraints
        """
        if x.ndim != 1:
            raise ValueError("Input data set must be one-dimensional")
        if len(x) != len(y):
            raise ValueError("x and y have different lengths")
        if y.ndim != 1:
            raise ValueError("Data set labels must be one-dimensional")

        if len(x) < 2 * self.min_leaf_size:
            self.prediction = np.mean(y)
            return
        if self.depth == 1:
            self.prediction = np.mean(y)
            return
        best_split = 0
        # The baseline error must be computed on the labels y, not the inputs x.
        min_error = self.mean_squared_error(y, np.mean(y)) * 2
"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as
the predictor
"""
        for i in range(len(x)):
            if len(x[:i]) < self.min_leaf_size:  # noqa: SIM114
                continue
            elif len(x[i:]) < self.min_leaf_size:
                continue
            else:
                # Split errors are measured on the labels y, not the inputs x.
                error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
                error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
                error = error_left + error_right
                if error < min_error:
                    best_split = i
                    min_error = error
        if best_split != 0:
            left_x = x[:best_split]
            left_y = y[:best_split]
            right_x = x[best_split:]
            right_y = y[best_split:]

            self.decision_boundary = x[best_split]
            self.left = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.right = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.left.train(left_x, left_y)
            self.right.train(right_x, right_y)
        else:
            self.prediction = np.mean(y)
        return

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        The prediction function works by recursively calling the predict function
        of the appropriate subtree based on the tree's decision boundary
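        Example (an illustrative sketch; the returned value follows from the
        split found during training):
        >>> tree = DecisionTree(depth=2, min_leaf_size=1)
        >>> tree.train(np.array([0.0, 1.0, 2.0, 3.0]), np.array([0.0, 0.0, 1.0, 1.0]))
        >>> float(tree.predict(0.5))
        0.0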
"""
ifself.predictionisnotNone:
returnself.prediction
elifself.leftorself.rightisnotNone:
ifx>=self.decision_boundary:
returnself.right.predict(x)
else:
returnself.left.predict(x)
else:
print("Error: Decision tree not yet trained")
returnNone


class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean squared
            error
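        Example (illustrative check):
        >>> TestDecisionTree.helper_mean_squared_error_test(np.array([1, 3]), 2.0)
        1.0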
"""
squared_error_sum=float(0)
forlabelinlabels:
squared_error_sum+= (label-prediction) **2
returnfloat(squared_error_sum/labels.size)


def main():
    """
    In this demonstration we're generating a sample data set from the sin
    function in numpy. We then train a decision tree on the data set and use
    the decision tree to predict the label of 10 different test values. Then
    the mean squared error over this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)
    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    rng = np.random.default_rng()
    test_cases = (rng.random(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])
    # Compare predictions against the true labels sin(test_cases), not the
    # test inputs themselves.
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))


if __name__ == "__main__":
    main()

    import doctest

    doctest.testmod(name="mean_squared_error", verbose=True)