decision_tree.py
"""
Implementation of a basic regression decision tree.
Input data set: the input data set must be one-dimensional with continuous labels.
Output: the decision tree maps a real number input to a real number output.
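Example usage (an illustrative sketch; the fitted values follow from the
greedy split search implemented in DecisionTree.train below):
>>> tree = DecisionTree(depth=2, min_leaf_size=2)
>>> tree.train(np.array([1.0, 2.0, 3.0, 4.0]), np.array([1.0, 1.0, 3.0, 3.0]))
>>> float(tree.predict(1.5))
1.0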
"""
import numpy as np


class DecisionTree:
    def __init__(self, depth=5, min_leaf_size=5):
        self.depth = depth  # maximum depth of the subtree rooted at this node
        self.decision_boundary = 0  # x value at which this node splits
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size  # minimum number of samples per leaf
        self.prediction = None  # set only on leaf nodes

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is
            used to estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...     test_prediction)))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...     test_prediction)))
        True
        """
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")
        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one-dimensional numpy array
        @param y: a one-dimensional numpy array.
            The contents of y are the labels for the corresponding x values
        train() does not have a return value
        Examples:
        1. Try to train when x & y have the same length and are one-dimensional
           (no errors)
        >>> dt = DecisionTree()
        >>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))
        2. Try to train when x is two-dimensional
        >>> dt = DecisionTree()
        >>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
        Traceback (most recent call last):
        ...
        ValueError: Input data set must be one-dimensional
        3. Try to train when x and y are not of the same length
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
        Traceback (most recent call last):
        ...
        ValueError: x and y have different lengths
        4. Try to train when x & y are of the same length but different dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
        Traceback (most recent call last):
        ...
        ValueError: Data set labels must be one-dimensional
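        5. Train a tree that performs a single split (an illustrative sketch;
           the boundary value follows from the greedy split search below)
        >>> dt = DecisionTree(depth=2, min_leaf_size=2)
        >>> dt.train(np.array([1.0, 2.0, 3.0, 4.0]), np.array([1.0, 1.0, 3.0, 3.0]))
        >>> float(dt.decision_boundary)
        3.0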
        This section is to check that the inputs conform to our dimensionality
        constraints
        """
        if x.ndim != 1:
            raise ValueError("Input data set must be one-dimensional")
        if len(x) != len(y):
            raise ValueError("x and y have different lengths")
        if y.ndim != 1:
            raise ValueError("Data set labels must be one-dimensional")

        if len(x) < 2 * self.min_leaf_size:
            self.prediction = np.mean(y)
            return
        if self.depth == 1:
            self.prediction = np.mean(y)
            return
        best_split = 0
        # The baseline error must be computed on the labels y, not the inputs x.
        min_error = self.mean_squared_error(y, np.mean(y)) * 2
"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as
the predictor
"""
        for i in range(len(x)):
            if len(x[:i]) < self.min_leaf_size:  # noqa: SIM114
                continue
            elif len(x[i:]) < self.min_leaf_size:
                continue
            else:
                # Split errors are measured on the labels y, not the inputs x.
                error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
                error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
                error = error_left + error_right
                if error < min_error:
                    best_split = i
                    min_error = error
        if best_split != 0:
            left_x = x[:best_split]
            left_y = y[:best_split]
            right_x = x[best_split:]
            right_y = y[best_split:]

            self.decision_boundary = x[best_split]
            self.left = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.right = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.left.train(left_x, left_y)
            self.right.train(right_x, right_y)
        else:
            self.prediction = np.mean(y)
        return

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        The prediction function works by recursively calling the predict function
        of the appropriate subtree based on the tree's decision boundary
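        Example (an illustrative sketch; the returned value follows from the
        split found during training):
        >>> tree = DecisionTree(depth=2, min_leaf_size=1)
        >>> tree.train(np.array([0.0, 1.0, 2.0, 3.0]), np.array([0.0, 0.0, 1.0, 1.0]))
        >>> float(tree.predict(0.5))
        0.0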
"""
ifself.predictionisnotNone:
returnself.prediction
elifself.leftorself.rightisnotNone:
ifx>=self.decision_boundary:
returnself.right.predict(x)
else:
returnself.left.predict(x)
else:
print("Error: Decision tree not yet trained")
returnNone


class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean squared
            error
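        Example (illustrative check):
        >>> TestDecisionTree.helper_mean_squared_error_test(np.array([1, 3]), 2.0)
        1.0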
"""
squared_error_sum=float(0)
forlabelinlabels:
squared_error_sum+= (label-prediction) **2
returnfloat(squared_error_sum/labels.size)


def main():
    """
    In this demonstration we're generating a sample data set from the sin
    function in numpy. We then train a decision tree on the data set and use
    the decision tree to predict the label of 10 different test values. Then
    the mean squared error over this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)
    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    rng = np.random.default_rng()
    test_cases = (rng.random(10) * 2) - 1
    predictions = np.array([tree.predict(x) for x in test_cases])
    # Compare predictions against the true labels sin(test_cases), not the
    # test inputs themselves.
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))


if __name__ == "__main__":
    main()

    import doctest

    doctest.testmod(name="mean_squared_error", verbose=True)