I've been working on a random forest algorithm for classification that uses roulette-wheel selection to pick the best splits. I came up with this homebrew version based on this article: https://machinelearningmastery.com/implement-random-forest-scratch-python/.
It works well enough, but I have a major problem with execution time. It's reasonably fast for 1-6 trees, but it completely chokes on 7 or more. I'm looking for ways to make it more efficient, so basic tests don't take a month to finish. I'm a beginner and don't know much about optimizing code, so I'd appreciate any suggestions on how I could improve it. Here's the link to the GitHub repo: https://github.com/pdabrow11/Random-Forest-Roulette. It contains all the .py
files and datasets used for classification. Here's the raw code:
- data_load.py
import csv


def read_file(file):
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data


def load_data(data, var):
    # var selects the field separator: 1 -> comma, 2 -> space, 3 -> semicolon
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if var == 1:
            dataset.append(row.split(","))
        if var == 2:
            dataset.append(row.split(" "))
        if var == 3:
            dataset.append(row.split(";"))
    return dataset


def process_dat_data(data):
    # Handles "label index:value index:value ..." rows: strips the "index:"
    # prefixes and moves the class label from the front to the end of each row
    dataset = list()
    for row in data:
        row_temp = list()
        for i in range(1, len(row)):
            res = row[i].find(':')
            row_temp.append(row[i][res + 1:])
        row_temp.append(row[0])
        dataset.append(row_temp)
    return dataset


def str2int(dataset):
    # Maps each column's distinct values to float codes, in place
    for i in range(0, len(dataset[0])):
        class_values = [row[i] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for j, value in enumerate(unique):
            lookup[value] = float(j)
        for row in dataset:
            row[i] = lookup[row[i]]


def str2float(dataset):
    for row in dataset:
        for i in range(0, len(row)):
            row[i] = float(row[i])
    return dataset


def write_data(data):
    with open("tests_res.csv", 'a') as file:
        f = csv.writer(file, delimiter=',', lineterminator='\n')
        f.writerows(data)
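To make the .dat handling concrete: process_dat_data strips the "index:" prefixes and moves the class label to the end of each row. A tiny self-contained check (the values here are made up, not taken from the real gas sensor files):

from data_load import process_dat_data

# Hypothetical rows in the "label index:value ..." format of the .dat files
rows = [['6', '1:15596.16', '2:1.68'],
        ['3', '1:13837.79', '2:2.01']]
print(process_dat_data(rows))
# [['15596.16', '1.68', '6'], ['13837.79', '2.01', '3']]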
- random_forest.py
from math import sqrt
from random import randrange

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import numpy.random as npr


def cross_validation_split(dataset, folds_num):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds_num)
    for i in range(folds_num):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


def calc_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


def evaluate_algorithm(dataset, folds_num, trees_num, *args):
    attributes_num = int(sqrt(len(dataset[0]) - 1))
    folds = cross_validation_split(dataset, folds_num)
    scores = list()
    scores_sklearn = list()
    res = RandomForestClassifier(n_estimators=trees_num)
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        y_test_set = list()
        X, y = list(), list()
        for row in fold:
            test_set.append(row[:-1])
            y_test_set.append(row[-1])
        # NOTE: random_forest takes (train, test, attributes_num, trees_num, ...);
        # the original call passed trees_num and attributes_num swapped, so each
        # split considered trees_num attributes instead of sqrt(features)
        predicted = random_forest(train_set, test_set, attributes_num, trees_num, *args)
        actual = [row[-1] for row in fold]
        accuracy = calc_accuracy(actual, predicted)
        scores.append(accuracy)
        # sklearn baseline trained and scored on the same split
        for row in train_set:
            X.append(row[:-1])
            y.append(row[-1])
        res.fit(X, y)
        accuracy_sklearn = res.score(test_set, y_test_set) * 100
        scores_sklearn.append(accuracy_sklearn)
    return scores, scores_sklearn


def split_attribute(dataset, index, value):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


def subset(dataset, ratio):
    # Bootstrap sample: draws with replacement
    sample = list()
    sample_num = round(len(dataset) * ratio)
    while len(sample) < sample_num:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample


def gini_index(groups, classes):
    examples_num = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        E = 1.0
        for single_class in classes:
            P = [row[-1] for row in group].count(single_class) / size
            E -= P ** 2
        gini += (size / examples_num) * E
    return gini


def roulette_split(dataset, attributes_num, threshold):
    # Scores every (attribute, row value) candidate split, zeroes out the
    # worst `threshold` fraction, then picks a winner by fitness-proportionate
    # (roulette-wheel) selection
    classes = list(set(row[-1] for row in dataset))
    index_list, val_list, group_list, fit = [], [], [], []
    attributes = list()
    while len(attributes) < attributes_num:
        index = randrange(len(dataset[0]) - 1)
        if index not in attributes:
            attributes.append(index)
    counter = 0
    for index in attributes:
        for row in dataset:
            groups = split_attribute(dataset, index, row[index])
            gini = gini_index(groups, classes)
            index_list.append(index)
            val_list.append(row[index])
            group_list.append(groups)
            fit.append(1 - gini)
            counter += 1
    fit_args_sorted = np.argsort(fit)
    for i in range(0, int(threshold * counter)):
        fit[fit_args_sorted[i]] = 0
    wheel_size = sum(fit)
    selection_probs = [f / wheel_size for f in fit]
    winner = int(npr.choice(np.arange(counter), 1, p=selection_probs))
    return {'val': val_list[winner], 'groups': group_list[winner], 'index': index_list[winner]}


def terminal(group):
    results = [row[-1] for row in group]
    return max(results, key=results.count)


def one_class(node):
    # True if every row in the node has the same class label
    for row in node:
        if row[-1] != node[0][-1]:
            return False
    return True


def are_the_same(node):
    # True if all rows have identical attribute values (label excluded);
    # comparing every row against the first replaces the original pairwise
    # triple loop without changing the result
    for i in range(0, len(node[0]) - 1):
        for row in node:
            if row[i] != node[0][i]:
                return False
    return True


def split(node, attributes_num, depth, threshold):
    left, right = node['groups']
    del node['groups']
    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return
    if one_class(left) or are_the_same(left):
        node['left'] = terminal(left)
    else:
        node['left'] = roulette_split(left, attributes_num, threshold)
        split(node['left'], attributes_num, depth + 1, threshold)
    if one_class(right) or are_the_same(right):
        node['right'] = terminal(right)
    else:
        node['right'] = roulette_split(right, attributes_num, threshold)
        split(node['right'], attributes_num, depth + 1, threshold)


def build_tree(train, attributes_num, threshold):
    root = roulette_split(train, attributes_num, threshold)
    split(root, attributes_num, 1, threshold)
    return root


def predict(node, row):
    if row[node['index']] < node['val']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        return node['left']
    if isinstance(node['right'], dict):
        return predict(node['right'], row)
    return node['right']


def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)


def random_forest(train, test, attributes_num, trees_num, sample_size, threshold):
    trees = list()
    for i in range(trees_num):
        sample = subset(train, sample_size)
        tree = build_tree(sample, attributes_num, threshold)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions
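For a sense of why this gets slow: roulette_split calls gini_index once for every (attribute, row value) candidate, and each gini_index call scans every row once per class, so a single node costs on the order of attributes_num × n² × classes row operations, repeated for every node in every tree. One cheap, illustrative change (a sketch I haven't benchmarked, assuming rows keep the class label in the last position) is to count labels in a single numpy pass instead of one list.count() per class:

import numpy as np

def gini_index_fast(groups, classes):
    # Drop-in sketch for gini_index; `classes` is kept only for signature
    # compatibility, since np.unique discovers the labels on its own
    examples_num = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        labels = np.asarray([row[-1] for row in group])
        _, counts = np.unique(labels, return_counts=True)  # one pass over labels
        p = counts / size
        gini += (size / examples_num) * (1.0 - np.sum(p ** 2))
    return gini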
- launch.py
from random import seed

from random_forest import *
from data_load import *


def main():
    data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
    dataset_absence = load_data(data_absence, 3)
    dataset_absence = dataset_absence[1:]  # drop the header row
    str2float(dataset_absence)

    data_car = read_file("datasets_v2/car.data")
    dataset_car = load_data(data_car, 1)
    str2int(dataset_car)

    data_opt = read_file("datasets_v2/batch_optdigits.tra")
    dataset_opt = load_data(data_opt, 1)
    str2float(dataset_opt)  # loaded but not used in the test loop below

    data_gas = read_file("datasets_v2/GasSensorDataset/batch1.dat")
    dataset_gas = load_data(data_gas, 2)
    dataset_gas = process_dat_data(dataset_gas)
    str2float(dataset_gas)

    sk_scores = []
    for_scores = []
    col_1, col_2, col_3, col_4 = [], [], [], []
    col_5, col_6, col_7, col_8 = [], [], [], []

    for dataset in [dataset_car, dataset_absence, dataset_gas]:
        sample_size = 0.05
        folds_num = 5
        for threshold in [0.2, 0.5, 0.9]:
            print('For my information - threshold:', threshold, '\n')
            for trees_num in range(1, 10):
                sk_scores.clear()
                for_scores.clear()
                for i in [1, 2, 3, 5]:
                    seed(i)
                    scores = evaluate_algorithm(dataset, folds_num, trees_num,
                                                sample_size, threshold)
                    score = sum(scores[0]) / float(len(scores[0]))
                    for_scores.append(score)
                    sk_score = sum(scores[1]) / float(len(scores[1]))
                    sk_scores.append(sk_score)
                if dataset == dataset_car:
                    col_1.extend(['Car Dataset'] * 2)  # was mislabeled 'SECOM Dataset'
                elif dataset == dataset_gas:
                    col_1.extend(['Gas Sensor Dataset'] * 2)
                else:
                    col_1.extend(['Absenteeism Dataset'] * 2)
                for_mean = round(np.mean(for_scores), 2)
                for_stdev = round(np.std(for_scores), 2)
                for_best = round(np.amax(for_scores), 2)
                for_worst = round(np.amin(for_scores), 2)  # was np.amax: copy-paste bug
                sk_mean = round(np.mean(sk_scores), 2)
                sk_stdev = round(np.std(sk_scores), 2)
                sk_best = round(np.amax(sk_scores), 2)
                sk_worst = round(np.amin(sk_scores), 2)  # was np.amax: copy-paste bug
                col_2.append('Own implementation')
                col_2.append('Sklearn')
                col_3.append(trees_num)
                col_3.append(trees_num)
                col_4.append(threshold)
                col_4.append(threshold)
                col_5.append(for_mean)
                col_6.append(for_stdev)
                col_7.append(for_best)
                col_8.append(for_worst)
                col_5.append(sk_mean)
                col_6.append(sk_stdev)
                col_7.append(sk_best)
                col_8.append(sk_worst)
                print(trees_num)

    header = ['Dataset', 'Implementation', 'Number of trees', 'Selection threshold',
              'Mean accuracy', 'Standard deviation', 'Best accuracy', 'Worst accuracy']
    file_data = np.column_stack((col_1, col_2, col_3, col_4,
                                 col_5, col_6, col_7, col_8))
    file_data = np.vstack((header, file_data))
    open('tests_res.csv', 'w').close()  # truncate before write_data appends
    write_data(file_data)


if __name__ == "__main__":
    main()
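In case it helps to see exactly where the time goes, the slowdown can be reproduced and broken down per function with the standard-library profiler (just a sketch; `main` is the entry point from launch.py above):

import cProfile
import pstats

from launch import main  # entry point defined in launch.py above

profiler = cProfile.Profile()
profiler.runcall(main)  # run one full test sweep under the profiler
stats = pstats.Stats(profiler).sort_stats('cumulative')
stats.print_stats(15)  # show the 15 most expensive call sites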