I'm writing a k nearest neighbors implementation to solve multiclass classification.
import heapq import logging import numpy as np from scipy import spatial logging.basicConfig() class KNN(object): similarities = { 1: lambda a, b: np.linalg.norm(a-b), 2: lambda a, b: spatial.distance.cosine(a, b), } def __init__(self, k, similarity_func, loglevel=logging.DEBUG): self.k = k self.logger = logging.getLogger(type(self).__name__) self.logger.setLevel(loglevel) if similarity_func not in KNN.similarities: raise ValueError("Illegal similarity value {0}. Legal values are {1}".format(similarity_func, sorted(KNN.similarities.keys()))) self.similarity_func = KNN.similarities[similarity_func] def train(self, X, y): self.training_X = X self.training_y = y self.num_classes = len(np.unique(y)) self.logger.debug("There are %s classes", self.num_classes) return self def probs(self, X): class_probs = [] for i, e in enumerate(X, 1): votes = np.zeros((self.num_classes,)) self.logger.debug("Votes: %s", votes) if i % 100 == 0: self.logger.info("Example %s", i) distance = [(self.similarity_func(e, x), y) for x, y in zip(self.training_X, self.training_y)] for (_, label) in heapq.nsmallest(self.k, distance, lambda t: t[0]): votes[label] += 1 class_probs.append(normalize(votes)) return class_probs def predict(self, X): return np.argmax(self.probs(X))
I find that this implementation's predict
is slow™ and think it could be sped up with vectorized operations in numpy, but I'm fairly inexperienced with numpy vectorization techniques.
Does anyone have some suggestions for performance boosts I can get from predict
?