I've created a binary classification model from scratch, just to understand the intuition behind it.
However, when I compare my implementation to a TensorFlow/PyTorch model with the same parameters and configuration, my model needs about 3,000 epochs to reach results that the TensorFlow/PyTorch model reaches in about 300 epochs.
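For reference, the TensorFlow model I compare against looks roughly like this (a minimal sketch: same architecture, sigmoid output, binary cross-entropy, full-batch SGD with the same learning rate; `x` and `y` are the `make_moons` data from the full code below):

```
import tensorflow as tf

# Reference model -- mirrors the from-scratch layers (2 -> 8 -> 8 -> 1),
# sigmoid output, binary cross-entropy, full-batch SGD with lr = 0.01
model = tf.keras.Sequential([
    tf.keras.Input(shape=(2,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x, y, epochs=300, batch_size=len(x))
```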
I also noticed that my model computes very small gradients, while TensorFlow/PyTorch computes much larger gradients in every epoch.
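This is roughly how I look at the gradient magnitudes (illustrative only; `dense_layers` is defined in the full code below, and `torch_model` is just a placeholder name for the PyTorch counterpart):

```
# from-scratch model, after a backward() call:
for i, layer in enumerate(dense_layers):
    print('layer', i, 'mean |dw|:', np.abs(layer['dw']).mean())

# PyTorch counterpart, after loss.backward():
# for name, p in torch_model.named_parameters():
#     print(name, p.grad.abs().mean().item())
```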
- Is there any way to optimize the gradient calculation in the `backward` function so that the model learns faster?
- Is there anything else that could be optimized or simplified, and how could it be implemented?
Below is my `backward` function, which is responsible for computing the gradients:
```
def backward(y: np.ndarray, y_pred: np.ndarray, layers: List[Dict[str, np.ndarray]]) -> None:
    loss: np.ndarray = binary_cross_entropy_loss_prime(y, y_pred)
    for layer in reversed(layers):
        dZ: np.ndarray = layer['prime'](layer['z']) * loss
        layer['db'] = (dZ * np.ones_like(layer['b'])).sum(axis=0, keepdims=True) / loss.shape[0]
        dU: np.ndarray = dZ * np.ones_like(layer['u'])
        layer['dw'] = np.dot(layer['x'].T, dU) / loss.shape[0]
        loss = np.dot(dU, layer['w'].T)
```
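For each layer, with activation $g$, pre-activation $z = xW + b$ and batch size $m$, the loop above computes:

$$
\begin{aligned}
\delta &= \frac{\partial L}{\partial a} \odot g'(z),\\
\frac{\partial L}{\partial b} &= \frac{1}{m}\sum_{i=1}^{m}\delta_i,\qquad
\frac{\partial L}{\partial W} = \frac{1}{m}\,x^\top \delta,\qquad
\frac{\partial L}{\partial a_{\text{prev}}} = \delta\,W^\top
\end{aligned}
$$

where $\partial L / \partial a$ is the incoming `loss` array and $\partial L / \partial a_{\text{prev}}$ is what gets passed back to the previous layer.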
and here is the full code, with type annotations, for easier understanding:
"""# Dataset and libraries""" import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from typing import List, Dict from sklearn.datasets import make_moons x, y = make_moons(n_samples = 1000, noise = 0.2, random_state = 100) # expand y second dim # before expand_dims -> y.shape = (1000, ) # after expand_dims -> y.shape = (1000, 1) y = np.expand_dims(y, 1) # final shapes: X -> (1000, 2), Y -> (1000, 1) """# Activations functions""" def sigma(x: np.ndarray) -> np.ndarray: return 1 / (1 + np.exp(-x)) def sigma_prime(x: np.ndarray) -> np.ndarray: e = np.exp(x) return e / (e + 1) ** 2 def relu(x: np.ndarray) -> np.ndarray: return np.maximum(0, x) def relu_prime(x: np.ndarray) -> np.ndarray: return np.where(x <= 0, 0, 1) """# Dense layers""" dense_layers = [ { 'w': np.random.rand(2, 8) * 0.1, 'b': np.random.rand(1, 8) * 0.1, 'activ': relu, 'prime': relu_prime }, { 'w': np.random.rand(8, 8) * 0.1, 'b': np.random.rand(1, 8) * 0.1, 'activ': relu, 'prime': relu_prime }, { 'w': np.random.rand(8, 1) * 0.1, 'b': np.random.rand(1, 1) * 0.1, 'activ': sigma, 'prime': sigma_prime } ] """# Losses and metrics """ def binary_cross_entropy_loss(y_true: np.ndarray, y_pred: np.ndarray) -> float: number_of_rows = y_true.shape[0] # 1000 rows number_of_cols = y_true.shape[1] # 1 cols return np.sum(-(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))) / number_of_rows * number_of_cols def binary_cross_entropy_loss_prime(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray: return (1 - y_true) / (1 - y_pred) - y_true / y_pred def accuracy(y_true: np.ndarray, y_pred: np.ndarray, threshhold: float = 0.5) -> float: return (np.where(y_pred <= threshhold, 0, 1) == y_true).mean() """# Forward propagation""" def forward(x: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ]) -> np.ndarray: for layer in layers: layer['x'] = x layer['u'] = np.dot(x, layer['w']) layer['z'] = layer['u'] + layer['b'] layer['a'] = layer['activ'](layer['z']) x = layer['a'] return x """# Backward propagation""" def backward(y: np.ndarray, y_pred: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ]) -> None: loss: np.ndarray = binary_cross_entropy_loss_prime(y, y_pred) for layer in reversed(layers): dZ: np.ndarray = layer['prime'](layer['z']) * loss layer['db'] = (dZ * np.ones_like(layer['b'])).sum(axis = 0, keepdims=True) / loss.shape[0] dU: np.ndarray = dZ * np.ones_like(layer['u']) layer['dw'] = np.dot(layer['x'].T, dU) / loss.shape[0] loss = np.dot(dU, layer['w'].T) """# Update weights and biases (SGD optimizer)""" def update(layers: List[ Dict[ str, np.ndarray ] ], learning_rate: float) -> None: for layer in layers: layer['w'] -= learning_rate * layer['dw'] layer['b'] -= learning_rate * layer['db'] """# Train model""" def train(x: np.ndarray, y: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ], epochs: int, learning_rate: float) -> None: for epoch in range(epochs): # Forward propagation y_hat = forward(x, layers) # Backward propagation backward(y, y_hat, layers) # Update layers update(layers, learning_rate) # show progress if epoch % 100 == 0: print('Iteration nr: ', epoch, ', loss: ', binary_cross_entropy_loss(y, y_hat), ', accuracy: ', accuracy(y, y_hat)) train(x, y, dense_layers, 3001, 0.01) ```