-1

I am trying to implement and train an SVM multi-class classifier from scratch using python and numpy in jupyter notebooks.

I have been using the CS231n course as my base of knowledge, especially this page: https://cs231n.github.io/optimization-1/ which discusses gradient descent. I have implemented a class, SVM, that I believe is on the right track.

Here is the basic profile for that class:

class SVM:
  def __init__(self):
    self.weights = np.random.randn(len(labels), X_train.shape[1]) * 0.1
    self.history = []

  def predict(self, X):
    '''
    returns class predictions in np array of size
    n x num_classes, where n is the number of examples in X
    '''

    #matrix multiplication to apply weights to X
    bounds = self.weights @ X.T

    #return the predictions
    return np.array(bounds).T

  def loss(self, scores, y, delta=1):
    '''computes the loss'''
    #calculate and return the loss for a prediction and corresponding truth label
    #hinge loss in this case
    total_loss = 0

    #compute loss for each example...
    for i in range(len(scores)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      correct_score = scores_of_x[label]
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #use the scores for example x to compute the loss at x
      wj_xi = correct_score           #these should be a vector of INCORRECT scores
      wyi_xi = incorrect_scores       #this should be a vector of the CORRECT score
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0
      loss = np.sum(losses)           #sum the losses

      #add to the total loss
      total_loss += loss

    #return the loss
    avg_loss = total_loss / len(scores)
    return avg_loss

  def gradient(self, scores, X, y, delta=1):
    '''computes the gradient'''
    #calculate the loss and the gradient of the loss function
    #gradient of hinge loss function
    gradient = np.zeros(self.weights.shape)

    #calculate the gradient in each example in x
    for i in range(len(X)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      x = X[i]
      correct_score = scores_of_x[label]
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #
      ##
      ### start by computing the gradient of the weights of the correct classifier
      ##
      #
      wj_xi = correct_score           #these should be a vector of INCORRECT scores
      wyi_xi = incorrect_scores       #this should be a vector of the CORRECT score
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0

      #get number of nonzero losses, and scale data vector by them to get the loss
      num_contributing_classifiers = np.count_nonzero(losses)
      #print(f"Num loss contributors: {num_contributing_classifiers}")
      g = -1 * x * num_contributing_classifiers   #NOTE the -, very important here, doesn't apply to other scores

      #add the gradient of the correct classifier to the gradient
      gradient[label] += g  #because arrays are 0-indexed, but the labels are 1-indexed
      # print(f"correct label: {label}")
      #print(f"gradient:\n{gradient}")
      #
      ##
      ### then, compute the gradient of the weights for each incorrect classifier
      ##
      #
      for j in range(len(scores_of_x)):

        #skip the correct score, since we already did it
        if j == label:
          continue
        wj_xi = scores_of_x[j]          #should be a vector containing the score of the CURRENT classifier
        wyi_xi = correct_score          #should be a vector containing the score of the CORRECT classifier
        wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
        loss = np.maximum(0, wy_xi)   #lower bound the loss at 0

        #get whether this classifier contributed to the loss, and scale the data vector by that to get the gradient
        contributed_to_loss = 0
        if loss > 0:
          contributed_to_loss = 1

        g = x * contributed_to_loss        #either times 1 or times 0

        #add the gradient of the incorrect classifier to the gradient
        gradient[j] += g


    #divide the gradient by number of examples to get the average gradient
    return gradient / len(X)

  def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
    #gradient descent loop
    for epoch in range(epochs):
      self.history.append({'epoch': epoch})

      #create a batch of samples to calculate the gradient
      #NOTE: this significantly boosts the speed of training
      indices = np.random.choice(len(X), batch_size, replace=False)
      X_batch = X.iloc[indices]
      y_batch = y.iloc[indices]
      
      X_batch = X_batch.to_numpy()
      y_batch = y_batch.to_numpy()

      #evaluate class scores on training set
      predictions = self.predict(X_batch)
      predicted_classes = np.argmax(predictions, axis=1)

      #compute the loss: average hinge loss
      loss = self.loss(predictions, y_batch)
      self.history[-1]['loss'] = loss

      #compute accuracy on the test set, for an intuitive metric
      accuracy = np.mean(predicted_classes == y_batch)
      self.history[-1]['accuracy'] = accuracy
      
      #print progress
      if epoch%50 == 0 and verbose:
        print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")


      #compute the gradient on the scores assigned by the classifier
      gradient = self.gradient(predictions, X_batch, y_batch)
      
      #backpropagate the gradient to the weights + bias
      step = gradient * lr

      #perform a parameter update, in the negative??? direction of the gradient
      self.weights += step

That is my implementation. The fit() method is the one that trains the weights on the data passed in. I am at a stage where loss tends to decrease from one iteration to the next.

But, the problem is, accuracy drops down to zero even as loss decreases.

I know that they are not directly related, but shouldn't my accuracy generally trend upwards as loss goes down? This makes me think I have done something wrong in the loss() and gradient() methods. But, I can't seem to find where I went wrong. Also, sometimes, my loss will increase from one epoch to the next. This could be an impact of my batched evaluation of the gradient, but I am not certain.

Here is a link to my Jupyter notebook, which should let you run my code in its current state:

https://colab.research.google.com/drive/12z4DevKDicmT4iE6AlMGrRiN6He8R9_4#scrollTo=uBTUQlscWksP And here is a link to the data set I am using: https://www.kaggle.com/datasets/taweilo/fish-species-sampling-weight-and-height-data/code

1 Answer 1

0

To anyone who runs across this thread, I solved my problem. Turns out, I was misreading the formula, and had the locations of two of the terms mixed up. The comments in my original code were actually correct. The variables wj_xi and wyi_xi should actually be defined like this (in both the gradient and the loss methods):

wj_xi = incorrect_scores     #these should be a vector of INCORRECT scores
wyi_xi = correct_score       #this should be a vector of the CORRECT score

I had them flipped around. Also, as was mentioned in the replies, it is important to update the weights in the negative direction of the gradient, like so:

self.weights -= step

Full code:

class SVM:
  def __init__(self):
    self.weights = np.random.randn(len(labels), X_train.shape[1]) * 0.1  #9 sets of weights (9 classes) and 4 entries per set (3 features + 1 bias, shape= (9x4))
    self.history = []

  def predict(self, X):
    '''
    returns class predictions in np array of size
    n x 10, where n is the number of examples in X
    '''

    #matrix multiplication to apply weights to X
    bounds = self.weights @ X.T

    #return the predictions
    return np.array(bounds).T

  def loss(self, scores, y, delta=1):
    '''
    returns the average hinge loss of the batch
    '''
    #calculate and return the loss for a prediction and corresponding truth label
    #hinge loss in this case
    total_loss = 0

    #compute loss for each example...
    for i in range(len(scores)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      correct_score = scores_of_x[label]
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #use the scores for example x to compute the loss at x
      wj_xi = incorrect_scores           #these should be a vector of INCORRECT scores
      wyi_xi = correct_score       #this should be a vector of the CORRECT score
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0
      loss = np.sum(losses)           #sum the losses

      #add to the total loss
      total_loss += loss

    #return the loss
    avg_loss = total_loss / len(scores)  #divide by the number of examples to fund the average hinge loss per-example
    return avg_loss

  def gradient(self, scores, X, y, delta=1):
    '''
    returns the gradient of the loss function
    '''
    #calculate the loss and the gradient of the loss function
    #gradient of hinge loss function
    gradient = np.zeros(self.weights.shape)

    #calculate the gradient in each example in x
    for i in range(len(X)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      x = X[i]
      correct_score = scores_of_x[label]  #because arrays are 0-indexed, but the labels are 1-indexed
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #
      ##
      ### start by computing the gradient of the weights of the correct classifier
      ##
      #
      wj_xi = incorrect_scores           #these should be a vector of INCORRECT scores
      wyi_xi = correct_score       #this should be a vector of the CORRECT score
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0

      #get number of nonzero losses, and scale data vector by them to get the loss
      num_contributing_classifiers = np.count_nonzero(losses)
      #print(f"Num loss contributors: {num_contributing_classifiers}")
      g = -1 * x * num_contributing_classifiers   #NOTE the -, very important here, doesn't apply to other scores

      #add the gradient of the correct classifier to the gradient
      gradient[label] += g  #because arrays are 0-indexed, but the labels are 1-indexed
      # print(f"correct label: {label}")
      #print(f"gradient:\n{gradient}")
      #
      ##
      ### then, compute the gradient of the weights for each incorrect classifier
      ##
      #
      for j in range(len(scores_of_x)):

        #skip the correct score, since we already did it
        if j == label:
          continue
        wj_xi = scores_of_x[j]          #should be a vector containing the score of the CURRENT classifier
        wyi_xi = correct_score          #should be a vector containing the score of the CORRECT classifier
        wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
        loss = np.maximum(0, wy_xi)   #lower bound the loss at 0

        #get whether this classifier contributed to the loss, and scale the data vector by that to get the gradient
        contributed_to_loss = 0
        if loss > 0:
          contributed_to_loss = 1

        g = x * contributed_to_loss        #either times 1 or times 0

        #add the gradient of the incorrect classifier to the gradient
        gradient[j] += g


    #divide the gradient by number of examples to get the average gradient
    return gradient / len(X)

  def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
    '''
    trains the model on the training set
    '''
    #gradient descent loop
    for epoch in range(epochs):
      self.history.append({'epoch': epoch})

      #create a batch of samples to calculate the gradient
      #NOTE: this significantly boosts the speed of training
      indices = np.random.choice(len(X), batch_size, replace=False)
      X_batch = X.iloc[indices]
      y_batch = y.iloc[indices]
      X_batch = X_batch.to_numpy()
      y_batch = y_batch.to_numpy()

      #evaluate class scores on training set
      predictions = self.predict(X_batch)
      predicted_classes = np.argmax(predictions, axis=1)

      if epoch%50 == 0 and verbose:
        print(f"pred: {predicted_classes[:10]}")
        print(f"true: {y_batch[:10]}")


      #compute the loss: average hinge loss
      loss = self.loss(predictions, y_batch)
      self.history[-1]['loss'] = loss


      #compute accuracy on the test set, for an intuitive metric
      accuracy = np.mean(predicted_classes == y_batch)
      self.history[-1]['accuracy'] = accuracy

      #reduce the learning rate as training progresses
      # lr *= 0.999
      # self.history[-1]['lr'] = lr

      if epoch%50 == 0 and verbose:
        print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")


      #compute the gradient on the scores assigned by the classifier
      gradient = self.gradient(predictions, X_batch, y_batch)

      #print(gradient)

      #backpropagate the gradient to the weights + bias
      step = gradient * lr

      #perform a parameter update, in the negative direction of the gradient
      self.weights -= step

sm = SVM()
pred = sm.predict(np.array(X_train[0:1]))

sm.fit(X_train, y_train)
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.