In PyTorch I wrote a very simple CNN discriminator and trained it. Now I need to deploy it to make predictions, but the target machine has limited GPU memory and I run into an out-of-memory error. I thought setting requires_grad = False would stop PyTorch from storing gradient values, but it doesn't seem to make any difference.
My model has about 5 million parameters, yet predicting a single batch of input consumes about 1.2 GB of GPU memory. I don't see why inference should need that much.
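As a rough sanity check (my own back-of-envelope estimate, not a measurement), the parameters alone should only account for about 20 MB in float32, so they cannot explain the 1.2 GB:

num_params = 5_000_000   # roughly the size of my model
bytes_per_param = 4      # float32
print("%.1f MB" % (num_params * bytes_per_param / 1024 ** 2))
# -> about 19.1 MB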
So the question is: how can I reduce GPU memory usage when I only want to use my model to make predictions?
Here is a demo. I use discriminator.requires_grad_ to disable/enable autograd for all parameters, but it seems to have no effect.
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as functional
from pynvml.smi import nvidia_smi

nvsmi = nvidia_smi.getInstance()

def getMemoryUsage():
    usage = nvsmi.DeviceQuery("memory.used")["gpu"][0]["fb_memory_usage"]
    return "%d %s" % (usage["used"], usage["unit"])

print("Before GPU Memory: %s" % getMemoryUsage())

class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        # trainable layers
        # input: 2x256x256
        self.conv1 = nn.Conv2d(2, 8, 5, padding=2)    # 8x256x256
        self.pool1 = nn.MaxPool2d(2)                  # 8x128x128
        self.conv2 = nn.Conv2d(8, 32, 5, padding=2)   # 32x128x128
        self.pool2 = nn.MaxPool2d(2)                  # 32x64x64
        self.conv3 = nn.Conv2d(32, 96, 5, padding=2)  # 96x64x64
        self.pool3 = nn.MaxPool2d(4)                  # 96x16x16
        self.conv4 = nn.Conv2d(96, 256, 5, padding=2) # 256x16x16
        self.pool4 = nn.MaxPool2d(4)                  # 256x4x4
        self.num_flat_features = 4096
        self.fc1 = nn.Linear(4096, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 1)
        # loss function
        self.loss = nn.MSELoss()
        # other properties
        self.requires_grad = True

    def forward(self, x):
        y = x
        y = self.conv1(y)
        y = self.pool1(y)
        y = functional.relu(y)
        y = self.conv2(y)
        y = self.pool2(y)
        y = functional.relu(y)
        y = self.conv3(y)
        y = self.pool3(y)
        y = functional.relu(y)
        y = self.conv4(y)
        y = self.pool4(y)
        y = functional.relu(y)
        y = y.view((-1, self.num_flat_features))
        y = self.fc1(y)
        y = functional.relu(y)
        y = self.fc2(y)
        y = functional.relu(y)
        y = self.fc3(y)
        y = torch.sigmoid(y)
        return y

    def predict(self, x, score_th=0.5):
        if len(x.shape) == 3:
            # add a batch dimension for a single sample
            singlebatch = True
            x = x.view([1] + list(x.shape))
        else:
            singlebatch = False
        y = self.forward(x)
        label = (y > float(score_th))
        if singlebatch:
            # drop the batch dimension again
            y = y.view(list(y.shape)[1:])
        return label, y

    def requires_grad_(self, requires_grad=True):
        for parameter in self.parameters():
            parameter.requires_grad_(requires_grad)
        self.requires_grad = requires_grad

x = torch.cuda.FloatTensor(np.zeros([2, 256, 256]))
discriminator = Discriminator()
discriminator.to("cuda:0")
# comment/uncomment this line to see the difference
discriminator.requires_grad_(False)
discriminator.predict(x)
print("Requires grad", discriminator.requires_grad)
print("After GPU Memory: %s" % getMemoryUsage())
With the line discriminator.requires_grad_(False) commented out, I get this output:
Before GPU Memory: 6350MiB
Requires grad True
After GPU Memory: 7547MiB
And with the line uncommented, I get:
Before GPU Memory: 6350MiB
Requires grad False
After GPU Memory: 7543MiB
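So disabling requires_grad saves almost nothing (7547 MiB vs. 7543 MiB). For reference, this is the alternative I am wondering about: running inference under torch.no_grad() so that autograd does not record the forward pass at all, and checking memory with torch.cuda.memory_allocated(), which as far as I understand reports only tensor allocations, while nvidia-smi also counts the CUDA context. A sketch, not verified on my machine:

import torch

# Sketch (untested on the target machine): run inference with autograd
# disabled entirely, instead of only flipping requires_grad on parameters.
with torch.no_grad():
    label, score = discriminator.predict(x)

# PyTorch's view of tensor memory; nvidia-smi additionally counts the
# CUDA context and the caching allocator's reserved pool.
print("Tensor memory: %.1f MB" % (torch.cuda.memory_allocated() / 1024 ** 2))

Is torch.no_grad() the right mechanism here, or is the bulk of that 1.2 GB coming from something unrelated to autograd (e.g. the CUDA context itself)?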