Environment: torch 1.7, dgl 0.5, pyg 1.6
# -*- coding: utf-8 -*-
"""
@Author:
@Time: 2020/12/1
@Description:
"""
import torch
import numpy as np
import torch_sparse
import dgl
import time
n = 60000
nnz = 50000
np.random.seed(123)
torch.manual_seed(123)
rows = np.random.randint(0, n, nnz)
cols = np.random.randint(0, n, nnz)
values = torch.randn(nnz).cuda().requires_grad_(True)
# Approach 1: torch.sparse.mm on a sparse COO tensor
X_sparse = torch.sparse_coo_tensor([rows, cols], values, size=(n, n)).cuda().requires_grad_(True)
Y_dense = torch.randn((n, 200)).cuda().requires_grad_(True)
print("memory allocated before multi: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before multi: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
# t = torch_sparse.spmm(torch.tensor([rows, cols], dtype=torch.long).cuda(), values, n, n, Y_dense).sum()
torch.cuda.synchronize()
start_time = time.time()
t = torch.sparse.mm(X_sparse, Y_dense).sum()
print("memory allocated before backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
t.backward()
print("t: {}".format(t))
print("values grad: {}".format(values.grad))
# print("x.grad: {0} y.grad: {1}".format(X_sparse.grad, Y_dense.grad))
print("memory allocated after backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated after backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
torch.cuda.synchronize()
print("spmm and backward time is {} s".format(time.time() - start_time))
With n = 60000, nnz = 50000:
Using backend: pytorch
memory allocated before multi: 0.049234944 GB
max memory allocated before multi: 0.049234944 GB
memory allocated before backward: 0.049235456 GB
max memory allocated before backward: 0.147145216 GB
t: 1653.47900390625
values grad: tensor([ -9.5381, -11.1868, -7.3291, ..., -7.0068, 10.7648, -11.1320],
device='cuda:0')
memory allocated after backward: 0.097670144 GB
max memory allocated after backward: 14.546705408 GB
spmm and backward time is 0.5331666469573975 s
With n = 61000, nnz = 50000 (showing that memory usage is sensitive to the number of nodes):
Using backend: pytorch
memory allocated before multi: 0.049800704 GB
max memory allocated before multi: 0.049800704 GB
memory allocated before backward: 0.049801216 GB
max memory allocated before backward: 0.148846592 GB
Traceback (most recent call last):
File "/home/maqy/gnn/ginn_batch_compare/GINN-1130/memory_test.py", line 34, in <module>
t.backward()
File "/root/miniconda3/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/root/miniconda3/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 13.86 GiB (GPU 0; 14.76 GiB total capacity; 94.99 MiB already allocated; 13.72 GiB free; 100.00 MiB reserved in total by PyTorch)
Process finished with exit code 1
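The numbers above suggest that the backward pass of torch.sparse.mm materializes a dense n x n float32 gradient for the sparse operand (an inference from the reported sizes, not from reading the kernel). A quick back-of-the-envelope check:

# Assumed explanation: a dense n x n float32 gradient is allocated during backward.
print(61000 * 61000 * 4 / 2 ** 30)   # ~13.86 GiB, matching the failed allocation in the OOM above
print(60000 * 60000 * 4 / 10 ** 9)   # ~14.4 GB, close to the 14.55 GB peak reported for n = 60000

This would also explain why peak memory depends on the number of nodes rather than on nnz.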
import torch
import numpy as np
import torch_sparse
import dgl
import time
n = 60000
nnz = 50000
np.random.seed(123)
torch.manual_seed(123)
rows = np.random.randint(0, n, nnz)
cols = np.random.randint(0, n, nnz)
values = torch.randn(nnz).cuda().requires_grad_(True)
# Approach 2: torch_sparse.spmm (the torch.sparse.mm version is commented out below)
# X_sparse = torch.sparse_coo_tensor([rows, cols], values, size=(n, n)).cuda().requires_grad_(True)
Y_dense = torch.randn((n, 200)).cuda().requires_grad_(True)
print("memory allocated before multi: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before multi: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
torch.cuda.synchronize()
start_time = time.time()
t = torch_sparse.spmm(torch.tensor([rows, cols], dtype=torch.long).cuda(), values, n, n, Y_dense).sum()
print("memory allocated before backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
t.backward()
print("t: {}".format(t))
print("values grad: {}".format(values.grad))
# print("x.grad: {0} y.grad: {1}".format(X_sparse.grad, Y_dense.grad))
print("memory allocated after backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated after backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
torch.cuda.synchronize()
print("spmm and backward time is {} s".format(time.time() - start_time))
With n = 60000, nnz = 50,000 (the scale at which native torch is already close to OOM):
Using backend: pytorch
memory allocated before multi: 0.048434688 GB
max memory allocated before multi: 0.048434688 GB
memory allocated before backward: 0.089235456 GB
max memory allocated before backward: 0.17746944 GB
t: 1653.479248046875
values grad: tensor([ -9.5381, -11.1868, -7.3291, ..., -7.0068, 10.7648, -11.1320],
device='cuda:0')
memory allocated after backward: 0.096869888 GB
max memory allocated after backward: 0.20943616 GB
spmm and backward time is 0.03326892852783203 s
Process finished with exit code 0
However, torch_sparse uses a gather-scatter pattern, so GPU memory grows noticeably as the number of edges increases (a sketch of this pattern follows the results below). For example:
With n = 60000, nnz = 500,000:
Using backend: pytorch
memory allocated before multi: 0.05023488 GB
max memory allocated before multi: 0.05023488 GB
memory allocated before backward: 0.458791424 GB
max memory allocated before backward: 0.90758144 GB
t: -14356.595703125
values grad: tensor([ 11.1778, -22.7249, -4.5888, ..., -24.6294, 4.4844, 4.5446],
device='cuda:0')
memory allocated after backward: 0.100470272 GB
max memory allocated after backward: 1.662460416 GB
spmm and backward time is 0.17125725746154785 s
Process finished with exit code 0
With n = 61000, nnz = 5,000,000:
memory allocated before multi: 0.069206016 GB
max memory allocated before multi: 0.069206016 GB
memory allocated before backward: 4.149206528 GB
max memory allocated before backward: 8.197440512 GB
Traceback (most recent call last):
File "/home/maqy/gnn/ginn_batch_compare/GINN-1130/memory_test.py", line 33, in <module>
t.backward()
File "/root/miniconda3/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/root/miniconda3/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 3.73 GiB (GPU 0; 14.76 GiB total capacity; 11.31 GiB already allocated; 2.50 GiB free; 11.32 GiB reserved in total by PyTorch)
Process finished with exit code 1
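The "memory allocated before backward" numbers explain what happens: torch_sparse.spmm gathers one source row per edge, scales it by the edge value, and scatter-adds it into the destination row, so it holds an intermediate of shape (nnz, 200). That is roughly 50,000 * 200 * 4 bytes ≈ 0.04 GB at nnz = 50,000, 0.4 GB at nnz = 500,000, and 5,000,000 * 200 * 4 bytes ≈ 3.73 GiB at nnz = 5,000,000, which matches the failed allocation in the OOM above. A minimal sketch of the gather-scatter pattern in plain PyTorch (illustration only, not the actual torch_sparse kernel):

import torch

def spmm_gather_scatter(rows, cols, values, n, dense):
    # Computes A @ dense for a sparse A with A[rows[i], cols[i]] = values[i].
    # rows, cols: LongTensors of length nnz on the same device as dense;
    # values: (nnz,); dense: (n, d).
    gathered = dense.index_select(0, cols) * values.unsqueeze(-1)  # (nnz, d) per-edge intermediate
    out = torch.zeros(n, dense.size(1), device=dense.device, dtype=dense.dtype)
    out.index_add_(0, rows, gathered)  # scatter-add into destination rows
    return out

The (nnz, d) intermediate is where the extra memory goes, and the backward pass needs per-edge buffers of the same size again.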
Using DGL instead:
import torch
import numpy as np
import torch_sparse
import dgl
import time
n = 60000
nnz = 50000
np.random.seed(123)
torch.manual_seed(123)
rows = np.random.randint(0, n, nnz)
cols = np.random.randint(0, n, nnz)
values = torch.randn(nnz).cuda().requires_grad_(True)
# Approach 3: DGL gspmm (the torch.sparse.mm version is commented out below)
# X_sparse = torch.sparse_coo_tensor([rows, cols], values, size=(n, n)).cuda().requires_grad_(True)
Y_dense = torch.randn((n, 200)).cuda().requires_grad_(True)
# Note: the edges must be passed as (cols, rows), i.e. cols first, for the result to match the other implementations
g = dgl.graph((cols, rows))
g = g.to("cuda:0")
g.srcdata['h'] = Y_dense
g.edata['e'] = values
print("memory allocated before multi: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before multi: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
torch.cuda.synchronize()
start_time = time.time()
t = dgl.ops.gspmm(g, 'mul', 'sum', lhs_data=g.srcdata['h'], rhs_data=g.edata['e']).sum()
print("memory allocated before backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
t.backward()
print("t: {}".format(t))
print("values grad: {}".format(values.grad))
# print("x.grad: {0} y.grad: {1}".format(X_sparse.grad, Y_dense.grad))
print("memory allocated after backward: {} GB".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated after backward: {} GB".format(torch.cuda.max_memory_allocated() / 10 ** 9))
torch.cuda.synchronize()
print("spmm and backward time is {} s".format(time.time() - start_time))
With n = 60000, nnz = 50000:
Using backend: pytorch
memory allocated before multi: 0.048434688 GB
max memory allocated before multi: 0.048434688 GB
memory allocated before backward: 0.0484352 GB
max memory allocated before backward: 0.09667072 GB
t: 1653.47900390625
values grad: tensor([ -9.5381, -11.1868, -7.3291, ..., -7.0068, 10.7648, -11.1320],
device='cuda:0')
memory allocated after backward: 0.096869888 GB
max memory allocated after backward: 0.145104896 GB
spmm and backward time is 0.02077507972717285 s
Process finished with exit code 0
With n = 60000, nnz = 500,000:
Using backend: pytorch
memory allocated before multi: 0.05023488 GB
max memory allocated before multi: 0.05023488 GB
memory allocated before backward: 0.050235392 GB
max memory allocated before backward: 0.098470912 GB
t: -14356.599609375
values grad: tensor([ 11.1778, -22.7249, -4.5888, ..., -24.6294, 4.4844, 4.5446],
device='cuda:0')
memory allocated after backward: 0.100470272 GB
max memory allocated after backward: 0.14870528 GB
spmm and backward time is 0.10973095893859863 s
Process finished with exit code 0
With n = 60000, nnz = 5,000,000:
Using backend: pytorch
memory allocated before multi: 0.069206016 GB
max memory allocated before multi: 0.069206016 GB
memory allocated before backward: 0.069206528 GB
max memory allocated before backward: 0.117442048 GB
t: 7377.28125
values grad: tensor([ 18.3335, -23.7182, -4.3458, ..., 13.7045, 9.3865, -12.4589],
device='cuda:0')
memory allocated after backward: 0.13744128 GB
max memory allocated after backward: 0.185676288 GB
spmm and backward time is 1.038252353668213 s
Process finished with exit code 0
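For reference, dgl.ops.gspmm(g, 'mul', 'sum', ...) should be equivalent to DGL's message-passing API with a u_mul_e message and a sum reduction (a sketch, assuming g, Y_dense and values are set up as in the script above):

import dgl.function as fn

g.srcdata['h'] = Y_dense
g.edata['e'] = values
g.update_all(fn.u_mul_e('h', 'e', 'm'), fn.sum('m', 'h_out'))  # message: u * e, reduce: sum
t = g.dstdata['h_out'].sum()

The builtin message/reduce functions are expected to dispatch to the same fused spmm kernel, so the memory behaviour should match the direct gspmm call.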
Code to verify that DGL and torch_sparse produce the same result:
import torch
import numpy as np
import torch_sparse
import dgl
import dgl.function as fn
torch.manual_seed(123)
n = 10
nnz = 2
rows = np.random.randint(0, n, nnz)
cols = np.random.randint(0, n, nnz)
a = torch.randn(nnz)
values = a.cuda().requires_grad_(True)
values2 = a.cuda().requires_grad_(True)
# Toy-sized inputs shared by the DGL and torch_sparse calls
Y_dense = torch.randn((n, 2)).cuda().requires_grad_(True)
g = dgl.graph([])
g.add_nodes(n)
g.add_edges(cols, rows)
g = g.to("cuda:0")
g.srcdata['h'] = Y_dense
g.edata['e'] = values
print("memory allocated before multi: {}".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before multi: {}".format(torch.cuda.max_memory_allocated() / 10 ** 9))
t1 = dgl.ops.gspmm(g, 'mul', 'sum', lhs_data=g.srcdata['h'], rhs_data=g.edata['e']).sum()
t2 = torch_sparse.spmm(torch.tensor([rows, cols], dtype=torch.long).cuda(), values2, n, n, Y_dense).sum()
print("memory allocated before backward: {}".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated before backward: {}".format(torch.cuda.max_memory_allocated() / 10 ** 9))
t1.backward()
t2.backward()
print("t1: {}".format(t1))
print("t2: {}".format(t2))
print("values grad: {}".format(values.grad))
print("values2 grad: {}".format(values2.grad))
# print("x.grad: {0} y.grad: {1}".format(X_sparse.grad, Y_dense.grad))
print("memory allocated after backward: {}".format(torch.cuda.memory_allocated() / 10 ** 9))
print("max memory allocated after backward: {}".format(torch.cuda.max_memory_allocated() / 10 ** 9))
Output:
memory allocated before multi: 1.536e-06
max memory allocated before multi: 1.536e-06
memory allocated before backward: 3.584e-06
max memory allocated before backward: 4.096e-06
t1: 0.010851062834262848
t2: 0.010851062834262848
values grad: tensor([1.2198, 1.2198], device='cuda:0')
values2 grad: tensor([1.2198, 1.2198], device='cuda:0')
memory allocated after backward: 4.096e-06
max memory allocated after backward: 8.704e-06
Process finished with exit code 0
In terms of speed, timing spmm plus backward with n = 60000, nnz = 50000:
native torch:
spmm and backward time is 0.5331666469573975 s
torch_sparse:
spmm and backward time is 0.03326892852783203 s
DGL:
spmm and backward time is 0.02077507972717285 s
The DGL approach is indeed somewhat faster.
With n = 60000, nnz = 500,000:
native torch:
spmm and backward time is 0.5481641292572021 s
torch_sparse:
spmm and backward time is 0.17125725746154785 s
DGL:
spmm and backward time is 0.10973095893859863 s
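In practice, the DGL call can be wrapped as a drop-in replacement for torch_sparse.spmm. A sketch (spmm_dgl is just an illustrative name; note the (cols, rows) edge order discussed above):

import dgl
import torch

def spmm_dgl(rows, cols, values, n, dense):
    # Computes A @ dense where A is sparse with A[rows[i], cols[i]] = values[i].
    # Edges run from source cols to destination rows so messages flow the right way.
    g = dgl.graph([])
    g.add_nodes(n)
    g.add_edges(cols, rows)
    g = g.to(dense.device)
    return dgl.ops.gspmm(g, 'mul', 'sum', lhs_data=dense, rhs_data=values)

With the inputs from the scripts above, spmm_dgl(rows, cols, values, n, Y_dense).sum() should reproduce the t values reported in the DGL runs.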