为了实现CPU与GPU内存的共享,cuda采用了零拷贝内存,它值固定内存的一种,当然,也就是实际存储空间实在cpu上。
零拷贝内存的延迟高,在进行频繁的读写操作时尽量少用,否则会大大降低性能。
/*
*创建固定内存映射
*
* flags: cudaHostAllocDefault: make cudaHostAlloc same as "cudaMallocHost"
* cudaHostAllocPortable: 函数返回能被所有cuda上下文使用的固定内存,而不仅仅是执行内存分配的那个
* cudaHostAllocWriteCombined:返回写结合的内存,该内存可以在某个系统配置上通过PCIe总线上更快的传输
* cudaHostAllocMapped: 该标志返回映射到的设备内存地址空间的主机内存
*/
cudaError_t cudaHostAlloc(void **pHost, size_t count, unsigned int flags);
/*
*
* 获取映射得到的固定内存的设备指针
*/
cudaError_t cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
size_t nBytes = 2048;
float *h_A;
h_A = (float *)malloc(nBytes);
initialData(h_A, nBytes);
cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped);
cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0);
cudaFreeHost(h_A);
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* This example demonstrates the use of zero-copy memory to remove the need to
* explicitly issue a memcpy operation between the host and device. By mapping
* host, page-locked memory into the device's address space, the address can
* directly reference a host array and transfer its contents over the PCIe bus.
*
* This example compares performing a vector addition with and without zero-copy
* memory.
*/
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
printf("Arrays do not match!\n");
printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
gpuRef[i], i);
break;
}
}
return;
}
void initialData(float *ip, int size)
{
int i;
for (i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
for (int idx = 0; idx < N; idx++)
{
C[idx] = A[idx] + B[idx];
}
}
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i];
}
__global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i];
}
int main(int argc, char **argv)
{
// set up device
int dev = 0;
CHECK(cudaSetDevice(dev));
// get device properties
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
// check if support mapped memory
if (!deviceProp.canMapHostMemory)
{
printf("Device %d does not support mapping CPU host memory!\n", dev);
CHECK(cudaDeviceReset());
exit(EXIT_SUCCESS);
}
printf("Using Device %d: %s ", dev, deviceProp.name);
// set up data size of vectors
int ipower = 10;
if (argc > 1) ipower = atoi(argv[1]);
int nElem = 1 << ipower;
size_t nBytes = nElem * sizeof(float);
if (ipower < 18)
{
printf("Vector size %d power %d nbytes %3.0f KB\n", nElem, ipower,
(float)nBytes / (1024.0f));
}
else
{
printf("Vector size %d power %d nbytes %3.0f MB\n", nElem, ipower,
(float)nBytes / (1024.0f * 1024.0f));
}
// part 1: using device memory
// malloc host memory
float *h_A, *h_B, *hostRef, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
// initialize data at host side
initialData(h_A, nElem);
initialData(h_B, nElem);
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// add vector at host side for result checks
sumArraysOnHost(h_A, h_B, hostRef, nElem);
// malloc device global memory
float *d_A, *d_B, *d_C;
CHECK(cudaMalloc((float**)&d_A, nBytes));
CHECK(cudaMalloc((float**)&d_B, nBytes));
CHECK(cudaMalloc((float**)&d_C, nBytes));
// transfer data from host to device
CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
// set up execution configuration
int iLen = 512;
dim3 block (iLen);
dim3 grid ((nElem + block.x - 1) / block.x);
sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
// copy kernel result back to host side
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
// check device results
checkResult(hostRef, gpuRef, nElem);
// free device global memory
CHECK(cudaFree(d_A));
CHECK(cudaFree(d_B));
// free host memory
free(h_A);
free(h_B);
// part 2: using zerocopy memory for array A and B
// allocate zerocpy memory
CHECK(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped));
CHECK(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped));
// initialize data at host side
initialData(h_A, nElem);
initialData(h_B, nElem);
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// pass the pointer to device
CHECK(cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0));
CHECK(cudaHostGetDevicePointer((void **)&d_B, (void *)h_B, 0));
// add at host side for result checks
sumArraysOnHost(h_A, h_B, hostRef, nElem);
// execute kernel with zero copy memory
sumArraysZeroCopy<<<grid, block>>>(d_A, d_B, d_C, nElem);
// copy kernel result back to host side
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
// check device results
checkResult(hostRef, gpuRef, nElem);
// free memory
CHECK(cudaFree(d_C));
CHECK(cudaFreeHost(h_A));
CHECK(cudaFreeHost(h_B));
free(hostRef);
free(gpuRef);
// reset device
CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}