Cuda编程Thrust库

傅奕

2023-12-01

1.Thrust介绍

Thrust是一个C++模板库，为GPU计算提供高效的算法和数据结构。它是由NVIDIA开发的，旨在提供一个简单易用的接口来开发高性能GPU程序。

Thrust库基于CUDA，并且是在CUDA C++语言扩展的基础上构建的。Thrust为GPU编程提供了一种高级编程范式，使得开发人员可以使用类似于STL的算法和数据结构来加速应用程序。

Thrust库提供了一个广泛的算法库，包括排序、归约、扫描、拷贝、填充等等。这些算法都针对GPU进行了优化，以获得最佳性能。在数据结构方面，Thrust主要提供类似于std::vector的向量容器（如host_vector和device_vector），并不提供列表、映射、集合等其他STL风格的容器。与其他GPU编程框架相比，Thrust的优势在于它保留了C++语言的通用性，并且提供了一个高度模板化的API，使得开发人员可以轻松地将其应用于现有的代码中。同时，Thrust还提供了高度抽象化的接口，使得开发人员可以专注于算法本身，而不需要过多地关注底层的GPU架构和细节。（尤其在深度学习部署时，可以让前处理与后处理部分也运行在GPU上，从而提升算法的整体运行速度。）

2.环境

Thrust的配置环境与CUDA编程的配置环境相同，可以参考我之前写的文章：CUDA编程（一）安装。

3.vector

3.1host_vector 和device_vector介绍

thrust提供两个矢量容器host_vector 和device_vector。
host_vector 存储在主机内存中。
device_vector 存储在GPU设备内存中；使用它的代码必须写在 .cu 文件中并由 nvcc 编译，否则将无法编译运行。

#include <iostream>
using namespace std;
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
// Walks through the basics of thrust::host_vector and thrust::device_vector:
// element assignment, resizing, host-to-device copy, and reading device
// elements from host code.
int main()
{
  // Host-side vector with room for four ints.
  thrust::host_vector<int> H(4);

  // Set the elements one at a time.
  H[0] = 14;
  H[1] = 20;
  H[2] = 38;
  H[3] = 46;

  // Report the current element count.
  std::cout << "H has size " << H.size() << std::endl;

  // Dump every element of H together with its index.
  for (auto it = H.begin(); it != H.end(); ++it)
  {
    std::cout << "H[" << (it - H.begin()) << "] = " << *it << std::endl;
  }

  // Shrink H down to two elements.
  H.resize(2);

  std::cout << "H now has size " << H.size() << std::endl;

  // Initialising a device_vector from a host_vector copies H's contents into D.
  thrust::device_vector<int> D = H;

  // Overwrite the elements of D from host code.
  D[0] = 99;
  D[1] = 88;

  // Dump every element of D together with its index.
  for (auto it = D.begin(); it != D.end(); ++it)
  {
    std::cout << "D[" << (it - D.begin()) << "] = " << *it << std::endl;
  }

  // H and D release their storage automatically when main returns.
  return 0;
}

这个例子说明“=”可以用于显存上的数组与内存上的数组之间互相拷贝内容。要注意，device_vector 的单个元素可以使用标准的方括号下标来访问；但是，由于每次这样的访问都需要调用一次 cudaMemcpy，应当谨慎使用。稍后会给出一些更高效的方式。

3.2初始化与复制的方法

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <iostream>
using namespace std; 
// Shows several ways to initialise and copy Thrust vectors:
// fill-construction, thrust::fill, range-construction, thrust::sequence
// and thrust::copy.
int main(void)
{
    // Device vector of ten elements, each set to 1.
    thrust::device_vector<int> D(10, 1);

    // Overwrite indices 0..6 with 9.
    auto fill_end = D.begin() + 7;
    thrust::fill(D.begin(), fill_end, 9);

    // Build a host vector from the first five elements of D.
    auto prefix_end = D.begin() + 5;
    thrust::host_vector<int> H(D.begin(), prefix_end);

    // Replace H's contents with 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());

    // Copy all of H over the beginning of D.
    thrust::copy(H.begin(), H.end(), D.begin());

    // Print D.
    for (auto it = D.begin(); it != D.end(); ++it)
    {
        std::cout << "D[" << (it - D.begin()) << "] = " << *it << std::endl;
    }

    std::cout << "柯西的笔" << std::endl;
    return 0;
}

3.3原始指针与device_ptr之间的相互转化

#include <iostream>
using namespace std;
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_malloc.h>
int main()
{
  size_t N = 10;


  // 申请gpu内存
  int* raw_ptr;
  cudaMalloc((void**)&raw_ptr, N * sizeof(int));


  //用device_ptr封装该指针
  thrust::device_ptr<int> dev_ptr(raw_ptr);


  // 全部初始化为0
  thrust::fill(dev_ptr, dev_ptr + N, (int)0);


  //打印
  for (int i = 0; i < N; i++)
    std::cout << "dev_ptr[" << i << "] = " << dev_ptr[i] << std::endl;


  // 将device_ptr变回普通指针
  int* raw_ptr2 = thrust::raw_pointer_cast(dev_ptr);


  cout << "柯西的笔" << endl;
  return 0;
}

4.算法

thrust提供了很多公用的并行算法，所有算法都有device和host两种实现。

4.1Transformations

下面将介绍一些算法的使用：thrust::negate<int>() 用于取相反数，thrust::modulus<int>() 用于取余数。thrust还提供了许多其他算法，有需要的可以阅读官方文档。

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>
int main(void)
{
    // 定义3个device_vectors
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);


    // X初始化0,1,2,3, ....
    thrust::sequence(X.begin(), X.end());


    // compute Y = -X     thrust::negate<int>()取反操作
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
    for (int i = 0; i < Y.size(); i++)
        std::cout << "Y[" << i << "] = " << Y[i] << std::endl;


    // Z全部用2填充
    thrust::fill(Z.begin(), Z.end(), 2);


    // compute Y = X mod Z   thrust::modulus<int>()相除余数
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());  
    for (int i = 0; i < Y.size(); i++)
        std::cout << "Y[" << i << "] = " << Y[i] << std::endl;


    // 将Y中所有的1替换成10
    thrust::replace(Y.begin(), Y.end(), 1, 10);
    for (int i = 0; i < Y.size(); i++)
        std::cout << "Y[" << i << "] = " << Y[i] << std::endl;


    return 0;
}

实现Y = A*X+Y的两种方法

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <iostream>
using namespace std; 
//实现方式1
void saxpy_slow(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y)
{
    thrust::device_vector<float> temp(X.size());


    // 用数字A填充temp
    thrust::fill(temp.begin(), temp.end(), A);


    // temp = A * X   thrust::multiplies<float>()相乘
    thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(), thrust::multiplies<float>()); 


    // Y = A * X + Y  thrust::plus<float>()相加
    thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(), thrust::plus<float>());
}
// Approach 2: a binary functor that evaluates a * x + y for one element pair,
// so the whole operation fits in a single thrust::transform call.
struct saxpy_functor
{
    // Scalar multiplier, fixed when the functor is constructed.
    const float a;

    saxpy_functor(float scale) : a(scale) {}

    // Marked __host__ __device__ so it can be invoked from either side.
    __host__ __device__
    float operator()(const float& x, const float& y) const
    {
        return a * x + y;
    }
};
void saxpy_fast(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y)
{
    // Y = A * X + Y
    thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}
// Driver for the saxpy example: builds the inputs, runs saxpy_fast and
// prints the resulting Y.
int main()
{
    // Inputs: A = 10, X = 0, 1, 2, ..., 9 and Y = 5 in every slot.
    float A = 10;
    thrust::device_vector<float> X(10);
    thrust::device_vector<float> Y(10);
    thrust::sequence(X.begin(), X.end());
    thrust::fill(Y.begin(), Y.end(), 5);

    // Disabled alternative: run the multi-pass implementation instead.
    /*saxpy_slow(A, X, Y);
    for (auto it = Y.begin(); it != Y.end(); ++it)
        std::cout << "Y[" << (it - Y.begin()) << "] = " << *it << std::endl;*/

    saxpy_fast(A, X, Y);
    for (auto it = Y.begin(); it != Y.end(); ++it)
    {
        std::cout << "Y[" << (it - Y.begin()) << "] = " << *it << std::endl;
    }

    std::cout << "柯西的笔" << std::endl;
    return 0;
}

saxpy_fast共有2N次读,1N写的开销
saxpy_slow有4N次读,3N次写的开销
在像这样的算法中，通常值得应用kernel fusion（将多个操作组合到单个kernel中）以最小化内存读写的开销。

4.2Reductions(归约)

归约算法使用二元操作符将一个序列的值转换为单个值，可以求和，求最大值等

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <iostream>
using namespace std;
int main()
{
  thrust::device_vector<int> D(10);
  thrust::sequence(D.begin(), D.end());


  //这里前两个参数提供归约序列,后两个提供初始值和归约运算符
  int sum = thrust::reduce(D.begin(), D.end(), (int)0, thrust::plus<int>());
  /*int sum = thrust::reduce(D.begin(), D.end(), (int)0);
  int sum = thrust::reduce(D.begin(), D.end());*/
 
  cout << sum << endl;
  return 0;
  }

4.3其他算法

最大值、最大值索引、最小值、最小值索引、相同数据计数、累积和、排序、按键（key）排序等等。

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/count.h>
#include <thrust/extrema.h>
#include <thrust/sort.h>
#include <thrust/remove.h>
#include <thrust/unique.h>
#include <thrust/scan.h>
#include <iostream>
using namespace std;
int main()
{
  thrust::device_vector<int> D(10);
  thrust::sequence(D.begin(), D.end());


  /**************************************/
  //计算对应数据数量
  D[2] = 5;
  D[7] = 5;
  D[9] = 5;
  int result = thrust::count(D.begin(), D.end(), 5);
  cout << result << endl;    //有4个5


  //输出最小值和最小位置
  thrust::device_vector<int>::iterator minindex = thrust::min_element(D.begin(), D.end());
  std::cout << "min value : " << *minindex << " at position " << (minindex - D.begin()) << std::endl;


  //输出最大值和最大位置
  thrust::device_vector<int>::iterator maxindex = thrust::max_element(D.begin(), D.end());
  std::cout << "max value : " << *maxindex << " at position " << (maxindex - D.begin()) << std::endl;
  
  /*********************************************/
  //去除对应元素
  thrust::device_vector<int> F(5, 6);
  F[2] = 4;
  auto NewF_End = thrust::remove(F.begin(), F.end(), 4);
  for (auto it = F.begin(); it != NewF_End; it++)
    std::cout << "NewF = " << *it << std::endl;
  
  //去除相邻的重复元素
  thrust::device_vector<int> C(10);
  thrust::sequence(C.begin(), C.end());
  C[0] = 1;
  C[6] = 1;
  auto NewC_End = thrust::unique(C.begin(), C.end());
  for (auto it = C.begin(); it != NewC_End; it++)
    std::cout << "NewC = " << *it << std::endl;


  //求累积和
  thrust::device_vector<int> Input(5);
  thrust::device_vector<int> Output(5);
  thrust::sequence(Input.begin(), Input.end());
  thrust::inclusive_scan(Input.begin(), Input.end(), Output.begin());
  for (int i =0;i<Output.size();i++)
    std::cout << "Output[" << i << "] = " << Output[i] << std::endl;


  /**********************************************/
  //排序
  /*thrust::sort(D.begin(), D.end());
  for (int i = 0; i < D.size(); i++)
    std::cout << "D[" << i << "] = " << D[i] << std::endl;*/


  //稳定排序算法(从小到大
  thrust::stable_sort(D.begin(), D.end());
  for (int i = 0; i < D.size(); i++)
    std::cout << "D[" << i << "] = " << D[i] << std::endl;
  //稳定排序算法(从大到小
  thrust::stable_sort(D.begin(), D.end(), thrust::greater<int>());
  for (int i = 0; i < D.size(); i++)
    std::cout << "D[" << i << "] = " << D[i] << std::endl;


  //通过key排序
  vector<int> keys = { 0,2,4,1,3 };
  vector<string> values = { "0","2","4","1","3" };
  /*thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  for (int i = 0; i < keys.size(); i++)
    std::cout << "keys[" << i << "] = " << keys[i] <<"  " << "values[" << i << "] ="<<values[i]<<std::endl;*/
  //稳定key排序
  thrust::stable_sort_by_key(keys.begin(), keys.end(), values.begin());
  for (int i = 0; i < keys.size(); i++)
    std::cout << "keys[" << i << "] = " << keys[i] << "  " << "values[" << i << "] =" << values[i] << std::endl;


  cout << "柯西的笔" << endl;
  return 0;
}

5.小结

Thrust是一个易用、高性能、可扩展和可移植的GPU编程库，可以帮助程序员更轻松地利用GPU的并行性能来加速通用计算。

参考文献

https://docs.nvidia.com/cuda/thrust/

Cuda编程Thrust库

3.vector

相关阅读

相关文章

相关问答

相关文档