本人 CUDA小白一枚,要是有什么不对,还望各位大佬指点。
Thrust是一种C++的并行算法库,thrust的接口大大提高了工作的效率,同时还实现了GPU和多核CPU之间的性能的可移植性。
本文及后面的几篇将分别从几个方面来大概阐述一下Thrust的一些接口。原来的网址在这里。
1.Algorithms
2.Container Classes
3.Containers
4.Function Objects
5.Iterators
6.Memory Management
7.Numerics
8.Parallel Execution Policies
9.Random Number Generation
10. System
11. Utility
1.1 Copying
1.1.1 Gathering
1.1.1.1 gather
// thrust::gather, overload that takes an execution policy `exec` as its first argument.
// For every position i of the map range [map_first, map_last), the source element
// input_first[map[i]] is written to result[i].
template<typename DerivedPolicy, typename InputIterator, typename RandomAccessIterator, typename OutputIterator>
__host__ __device__ OutputIterator thrust::gather(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator map_first,
InputIterator map_last,
RandomAccessIterator input_first,
OutputIterator result
);
// Overload without an execution policy argument; the remaining parameters are the same.
template<typename InputIterator, typename RandomAccessIterator, typename OutputIterator>
OutputIterator thrust::gather(
InputIterator map_first,
InputIterator map_last,
RandomAccessIterator input_first,
OutputIterator result
);
按照map给出的下标从源数据中取值,并依次存放到目标地址,即 result[i] = input[map[i]]。
例子:
// Source data: ones at even indices, zeros at odd indices.
int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
thrust::device_vector<int> d_values(values, values + 10);
// Map: the even indices first, then the odd indices.
int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
thrust::device_vector<int> d_map(map, map + 10);
thrust::device_vector<int> d_output1(10);
thrust::device_vector<int> d_output2(10);
// Call with an explicit execution policy.
thrust::gather(thrust::device, d_map.begin(), d_map.end(), d_values.begin(), d_output1.begin());
// Call without an execution policy.
thrust::gather(d_map.begin(), d_map.end(), d_values.begin(), d_output2.begin());
// Final output: {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
1.1.1.2 gather_if
// thrust::gather_if, execution-policy overload without a predicate.
// Like gather, but position i is only copied (result[i] = input_first[map[i]])
// when the corresponding stencil element satisfies the condition; other
// positions of result are left untouched.
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename RandomAccessIterator, typename OutputIterator>
__host__ __device__ OutputIterator thrust::gather_if(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator1 map_first,
InputIterator1 map_last,
InputIterator2 stencil,
RandomAccessIterator input_first,
OutputIterator result
);
// Same as above, without an execution policy argument.
template<typename InputIterator1, typename InputIterator2, typename RandomAccessIterator, typename OutputIterator>
OutputIterator thrust::gather_if(
InputIterator1 map_first,
InputIterator1 map_last,
InputIterator2 stencil,
RandomAccessIterator input_first,
OutputIterator result
);
// Execution-policy overload with a predicate: `pred` is applied to each stencil
// element and decides whether the corresponding position is copied.
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename RandomAccessIterator, typename OutputIterator, typename Predicate>
__host__ __device__ OutputIterator thrust::gather_if(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator1 map_first,
InputIterator1 map_last,
InputIterator2 stencil,
RandomAccessIterator input_first,
OutputIterator result,
Predicate pred
);
// Predicate overload without an execution policy argument.
template<typename InputIterator1, typename InputIterator2, typename RandomAccessIterator, typename OutputIterator, typename Predicate>
OutputIterator thrust::gather_if(
InputIterator1 map_first,
InputIterator1 map_last,
InputIterator2 stencil,
RandomAccessIterator input_first,
OutputIterator result,
Predicate pred
);
gather_if与gather不同的是,拷贝的过程是带有一定条件的:只有stencil中对应位置的值符合条件,才会将该元素拷贝到目的地址,否则目的地址中的值保持不变。
例子:
// Source data to gather from. (Named `values` so it matches its use below;
// the declaration also needs its terminating semicolon.)
int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
thrust::device_vector<int> d_values(values, values + 10);
// Stencil: a non-zero entry means "copy this position".
int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
thrust::device_vector<int> d_stencil(stencil, stencil + 10);
int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
thrust::device_vector<int> d_map(map, map + 10);
// Pre-fill the output with 7 so that untouched positions are visible.
thrust::device_vector<int> d_output(10, 7);
thrust::gather_if(thrust::device, d_map.begin(), d_map.end(), d_stencil.begin(), d_values.begin(), d_output.begin());
// d_output: {7, 7, 7, 7, 7, 7, 7, 7, 7, 7} -> {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
// (position 8 is copied as well: values[map[8]] == values[7] == 7)
thrust::device_vector<int> d_output1(10, 7);
thrust::gather_if(d_map.begin(), d_map.end(), d_stencil.begin(), d_values.begin(), d_output1.begin());
// Same result as d_output.

// Predicate applied to each stencil element: true for even values.
struct is_even {
    __host__ __device__
    bool operator()(const int x) const { return (x % 2 == 0); }
};
int stencil2[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9};
thrust::device_vector<int> d_stencil2(stencil2, stencil2 + 10);
thrust::device_vector<int> d_output2(10, 7);
thrust::gather_if(thrust::device, d_map.begin(), d_map.end(), d_stencil2.begin(), d_values.begin(), d_output2.begin(), is_even());
// d_output2: {7, 7, 7, 7, 7, 7, 7, 7, 7, 7} -> {0, 7, 4, 7, 8, 7, 3, 7, 7, 7}
thrust::device_vector<int> d_output3(10, 7);
thrust::gather_if(d_map.begin(), d_map.end(), d_stencil2.begin(), d_values.begin(), d_output3.begin(), is_even());
// Same result as d_output2.
可以看出,增加了is_even()之后,是利用is_even()的规则对stencil中的值进行判断:如果符合条件,则拷贝对应位置的数据。例子中就是根据stencil中各个值的奇偶性来决定是否拷贝。
1.1.2 Scattering
1.1.2.1 scatter
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename RandomAccessIterator>
__host__ __device__ OutputIterator thrust::scatter(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
RandomAccessIterator result,
);
template<typename InputIterator1, typename InputIterator2, typename RandomAccessIterator>
OutputIterator thrust::scatter(
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
RandomAccessIterator result,
);
将源数据依次存放到目标中由map指定的位置,即 result[map[i]] = input[i](与gather的方向相反)。
例子:
// Source data to scatter. (Named `values` so it matches its use below;
// the declaration also needs its terminating semicolon.)
int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
thrust::device_vector<int> d_values(values, values + 10);
// map[i] is the destination index of values[i].
int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9};
thrust::device_vector<int> d_map(map, map + 10);
thrust::device_vector<int> d_output(10);
thrust::scatter(thrust::device, d_values.begin(), d_values.end(), d_map.begin(), d_output.begin());
// d_output: {0, 5, 1, 6, 2, 7, 3, 8, 4, 9}
thrust::device_vector<int> d_output1(10);
thrust::scatter(d_values.begin(), d_values.end(), d_map.begin(), d_output1.begin());
// Same result as d_output.
1.1.2.2 scatter_if
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename RandomAccessIterator>
__host__ __device__ OutputIterator thrust::scatter_if(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
InputIterator3 stencil,
RandomAccessIterator result
);
template<typename InputIterator1, typename InputIterator2, typename InputIterator3, typename RandomAccessIterator>
OutputIterator thrust::scatter_if(
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
InputIterator3 stencil,
RandomAccessIterator result
);
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename InputIterator3, typename RandomAccessIterator, typename Predicate>
__host__ __device__ OutputIterator thrust::scatter_if(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
InputIterator3 stencil,
RandomAccessIterator result,
Predicate pred
);
template<typename InputIterator1, typename InputIterator2, typename InputIterator3, typename RandomAccessIterator, typename Predicate>
OutputIterator thrust::scatter_if(
InputIterator1 first,
InputIterator1 last,
InputIterator2 map,
InputIterator3 stencil,
RandomAccessIterator result,
Predicate pred
);
scatter_if中的stencil和pred与gather_if中的作用类似,都是用来决定元素是否需要拷贝。
例子:
int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};  // source values
int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};          // destination index of each value
int S[8] = {1, 0, 1, 0, 1, 0, 1, 0};          // stencil
int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};
// Without a predicate, V[i] is written to D[M[i]] only where S[i] is non-zero.
thrust::scatter_if(thrust::host, V, V + 8, M, S, D);
// Equivalent call without an execution policy:
// thrust::scatter_if(V, V + 8, M, S, D);
// D : {0, 0, 0, 0, 0, 0, 0, 0} -> {10, 30, 50, 70, 0, 0, 0, 0}

// Predicate applied to each stencil element: true for even values.
struct is_even {
    __host__ __device__ bool operator()(int x) const {
        return (x % 2 == 0);
    }
};  // a struct definition must be terminated with ';'
int D1[8] = {0, 0, 0, 0, 0, 0, 0, 0};
is_even pred;
// With the predicate, V[i] is written to D1[M[i]] only where pred(S[i]) is true,
// i.e. where S[i] is even (here: the positions whose stencil value is 0).
thrust::scatter_if(thrust::host, V, V + 8, M, S, D1, pred);
// Equivalent call without an execution policy:
// thrust::scatter_if(V, V + 8, M, S, D1, pred);
// D1: {0, 0, 0, 0, 0, 0, 0, 0} -> {0, 0, 0, 0, 80, 20, 40, 60}
pred还是针对stencil中的值进行判断,如果符合条件才会对数据进行拷贝。
1.1.3 Copy
1.1.3.1 copy
// thrust::copy, overload taking an execution policy `exec`.
// Copies the elements of [first, last) to the range starting at result.
template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
__host__ __device__ OutputIterator thrust::copy(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
InputIterator first,
InputIterator last,
OutputIterator result
);
// Overload without an execution policy argument.
template<typename InputIterator, typename OutputIterator>
OutputIterator thrust::copy(
InputIterator first,
InputIterator last,
OutputIterator result
);
批量数据拷贝
例子:
// Source and destination vectors of the same size.
thrust::device_vector<int> vec0(100);
thrust::device_vector<int> vec1(100);
// Copy all of vec0 into vec1 using an explicit execution policy.
thrust::copy(thrust::device, vec0.begin(), vec0.end(), vec1.begin());
// Alternative call without an execution policy:
// thrust::copy(vec0.begin(), vec0.end(), vec1.begin());
// vec1 now holds the same contents as vec0.
这么操作完之后,vec1中的内容将和vec0中完全一致。
1.1.3.2 copy_n
// thrust::copy_n, overload taking an execution policy `exec`.
// Copies n elements starting at first to the range starting at result.
// The function is copy_n (not copy), and the count parameter has the
// template type `Size`.
template<typename DerivedPolicy, typename InputIterator, typename Size, typename OutputIterator>
__host__ __device__ OutputIterator thrust::copy_n(
    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
    InputIterator first,
    Size n,
    OutputIterator result
);
// Overload without an execution policy argument.
template<typename InputIterator, typename Size, typename OutputIterator>
OutputIterator thrust::copy_n(
    InputIterator first,
    Size n,
    OutputIterator result
);
批量拷贝n个数据
例子:
// Number of elements to copy.
size_t n = 100;
thrust::device_vector<int> vec0(n);
thrust::device_vector<int> vec1(n);
// copy_n takes a start iterator and an element count; thrust::copy would
// expect an end iterator as its second range argument instead.
thrust::copy_n(thrust::device, vec0.begin(), n, vec1.begin());
// Alternative call without an execution policy:
// thrust::copy_n(vec0.begin(), n, vec1.begin());
// vec1 now holds the same contents as vec0.
1.1.3.3 swap_ranges
// thrust::swap_ranges, overload taking an execution policy `exec`.
// Swaps each element of [first1, last1) with the corresponding element of the
// range starting at first2.
template<typename DerivedPolicy, typename ForwardIterator1, typename ForwardIterator2>
__host__ __device__ ForwardIterator2 thrust::swap_ranges(
const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
ForwardIterator1 first1,
ForwardIterator1 last1,
ForwardIterator2 first2
);
// Overload without an execution policy argument.
template<typename ForwardIterator1, typename ForwardIterator2>
ForwardIterator2 thrust::swap_ranges(
ForwardIterator1 first1,
ForwardIterator1 last1,
ForwardIterator2 first2
);
一定范围内数据进行交换
例子:
// Two device vectors of two elements each.
thrust::device_vector<int> v1(2), v2(2);
v1[0] = 1; v1[1] = 2;
v2[0] = 3; v2[1] = 4;
// Exchange the contents of v1 with the same-length range starting at v2.begin().
thrust::swap_ranges(thrust::device, v1.begin(), v1.end(), v2.begin());
// Alternative call without an execution policy:
// thrust::swap_ranges(v1.begin(), v1.end(), v2.begin());
// Afterwards: v1[0] = 3, v1[1] = 4, v2[0] = 1, v2[1] = 2
1.1.3.4 uninitialized_copy
// uninitialized_copy, overload taking an execution policy `exec`.
// Initializes the destination range starting at result from the source
// elements in [first, last).
template<typename DerivedPolicy, typename InputIterator, typename ForwardIterator>
__host__ __device__ ForwardIterator uninitialized_copy(
const thrust::detail::execution_policy_base< DerivedPolicy > & exec,
InputIterator first,
InputIterator last,
ForwardIterator result
);
// Overload without an execution policy argument.
template<typename InputIterator, typename ForwardIterator>
ForwardIterator uninitialized_copy(
InputIterator first,
InputIterator last,
ForwardIterator result
);
使用源数据对目标进行初始化
例子:
// Simple wrapper type with a converting constructor usable on host and device.
struct Int {
    __host__ __device__
    Int(int x) : val(x) {}
    int val;
};  // a struct definition must be terminated with ';'
const int N = 137;
Int val(46);
// Source: N copies of Int(46).
thrust::device_vector<Int> input(N, val);
// Destination: raw storage for N Int objects obtained from device_malloc.
thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
thrust::uninitialized_copy(thrust::device, input.begin(), input.end(), array);
// Alternative call without an execution policy:
// thrust::uninitialized_copy(input.begin(), input.end(), array);
// Int x = array[i]
// x.val == 46, for all 0 <= i < N
// Release the storage obtained from device_malloc once it is no longer needed.
thrust::device_free(array);
1.1.3.5 uninitialized_copy_n
// uninitialized_copy_n, overload taking an execution policy `exec`.
// Initializes n elements of the destination range starting at result from the
// source elements starting at first.
template<typename DerivedPolicy, typename InputIterator, typename Size, typename ForwardIterator>
__host__ __device__ ForwardIterator uninitialized_copy_n(
const thrust::detail::execution_policy_base< DerivedPolicy > & exec,
InputIterator first,
Size n,
ForwardIterator result
);
// Overload without an execution policy argument.
template<typename InputIterator, typename Size, typename ForwardIterator>
ForwardIterator uninitialized_copy_n(
InputIterator first,
Size n,
ForwardIterator result
);
使用源数据对目标中n个数据进行初始化
例子:
// Simple wrapper type with a converting constructor usable on host and device.
struct Int {
    __host__ __device__
    Int(int x) : val(x) {}
    int val;
};  // a struct definition must be terminated with ';'
const int N = 137;
Int val(46);
// Source: N copies of Int(46).
thrust::device_vector<Int> input(N, val);
// Destination: raw storage for N Int objects obtained from device_malloc.
thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);
// uninitialized_copy_n takes a start iterator and an element count;
// uninitialized_copy would expect an end iterator instead of N.
thrust::uninitialized_copy_n(thrust::device, input.begin(), N, array);
// Alternative call without an execution policy:
// thrust::uninitialized_copy_n(input.begin(), N, array);
// Int x = array[i]
// x.val == 46, for all 0 <= i < N
// Release the storage obtained from device_malloc once it is no longer needed.
thrust::device_free(array);
至此大概简单介绍了一下 copying内部常见的一些api,如果有不对的地方,欢迎指出。