接上篇文章，可以发现使用CUDA提供的API进行前缀和扫描时，第一次运行的时间不如共享内存访问，猜测是使用到了全局内存。首先看调用逻辑：
thrust::inclusive_scan(thrust::device, d_x, d_x + N, d_x);
第一个参数指定了设备，根据实参数量和类型找到对应的函数，是scan.h中的如下函数：
template <typename DerivedPolicy, typename InputIterator, typename OutputIterator>
_CCCL_HOST_DEVICE OutputIterator inclusive_scan(
  const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
  InputIterator first,
  InputIterator last,
  OutputIterator result);
其实现位于thrust\thrust\system\cuda\detail\scan.h（注意路径可能与实际有偏差，可以在/usr/local/下使用find . -name xx查找对应的文件）：
template <typename Derived, typename InputIt, typename OutputIt>
_CCCL_HOST_DEVICE OutputIt
inclusive_scan(thrust::cuda_cub::execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
{
  return thrust::cuda_cub::inclusive_scan(policy, first, last, result, thrust::plus<>{});
}
将操作指定为plus，然后执行同一文件下的此函数：
template <typename Derived, typename InputIt, typename OutputIt, typename ScanOp>
_CCCL_HOST_DEVICE OutputIt inclusive_scan(
  thrust::cuda_cub::execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, ScanOp scan_op)
{
  using diff_t              = typename thrust::iterator_traits<InputIt>::difference_type;
  diff_t const num_items    = thrust::distance(first, last);
  return thrust::cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op);
}
最终找到主要的执行逻辑：
_CCCL_EXEC_CHECK_DISABLE
template <typename Derived, typename InputIt, typename Size, typename OutputIt, typename ScanOp>
_CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl(
  thrust::cuda_cub::execution_policy<Derived>& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op)
{
  using AccumT     = typename thrust::iterator_traits<InputIt>::value_type;
  using Dispatch32 = cub::DispatchScan<InputIt, OutputIt, ScanOp, cub::NullType, std::int32_t, AccumT>;
  using Dispatch64 = cub::DispatchScan<InputIt, OutputIt, ScanOp, cub::NullType, std::int64_t, AccumT>;

  cudaStream_t stream = thrust::cuda_cub::stream(policy);
  cudaError_t status;

  // Determine temporary storage requirements:
  size_t tmp_size = 0;
  {
    THRUST_INDEX_TYPE_DISPATCH2(
      status,
      Dispatch32::Dispatch,
      Dispatch64::Dispatch,
      num_items,
      (nullptr, tmp_size, first, result, scan_op, cub::NullType{}, num_items_fixed, stream));
    thrust::cuda_cub::throw_on_error(
      status, "after determining tmp storage requirements for inclusive_scan");
  }

  // Run scan:
  {
    // Allocate temporary storage:
    thrust::detail::temporary_array<std::uint8_t, Derived> tmp{policy, tmp_size};
    THRUST_INDEX_TYPE_DISPATCH2(
      status,
      Dispatch32::Dispatch,
      Dispatch64::Dispatch,
      num_items,
      (tmp.data().get(), tmp_size, first, result, scan_op, cub::NullType{}, num_items_fixed, stream));
    thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel");
    thrust::cuda_cub::throw_on_error(
      thrust::cuda_cub::synchronize_optional(policy), "inclusive_scan failed to synchronize");
  }

  return result + num_items;
}
可以看到此处thrust调用了cub的DispatchScan操作，而cub中是使用全局内存的，因此造成了效率还不如手动编写使用共享内存的算法。