Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 3e07bf5

Browse files
committed
Use CUB's new CDP macros.
1 parent 71fab9e commit 3e07bf5

25 files changed

+1025
-2042
lines changed

dependencies/cub

Submodule cub updated 49 files

thrust/system/cuda/config.h

+22-17
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,34 @@
3232
// older releases. This header will always pull in version info:
3333
#include <cub/util_namespace.cuh>
3434

35-
#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
36-
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
37-
# define __THRUST_HAS_CUDART__ 1
38-
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
39-
# else
40-
# define __THRUST_HAS_CUDART__ 0
41-
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
42-
# endif
43-
#else
44-
# define __THRUST_HAS_CUDART__ 0
45-
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
46-
#endif
47-
4835
// These definitions were intended for internal use only and are now obsolete.
4936
// If you relied on them, consider porting your code to use the functionality
50-
// in libcu++'s <nv/target> header.
37+
// in libcu++'s <nv/target> header (for `THRUST_DEVICE_CODE`) or the new CDP
38+
// macros in CUB's <cub/detail/detect_cuda_runtime.cuh> header (for
39+
// `__THRUST_HAS_CUDART__` and `THRUST_RUNTIME_FUNCTION`).
40+
//
5141
// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
5242
// them available again. These should be considered deprecated and will be
5343
// fully removed in a future version.
5444
#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
55-
#ifdef __CUDA_ARCH__
56-
#define THRUST_DEVICE_CODE
57-
#endif // __CUDA_ARCH__
45+
46+
# if defined(__CUDACC__) || defined(_NVHPC_CUDA)
47+
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
48+
# define __THRUST_HAS_CUDART__ 1
49+
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
50+
# else
51+
# define __THRUST_HAS_CUDART__ 0
52+
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
53+
# endif
54+
# else
55+
# define __THRUST_HAS_CUDART__ 0
56+
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
57+
# endif
58+
59+
# ifdef __CUDA_ARCH__
60+
# define THRUST_DEVICE_CODE
61+
# endif // __CUDA_ARCH__
62+
5863
#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
5964

6065
#ifdef THRUST_AGENT_ENTRY_NOINLINE

thrust/system/cuda/detail/adjacent_difference.h

+18-25
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,14 @@
2929
#include <thrust/detail/config.h>
3030

3131
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
32+
3233
#include <thrust/detail/cstdint.h>
3334
#include <thrust/detail/minmax.h>
3435
#include <thrust/detail/temporary_array.h>
3536
#include <thrust/detail/type_traits.h>
3637
#include <thrust/functional.h>
3738
#include <thrust/system/cuda/config.h>
39+
#include <thrust/system/cuda/detail/cdp_dispatch.h>
3840
#include <thrust/system/cuda/detail/dispatch.h>
3941
#include <thrust/system/cuda/detail/par_to_seq.h>
4042
#include <thrust/system/cuda/detail/util.h>
@@ -64,7 +66,7 @@ namespace __adjacent_difference {
6466
class InputIt,
6567
class OutputIt,
6668
class BinaryOp>
67-
cudaError_t THRUST_RUNTIME_FUNCTION
69+
cudaError_t CUB_CDP_FUNCTION
6870
doit_step(void *d_temp_storage,
6971
size_t &temp_storage_bytes,
7072
InputIt first,
@@ -114,7 +116,7 @@ namespace __adjacent_difference {
114116
template <class InputIt,
115117
class OutputIt,
116118
class BinaryOp>
117-
cudaError_t THRUST_RUNTIME_FUNCTION
119+
cudaError_t CUB_CDP_FUNCTION
118120
doit_step(void *d_temp_storage,
119121
size_t &temp_storage_bytes,
120122
InputIt first,
@@ -139,7 +141,7 @@ namespace __adjacent_difference {
139141
template <class InputIt,
140142
class OutputIt,
141143
class BinaryOp>
142-
cudaError_t THRUST_RUNTIME_FUNCTION
144+
cudaError_t CUB_CDP_FUNCTION
143145
doit_step(void *d_temp_storage,
144146
size_t &temp_storage_bytes,
145147
InputIt first,
@@ -181,7 +183,7 @@ namespace __adjacent_difference {
181183
typename InputIt,
182184
typename OutputIt,
183185
typename BinaryOp>
184-
OutputIt THRUST_RUNTIME_FUNCTION
186+
OutputIt CUB_CDP_FUNCTION
185187
adjacent_difference(execution_policy<Derived>& policy,
186188
InputIt first,
187189
InputIt last,
@@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
260262
OutputIt result,
261263
BinaryOp binary_op)
262264
{
263-
OutputIt ret = result;
264-
if (__THRUST_HAS_CUDART__)
265-
{
266-
ret = __adjacent_difference::adjacent_difference(policy,
267-
first,
268-
last,
269-
result,
270-
binary_op);
271-
}
272-
else
273-
{
274-
#if !__THRUST_HAS_CUDART__
275-
ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
276-
first,
277-
last,
278-
result,
279-
binary_op);
280-
#endif
281-
}
282-
283-
return ret;
265+
THRUST_CDP_DISPATCH(
266+
(result = __adjacent_difference::adjacent_difference(policy,
267+
first,
268+
last,
269+
result,
270+
binary_op);),
271+
(result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
272+
first,
273+
last,
274+
result,
275+
binary_op);));
276+
return result;
284277
}
285278

286279
template <class Derived,
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright 2021-2022 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
/**
18+
* \file
19+
* Utilities for CUDA dynamic parallelism.
20+
*/
21+
22+
#pragma once
23+
24+
#include <cub/config.cuh>
25+
#include <cub/detail/detect_cuda_runtime.cuh>
26+
27+
#include <nv/target>
28+
29+
/**
30+
* \def THRUST_CDP_DISPATCH
31+
*
32+
* If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
33+
* run the parallel implementation. Otherwise, run the parallel implementation
34+
* when called from the host, and fall back to the sequential implementation on
35+
* the device.
36+
*
37+
* `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
38+
* parentheses, similar to NV_IF_TARGET blocks:
39+
*
40+
* \code
41+
* THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
42+
* \endcode
43+
*/
44+
45+
#ifdef CUB_CDP_ENABLED
46+
47+
// seq_impl unused.
48+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
49+
NV_IF_TARGET(NV_ANY_TARGET, par_impl)
50+
51+
#else // CUB_CDP_ENABLED
52+
53+
// Special case for NVCC -- need to inform the device path about the kernels
54+
// that are launched from the host path.
55+
#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
56+
57+
// Device-side launch not supported; fall back to sequential in device code.
58+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
59+
if (false) \
60+
{ /* Without this, the device pass won't compile any kernels. */ \
61+
NV_IF_TARGET(NV_ANY_TARGET, par_impl); \
62+
} \
63+
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
64+
65+
#else // NVCC device pass
66+
67+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
68+
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
69+
70+
#endif // NVCC device pass
71+
72+
#endif // CUB_CDP_ENABLED

thrust/system/cuda/detail/copy.h

+16-29
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828

2929
#include <thrust/detail/config.h>
3030

31+
#include <thrust/advance.h>
32+
3133
#include <thrust/system/cuda/config.h>
34+
#include <thrust/system/cuda/detail/cdp_dispatch.h>
3235
#include <thrust/system/cuda/detail/execution_policy.h>
3336
#include <thrust/system/cuda/detail/cross_system.h>
3437

@@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
117120
InputIterator last,
118121
OutputIterator result)
119122
{
120-
OutputIterator ret = result;
121-
if (__THRUST_HAS_CUDART__)
122-
{
123-
ret = __copy::device_to_device(system, first, last, result);
124-
}
125-
else
126-
{
127-
#if !__THRUST_HAS_CUDART__
128-
ret = thrust::copy(cvt_to_seq(derived_cast(system)),
129-
first,
130-
last,
131-
result);
132-
#endif
133-
}
134-
135-
return ret;
123+
THRUST_CDP_DISPATCH(
124+
(result = __copy::device_to_device(system, first, last, result);),
125+
(result =
126+
thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
127+
return result;
136128
} // end copy()
137129

138130
__thrust_exec_check_disable__
@@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
146138
Size n,
147139
OutputIterator result)
148140
{
149-
OutputIterator ret = result;
150-
if (__THRUST_HAS_CUDART__)
151-
{
152-
ret = __copy::device_to_device(system, first, first + n, result);
153-
}
154-
else
155-
{
156-
#if !__THRUST_HAS_CUDART__
157-
ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
158-
#endif
159-
}
160-
161-
return ret;
141+
THRUST_CDP_DISPATCH(
142+
(result = __copy::device_to_device(system,
143+
first,
144+
thrust::next(first, n),
145+
result);),
146+
(result =
147+
thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
148+
return result;
162149
} // end copy_n()
163150
#endif
164151

0 commit comments

Comments
 (0)