Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 967924e

Browse files
authored
Merge pull request #1661 from allisonvacanti/if_target_cdp
Update CDP support macros for if-target compatibility
2 parents f302e6a + cb30a6b commit 967924e

25 files changed

+952
-1917
lines changed

dependencies/cub

Submodule cub updated 38 files

testing/cmake/check_source_files.cmake

+33
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,24 @@ if (NOT valid_count EQUAL 5)
8484
"Matched ${valid_count} times, expected 5.")
8585
endif()
8686

87+
################################################################################
88+
# Legacy macro checks.
89+
# Check all files in Thrust to make sure that they aren't using the legacy
90+
# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
91+
#
92+
# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
93+
# They are provided for legacy purposes and should be replaced with
94+
# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
95+
#
96+
#
97+
set(legacy_macro_header_exclusions
98+
# This header defines a legacy CUDART macro:
99+
thrust/system/cuda/config.h
100+
)
101+
102+
set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
103+
set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")
104+
87105
################################################################################
88106
# Read source files:
89107
foreach(src ${thrust_srcs})
@@ -145,6 +163,21 @@ foreach(src ${thrust_srcs})
145163
set(found_errors 1)
146164
endif()
147165
endif()
166+
167+
if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
168+
count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
169+
count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)
170+
171+
if (NOT thrust_count EQUAL 0)
172+
message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
173+
set(found_errors 1)
174+
endif()
175+
176+
if (NOT cub_count EQUAL 0)
177+
message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
178+
set(found_errors 1)
179+
endif()
180+
endif()
148181
endforeach()
149182

150183
if (NOT found_errors EQUAL 0)

testing/cuda/CMakeLists.txt

+10-6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ file(GLOB test_srcs
66

77
# These tests always build with RDC, so make sure that the sm_XX flags are
88
# compatible. See note in ThrustCudaConfig.cmake.
9+
# TODO once we're using CUDA_ARCHITECTURES, we can set up non-rdc fallback
10+
# tests to build for non-rdc arches. But for now, all files in a given directory
11+
# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
12+
# how CUDA_FLAGS works.
913
set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
1014

1115
foreach(thrust_target IN LISTS THRUST_TARGETS)
@@ -18,11 +22,11 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
1822
get_filename_component(test_name "${test_src}" NAME_WLE)
1923
string(PREPEND test_name "cuda.")
2024

21-
thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
22-
23-
# All in testing/cuda will test device-side launch (aka calling parallel
24-
# algorithms from device code), which requires the CUDA device-side runtime,
25-
# which requires RDC, so these always need to be built with RDC.
26-
thrust_enable_rdc_for_cuda_target(${test_target})
25+
# Create two targets, one with RDC enabled, the other without. This tests
26+
# both device-side behaviors -- the CDP kernel launch with RDC, and the
27+
# serial fallback path without RDC.
28+
thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target})
29+
thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target})
30+
thrust_enable_rdc_for_cuda_target(${cdp_test_target})
2731
endforeach()
2832
endforeach()

thrust/system/cuda/config.h

+35-10
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,47 @@
3232
// older releases. This header will always pull in version info:
3333
#include <cub/util_namespace.cuh>
3434

35-
#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
36-
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
37-
# define __THRUST_HAS_CUDART__ 1
38-
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
39-
# else
40-
# define __THRUST_HAS_CUDART__ 0
41-
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
42-
# endif
35+
#include <cub/detail/detect_cuda_runtime.cuh>
36+
37+
/**
38+
* \def THRUST_RUNTIME_FUNCTION
39+
*
40+
* Execution space for functions that can use the CUDA runtime API (`__host__`
41+
* when RDC is off, `__host__ __device__` when RDC is on).
42+
*/
43+
#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
44+
45+
/**
46+
* \def THRUST_RDC_ENABLED
47+
*
48+
* Defined if RDC is enabled.
49+
*/
50+
#ifdef CUB_RDC_ENABLED
51+
#define THRUST_RDC_ENABLED
52+
#endif
53+
54+
/**
55+
* \def __THRUST_HAS_CUDART__
56+
*
57+
* Whether or not the active compiler pass is allowed to invoke device kernels
58+
* or methods from the CUDA runtime API.
59+
*
60+
* This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__`
61+
* and is not compatible with `NV_IF_TARGET`. It is provided for legacy
62+
* purposes only.
63+
*
64+
* Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`.
65+
*/
66+
#ifdef CUB_RUNTIME_ENABLED
67+
#define __THRUST_HAS_CUDART__ 1
4368
#else
44-
# define __THRUST_HAS_CUDART__ 0
45-
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
69+
#define __THRUST_HAS_CUDART__ 0
4670
#endif
4771

4872
// These definitions were intended for internal use only and are now obsolete.
4973
// If you relied on them, consider porting your code to use the functionality
5074
// in libcu++'s <nv/target> header.
75+
//
5176
// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
5277
// them available again. These should be considered deprecated and will be
5378
// fully removed in a future version.

thrust/system/cuda/detail/adjacent_difference.h

+14-21
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,14 @@
2929
#include <thrust/detail/config.h>
3030

3131
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
32+
3233
#include <thrust/detail/cstdint.h>
3334
#include <thrust/detail/minmax.h>
3435
#include <thrust/detail/temporary_array.h>
3536
#include <thrust/detail/type_traits.h>
3637
#include <thrust/functional.h>
3738
#include <thrust/system/cuda/config.h>
39+
#include <thrust/system/cuda/detail/cdp_dispatch.h>
3840
#include <thrust/system/cuda/detail/dispatch.h>
3941
#include <thrust/system/cuda/detail/par_to_seq.h>
4042
#include <thrust/system/cuda/detail/util.h>
@@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
260262
OutputIt result,
261263
BinaryOp binary_op)
262264
{
263-
OutputIt ret = result;
264-
if (__THRUST_HAS_CUDART__)
265-
{
266-
ret = __adjacent_difference::adjacent_difference(policy,
267-
first,
268-
last,
269-
result,
270-
binary_op);
271-
}
272-
else
273-
{
274-
#if !__THRUST_HAS_CUDART__
275-
ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
276-
first,
277-
last,
278-
result,
279-
binary_op);
280-
#endif
281-
}
282-
283-
return ret;
265+
THRUST_CDP_DISPATCH(
266+
(result = __adjacent_difference::adjacent_difference(policy,
267+
first,
268+
last,
269+
result,
270+
binary_op);),
271+
(result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
272+
first,
273+
last,
274+
result,
275+
binary_op);));
276+
return result;
284277
}
285278

286279
template <class Derived,
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright 2021-2022 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
/**
18+
* \file
19+
* Utilities for CUDA dynamic parallelism.
20+
*/
21+
22+
#pragma once
23+
24+
#include <cub/config.cuh>
25+
#include <cub/detail/detect_cuda_runtime.cuh>
26+
27+
#include <nv/target>
28+
29+
/**
30+
* \def THRUST_CDP_DISPATCH
31+
*
32+
* If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
33+
* run the parallel implementation. Otherwise, run the parallel implementation
34+
* when called from the host, and fall back to the sequential implementation on
35+
* the device.
36+
*
37+
* `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
38+
* parentheses, similar to NV_IF_TARGET blocks:
39+
*
40+
* \code
41+
* THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
42+
* \endcode
43+
*/
44+
45+
#ifdef THRUST_RDC_ENABLED
46+
47+
// seq_impl unused.
48+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
49+
NV_IF_TARGET(NV_ANY_TARGET, par_impl)
50+
51+
#else // THRUST_RDC_ENABLED
52+
53+
// Special case for NVCC -- need to inform the device path about the kernels
54+
// that are launched from the host path.
55+
#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
56+
57+
// Device-side launch not supported, fall back to sequential in device code.
58+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
59+
if (false) \
60+
{ /* Without this, the device pass won't compile any kernels. */ \
61+
NV_IF_TARGET(NV_ANY_TARGET, par_impl); \
62+
} \
63+
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
64+
65+
#else // NVCC device pass
66+
67+
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
68+
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
69+
70+
#endif // NVCC device pass
71+
72+
#endif // THRUST_RDC_ENABLED

thrust/system/cuda/detail/copy.h

+16-29
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828

2929
#include <thrust/detail/config.h>
3030

31+
#include <thrust/advance.h>
32+
3133
#include <thrust/system/cuda/config.h>
34+
#include <thrust/system/cuda/detail/cdp_dispatch.h>
3235
#include <thrust/system/cuda/detail/execution_policy.h>
3336
#include <thrust/system/cuda/detail/cross_system.h>
3437

@@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
117120
InputIterator last,
118121
OutputIterator result)
119122
{
120-
OutputIterator ret = result;
121-
if (__THRUST_HAS_CUDART__)
122-
{
123-
ret = __copy::device_to_device(system, first, last, result);
124-
}
125-
else
126-
{
127-
#if !__THRUST_HAS_CUDART__
128-
ret = thrust::copy(cvt_to_seq(derived_cast(system)),
129-
first,
130-
last,
131-
result);
132-
#endif
133-
}
134-
135-
return ret;
123+
THRUST_CDP_DISPATCH(
124+
(result = __copy::device_to_device(system, first, last, result);),
125+
(result =
126+
thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
127+
return result;
136128
} // end copy()
137129

138130
__thrust_exec_check_disable__
@@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
146138
Size n,
147139
OutputIterator result)
148140
{
149-
OutputIterator ret = result;
150-
if (__THRUST_HAS_CUDART__)
151-
{
152-
ret = __copy::device_to_device(system, first, first + n, result);
153-
}
154-
else
155-
{
156-
#if !__THRUST_HAS_CUDART__
157-
ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
158-
#endif
159-
}
160-
161-
return ret;
141+
THRUST_CDP_DISPATCH(
142+
(result = __copy::device_to_device(system,
143+
first,
144+
thrust::next(first, n),
145+
result);),
146+
(result =
147+
thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
148+
return result;
162149
} // end copy_n()
163150
#endif
164151

0 commit comments

Comments
 (0)