Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved GPU Copy #1976

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
feea97f
Started with the implementation of a better copy, but I have to fix i…
philip-paul-mueller Mar 14, 2025
9b49c9e
Made a first version of the new copy implementation.
philip-paul-mueller Mar 14, 2025
d0a396f
Updated the memlet copying, I think I now have all the cases will now…
philip-paul-mueller Mar 14, 2025
c931b91
Now 2D copies works, more tests needed.
philip-paul-mueller Mar 14, 2025
a67ad2a
Added now also test for testing strided 1d copy.
philip-paul-mueller Mar 14, 2025
61ea7a6
Added a new test to the SDFG.
philip-paul-mueller Mar 14, 2025
322ecda
Improved memlet checking.
philip-paul-mueller Mar 14, 2025
0b15a74
Added a note about wrong usage of eid in validation.
philip-paul-mueller Mar 14, 2025
76a1a58
Added a new test for the pseudo 1d case, i.e. when we reduce a copy 2…
philip-paul-mueller Mar 14, 2025
66b43f8
Simplified some check.
Mar 15, 2025
801adb1
Added more verification.
Mar 15, 2025
02d87b5
Merge remote-tracking branch 'spcl/main' into improved-2d-copy
Mar 15, 2025
3166302
Fixed some issue and made it more logical.
Mar 15, 2025
2801967
I am not sure why the printout of the edge is not correct, but it is …
philip-paul-mueller Mar 17, 2025
51182e5
Moved the test for negative sized subsets from the Memlet to the `vai…
philip-paul-mueller Mar 17, 2025
065e0d7
Added tests to ensure that the new verification works as expected.
philip-paul-mueller Mar 17, 2025
b0b9945
Added Alexnicks's suggestions.
philip-paul-mueller Mar 21, 2025
ba97874
Merge remote-tracking branch 'spcl/main' into improved-2d-copy
philip-paul-mueller Mar 21, 2025
e5bf87f
Added a comment to address the possible issues with viewes.
philip-paul-mueller Mar 24, 2025
aef9945
As an experiment removed some code I think is useless, let's see what…
philip-paul-mueller Mar 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 95 additions & 53 deletions dace/codegen/targets/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,35 +161,40 @@ def preprocess(self, sdfg: SDFG) -> None:
nsdfg = state.parent
if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):

# NOTE: If possible `memlet_copy_to_absolute_strides()` will collapse a
# ND copy into a 1D copy if the memory is contiguous. In that case
# `copy_shape` will only have one element.
copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
None, nsdfg, state, e, e.src, e.dst)
dims = len(copy_shape)

# Skip supported copy types
if dims == 1:
# NOTE: We do not check if the stride is `1`. See `_emit_copy()` for more.
continue
elif dims == 2:
if src_strides[-1] != 1 or dst_strides[-1] != 1:
# NOTE: Special case of continuous copy
# Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
# with copy shape [I, J] and strides [J*K, K], [J, 1]
try:
is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
except (TypeError, ValueError):
is_src_cont = False
is_dst_cont = False
if is_src_cont and is_dst_cont:
continue
else:
# Because `memlet_copy_to_absolute_strides()` handles contiguous copies
# transparently, we only have to check if we have FORTRAN or C order.
# If we do not have them, then we have to turn this into a Map.
is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1
if is_c_order or is_fortran_order:
continue
elif dims > 2:
if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
# Any higher dimensional copies must be C order. If not turn it
# into a copy map.
if src_strides[-1] == 1 and dst_strides[-1] == 1:
continue

# Turn unsupported copy to a map
try:
CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
CopyToMap.apply_to(nsdfg,
save=False,
annotate=False,
a=e.src,
b=e.dst,
options={"ignore_strides": True})
except ValueError: # If transformation doesn't match, continue normally
continue

Expand Down Expand Up @@ -973,32 +978,21 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types))
dims = len(copy_shape)

dtype = dst_node.desc(sdfg).dtype

# Handle unsupported copy types
if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1):
# NOTE: Special case of continuous copy
# Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
# with copy shape [I, J] and strides [J*K, K], [J, 1]
try:
is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
except (TypeError, ValueError):
is_src_cont = False
is_dst_cont = False
if is_src_cont and is_dst_cont:
dims = 1
copy_shape = [copy_shape[0] * copy_shape[1]]
src_strides = [src_strides[1]]
dst_strides = [dst_strides[1]]
else:
raise NotImplementedError('2D copy only supported with one stride')
# In 1D there is no difference between FORTRAN or C order, thus we will set them
# to the same value. The value indicates if the stride is `1`
# TODO: Figuring out if this is enough for views.
is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1

# Currently we only support ND copies when they can be represented
# as a 1D copy or as a 2D strided copy
if dims > 2:
if src_strides[-1] != 1 or dst_strides[-1] != 1:
# Currently we only support ND copies when they can be represented
# as a 1D copy or as a 2D strided copy
# NOTE: Not sure if this test is enough, it should also be tested that
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To what are you agreeing on?

  • That you are not sure what this code is doing.
  • That you agree that it is meaningless and can be removed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That the test is not enough. it doesn't consider, e.g., Views where you can have multiple discontinuities, and I guess it is like that because the original code predates Views. I also think it is fine to leave it as-is for now, but it should be noted down as a possible source of errors.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why people give this code the impression that it is so old. It's from March 2022, it definitely does not predate views: #961

# they are ordered, i.e. largest stride on the left.
if not is_c_order:
# TODO: Implement the FORTRAN case.
raise NotImplementedError(
'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n'
f' Nodes: src {src_node} ({src_storage}), dst {dst_node}({dst_storage})\n'
Expand Down Expand Up @@ -1026,7 +1020,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
for d in range(dims - 2):
callsite_stream.write("}")

if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1):
elif dims == 1 and is_c_order:
# A 1D copy, in which the stride is 1, known at code generation time.
copysize = ' * '.join(_topy(copy_shape))
array_length = copysize
copysize += ' * sizeof(%s)' % dtype.ctype
Expand Down Expand Up @@ -1064,22 +1059,70 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
backend=self.backend), cfg,
state_id, [src_node, dst_node])
callsite_stream.write('}')
elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)):

elif dims == 1 and not is_c_order:
# This is the case that generated for expressions such as `A[::3]`, we reduce it
# to a 2D copy.
callsite_stream.write(
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
.format(
backend=self.backend,
dst=dst_expr,
dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
src=src_expr,
src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
width=f'sizeof({dst_node.desc(sdfg).dtype.ctype})',
height=sym2cpp(copy_shape[0]),
kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
stream=cudastream,
),
cfg,
state_id,
[src_node, dst_node],
)

elif dims == 2 and is_c_order:
# Copying a 2D array that are in C order, i.e. last stride is 1.
callsite_stream.write(
'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
(self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
[src_node, dst_node])
elif dims == 2:
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
.format(
backend=self.backend,
dst=dst_expr,
dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
src=src_expr,
src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
width=f'({sym2cpp(copy_shape[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
height=sym2cpp(copy_shape[0]),
kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
stream=cudastream,
),
cfg,
state_id,
[src_node, dst_node],
)
elif dims == 2 and is_fortran_order:
# Copying a 2D array into a 2D array that is in FORTRAN order, i.e. first stride
# is one. The CUDA API can not handle such cases directly, however, by "transposing"
# it is possible to use `Memcyp2DAsync`.
callsite_stream.write(
'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
(self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
[src_node, dst_node])
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
.format(
backend=self.backend,
dst=dst_expr,
dst_stride=f'({_topy(dst_strides[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
src=src_expr,
src_stride=f'({sym2cpp(src_strides[1])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
width=f'({sym2cpp(copy_shape[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
height=sym2cpp(copy_shape[1]),
kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
stream=cudastream,
),
cfg,
state_id,
[src_node, dst_node],
)
else:
raise NotImplementedError("The requested copy operation is not implemented.")

# Post-copy synchronization
if is_sync:
Expand Down Expand Up @@ -1126,7 +1169,6 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
# Obtain copy information
copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types))

dims = len(copy_shape)

funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims)
Expand Down
16 changes: 13 additions & 3 deletions dace/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ def _validate(self):
if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape):
raise TypeError('Shape must be a list or tuple of integer values '
'or symbols')
if any((shp < 0) == True for shp in self.shape):
raise TypeError(f'Found negative shape in Data, its shape was {self.shape}')
return True

def to_json(self):
Expand Down Expand Up @@ -1471,12 +1473,20 @@ def validate(self):
super(Array, self).validate()
if len(self.strides) != len(self.shape):
raise TypeError('Strides must be the same size as shape')
if len(self.offset) != len(self.shape):
raise TypeError('Offset must be the same size as shape')

if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.strides):
raise TypeError('Strides must be a list or tuple of integer values or symbols')

if len(self.offset) != len(self.shape):
raise TypeError('Offset must be the same size as shape')
if any(not isinstance(off, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
for off in self.offset):
raise TypeError('Offset must be a list or tuple of integer values or symbols')

# Actually it would be enough to only enforce the non negativity only if the shape is larger than one.
if any((stride < 0) == True for stride in self.strides):
raise TypeError(f'Found negative strides in array, they were {self.strides}')
if (self.total_size < 0) == True:
raise TypeError(f'The total size of an array must be positive but it was negative {self.total_size}')

def covers_range(self, rng):
if len(rng) != len(self.shape):
Expand Down
3 changes: 3 additions & 0 deletions dace/memlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,9 @@ def dst_subset(self, new_dst_subset):
def validate(self, sdfg, state):
if self.data is not None and self.data not in sdfg.arrays:
raise KeyError('Array "%s" not found in SDFG' % self.data)
# NOTE: We do not check here is the subsets have a negative size, because such as subset
# is valid, in certain cases, for example if an AccessNode is connected to a MapEntry,
# because the Map is not executed. Thus we do the check in the `validate_state()` function.

def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]:
"""
Expand Down
24 changes: 22 additions & 2 deletions dace/sdfg/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import networkx as nx

from dace import dtypes, subsets, symbolic
from dace import dtypes, subsets, symbolic, data
from dace.dtypes import DebugInfo

if TYPE_CHECKING:
Expand Down Expand Up @@ -656,7 +656,6 @@ def validate_state(state: 'dace.sdfg.SDFGState',
)
########################################

# Memlet checks
for eid, e in enumerate(state.edges()):
# Reference check
if id(e) in references:
Expand All @@ -680,6 +679,27 @@ def validate_state(state: 'dace.sdfg.SDFGState',
except Exception as ex:
raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid)

# If the edge is a connection between two AccessNodes check if the subset has negative size.
# NOTE: We _should_ do this check in `Memlet.validate()` however, this is not possible,
# because the connection between am AccessNode and a MapEntry, with a negative size, is
# legal because, the Map will not run in that case. However, this constellation can not
# be tested for in the Memlet's validation function, so we have to do it here.
# NOTE: Zero size is explicitly allowed because it is essentially `memcpy(dst, src, 0)`
# which is save.
# TODO: The AN to AN connection is the most obvious one, but it should be extended.
if isinstance(e.src, nd.AccessNode) and isinstance(e.dst, nd.AccessNode):
e_memlet: dace.Memlet = e.data
if e_memlet.subset is not None:
if any((ss < 0) == True for ss in e_memlet.subset.size()):
raise InvalidSDFGEdgeError(
f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.subset.size()}',
sdfg, state_id, eid)
if e_memlet.other_subset is not None:
if any((ss < 0) == True for ss in e_memlet.other_subset.size()):
raise InvalidSDFGEdgeError(
f'`other_subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.other_subset.size()}',
sdfg, state_id, eid)

# For every memlet, obtain its full path in the DFG
path = state.memlet_path(e)
src_node = path[0].src
Expand Down
Loading