Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This release is compatible with NumPy 2.4.5.
* Updated tests to align with NumPy 2.4.5 compatibility [gh-2920](https://github.com/IntelPython/dpnp/pull/2920)
* Replaced `.pxi` includes in `dpnp.tensor` with modular `.pxd`/`.pyx` Cython imports [#2913](https://github.com/IntelPython/dpnp/pull/2913)
* Reimplemented `dpnp.eye` and `dpnp.tensor.eye` with a branchless kernel [gh-2937](https://github.com/IntelPython/dpnp/pull/2937)
* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939)

### Deprecated

Expand Down
21 changes: 18 additions & 3 deletions dpnp/fft/dpnp_utils_fft.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,12 +408,27 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
a = dpnp.reshape(a, local_shape)
index = 1

if not a.flags.c_contiguous:
# cuFFT requires input arrays to be C-contiguous (row-major)
# for correct execution
if (
dpnp.is_cuda_backend(a) and not a.flags.c_contiguous
): # pragma: no cover
if dpnp.is_cuda_backend(a): # pragma: no cover

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously there was no copy for the CUDA branch if batch_fft=False.
I guess it's an intended change, based on the above comment that C-contiguity is a requirement for cuFFT without any exception. If so, we need to at least update the PR description to mention that.

a = dpnp.ascontiguousarray(a)
else:
# Check if the memory footprint of the strides exceeds
# the number of elements.
# If so, copy to contiguous to avoid oversized allocation
# for the output array and unnecessary copy to contiguous
# after oneMKL FFT
_strides = dpnp.get_usm_ndarray(a).strides

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to call get_usm_ndarray here?

_shape = a.shape
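# Max element displacement reachable by the strides.
# Negative strides are handled by _copy_array, so only
# non-negative strides are possible here; zero strides (e.g. from
# broadcasting) add no displacement and are skipped by the `st > 0` filter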

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is another regression also, which can be handled separately:

import dpnp as np

a = np.arange(4, dtype='c8')
b = np.broadcast_to(a, (3, 4))

np.fft.fft(b, axis=0)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 1
----> 1 np.fft.fft(b, axis=0)

File ~/code/dpnp/dpnp/fft/dpnp_iface_fft.py:122, in fft(a, n, axis, norm, out)
     47 """
     48 Compute the one-dimensional discrete Fourier Transform.
     49
   (...)    118
    119 """
    121 dpnp.check_supported_arrays_type(a)
--> 122 return dpnp_fft(
    123     a, forward=True, real=False, n=n, axis=axis, norm=norm, out=out
    124 )

File ~/code/dpnp/dpnp/fft/dpnp_utils_fft.py:664, in dpnp_fft(a, forward, real, n, axis, norm, out)
    658 if c2r:
    659     # input array should be Hermitian for c2r FFT
    660     a = _make_array_hermitian(
    661         a, axis, dpnp.are_same_logical_tensors(a, a_orig)
    662     )
--> 664 return _fft(
    665     a,
    666     norm=norm,
    667     out=out,
    668     forward=forward,
    669     # TODO: currently in-place is only implemented for c2c, see SAT-7154
    670     in_place=in_place and c2c,
    671     c2c=c2c,
    672     axes=axis,
    673     batch_fft=a_ndim != 1,
    674 )

File ~/code/dpnp/dpnp/fft/dpnp_utils_fft.py:447, in _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft)
    443 a_strides = _standardize_strides_to_nonzero(strides, a.shape)
    444 dsc, out_strides = _commit_descriptor(
    445     a, forward, in_place, c2c, a_strides, index, batch_fft
    446 )
--> 447 res = _compute_result(dsc, a, out, forward, c2c, out_strides)
    448 res = _scale_result(res, a.shape, norm, forward, index)
    450 # Revert swapped axes

File ~/code/dpnp/dpnp/fft/dpnp_utils_fft.py:239, in _compute_result(dsc, a, out, forward, c2c, out_strides)
    231         result = dpnp_array(
    232             out_shape,
    233             dtype=out_dtype,
   (...)    236             sycl_queue=exec_q,
    237         )
    238         res_usm = result.get_array()
--> 239     ht_fft_event, fft_event = fi._fft_out_of_place(
    240         dsc, a_usm, res_usm, forward, depends=dep_evs
    241     )
    242 _manager.add_event_pair(ht_fft_event, fft_event)
    244 if not isinstance(result, dpnp_array):

ValueError: Memory addressed by the output array is not sufficiently ample.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Zero strides in the case of broadcasting will also take that path, so the comment is not fully correct.

max_disp = sum(
st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0
)
if (max_disp + 1) > a.size:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It'd be helpful to add dedicated tests covering both copy path and no-copy path with transposed/F-contig complex input.

a = dpnp.ascontiguousarray(a)
Comment thread
ndgrigorian marked this conversation as resolved.

# w/a for cuFFT to avoid "Invalid strides" error when
# the last dimension is 1 and there are multiple axes
Expand Down
Loading