/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
*  Copyright (C) 2023--2026, High Performance Kernels LLC                     *
*                                                                             *
*  This software and the related documents are High Performance Kernels LLC   *
*  copyrighted materials, and your use of them is governed by the express     *
*  license under which they were provided to you (License).                   *
*  Unless the License provides otherwise, you may not use, copy, reproduce,   *
*  modify, disclose, transmit, publish, or distribute this software or the    *
*  related documents without prior written permission from High Performance   *
*  Kernels LLC.                                                               *
*                                                                             *
*    This software and the related documents are provided as is, WITHOUT ANY  *
*  WARRANTY, without even the implied warranty of MERCHANTABILITY or FITNESS  *
*  FOR A PARTICULAR PURPOSE.                                                  *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#ifndef HPK_FFT_FFT_HPP_INCLUDED
#define HPK_FFT_FFT_HPP_INCLUDED

/// \file
/// \brief This header declares classes that compute Fast Fourier Transforms.

#include <cstddef>  // std::size_t
#include <memory>   // std::unique_ptr, std::allocator_traits
#include <new>
#include <ostream>
#include <string>
#include <type_traits>  // std::is_same_v
#include <utility>      // std::forward, std::declval

#include <hpk/alignedAllocator.hpp>
#include <hpk/complex_type_traits.hpp>

namespace hpk {
namespace fft {

/// \brief Abstract class for performing Fast Fourier Transforms in-place.
///
/// Instances of this class are returned by the `makeInplace()` function
/// of a `Factory`.
/// Note that all member functions are `const`-qualified.  An `Inplace`
/// object is immutable and can be shared in a multithreaded environment.
///
/// Every transform is offered through three convenience overloads (taking
/// an Allocator, a smart pointer, or a raw scratch pointer) which all
/// delegate to a single pure virtual function implemented by the derived
/// class.
///
/// \tparam fp_t   Real type used for math computations and scale factors.
/// \tparam time_t Type of data in the time domain.
/// \tparam freq_t Type of data in the frequency domain.
template<typename fp_t, typename time_t, typename freq_t> class Inplace {
 public:
    static_assert(!is_complex_v<fp_t>,
                  "The first template parameter must be real, not complex.");

    using mathType = fp_t;    ///< Type of math computations and scale factor
    using timeType = time_t;  ///< Type of data in the time domain
    using freqType = freq_t;  ///< Type of data in the frequency domain
    using realTimeType = remove_complex_t<timeType>;  ///< Real timeType
    using realFreqType = remove_complex_t<freqType>;  ///< Real freqType

    /// \brief Computes an unscaled in-place forward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param inout Pointer to the data to transform.  Its type can be
    ///              `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename inout_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void forward(inout_t* inout, Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        forward(inout, scratch);  // noexcept, so the cleanup below always runs
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes an unscaled in-place forward FFT.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    ///                Only `scratch.get()` is used; ownership stays with
    ///                the caller.
    template<typename inout_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void forward(inout_t* inout, const Smartptr& scratch) const noexcept {
        forward(inout, scratch.get());
    }

    /// \brief Computes an unscaled in-place forward FFT.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename inout_t>
    void forward(inout_t* inout, mathType* scratch) const noexcept {
        // Reject element types other than real or complex realTimeType.
        static_assert(std::is_same_v<remove_complex_t<inout_t>, realTimeType>);
        // View the data as real elements and dispatch to the virtual function.
        forward(reinterpret_cast<realTimeType*>(inout), scratch);
    }

    // Implemented by derived class.
    // Core of the unscaled forward transform: the templated `forward`
    // overloads delegate here after casting `inout` to `realTimeType*`.
    virtual void forward(realTimeType* inout,
                         mathType* scratch) const noexcept = 0;

    /// \brief Computes an unscaled in-place backward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param inout Pointer to the data to transform.  Its type can be
    ///              `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename inout_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void backward(inout_t* inout, Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        backward(inout, scratch);  // noexcept, so the cleanup always runs
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes an unscaled in-place backward FFT.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    ///                Only `scratch.get()` is used; ownership stays with
    ///                the caller.
    template<typename inout_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void backward(inout_t* inout, const Smartptr& scratch) const noexcept {
        backward(inout, scratch.get());
    }

    /// \brief Computes an unscaled in-place backward FFT.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename inout_t>
    void backward(inout_t* inout, mathType* scratch) const noexcept {
        // Reject element types other than real or complex realFreqType.
        static_assert(std::is_same_v<remove_complex_t<inout_t>, realFreqType>);
        // View the data as real elements and dispatch to the virtual function.
        backward(reinterpret_cast<realFreqType*>(inout), scratch);
    }

    // Implemented by derived class.
    // Core of the unscaled backward transform: the templated `backward`
    // overloads delegate here after casting `inout` to `realFreqType*`.
    virtual void backward(realFreqType* inout,
                          mathType* scratch) const noexcept = 0;

    /// \brief Computes a scaled (by a real value) in-place forward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param scale Pointer to a real-valued scaling factor.
    /// \param inout Pointer to the data to transform.  Its type can be
    ///              `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename inout_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void scaleForward(const mathType* scale, inout_t* inout,
                      Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        scaleForward(scale, inout, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes a scaled (by a real value) in-place forward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    ///                Only `scratch.get()` is used; ownership stays with
    ///                the caller.
    template<typename inout_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void scaleForward(const mathType* scale, inout_t* inout,
                      const Smartptr& scratch) const noexcept {
        scaleForward(scale, inout, scratch.get());
    }

    /// \brief Computes a scaled (by a real value) in-place forward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realTimeType*` or `std::complex<realTimeType>*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename inout_t>
    void scaleForward(const mathType* scale, inout_t* inout,
                      mathType* scratch) const noexcept {
        // Reject element types other than real or complex realTimeType.
        static_assert(std::is_same_v<remove_complex_t<inout_t>, realTimeType>);
        // View the data as real elements and dispatch to the virtual function.
        scaleForward(scale, reinterpret_cast<realTimeType*>(inout), scratch);
    }

    // Implemented by derived class.
    // Core of the scaled forward transform: the templated `scaleForward`
    // overloads delegate here after casting `inout` to `realTimeType*`.
    virtual void scaleForward(const mathType* scale, realTimeType* inout,
                              mathType* scratch) const noexcept = 0;

    /// \brief Computes a scaled (by a real value) in-place backward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param scale Pointer to a real-valued scaling factor.
    /// \param inout Pointer to the data to transform.  Its type can be
    ///              `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename inout_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void scaleBackward(const mathType* scale, inout_t* inout,
                       Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        scaleBackward(scale, inout, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes a scaled (by a real value) in-place backward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    ///                Only `scratch.get()` is used; ownership stays with
    ///                the caller.
    template<typename inout_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void scaleBackward(const mathType* scale, inout_t* inout,
                       const Smartptr& scratch) const noexcept {
        scaleBackward(scale, inout, scratch.get());
    }

    /// \brief Computes a scaled (by a real value) in-place backward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param inout   Pointer to the data to transform.  Its type can be
    ///                `realFreqType*` or `std::complex<realFreqType>*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename inout_t>
    void scaleBackward(const mathType* scale, inout_t* inout,
                       mathType* scratch) const noexcept {
        // Reject element types other than real or complex realFreqType.
        static_assert(std::is_same_v<remove_complex_t<inout_t>, realFreqType>);
        // View the data as real elements and dispatch to the virtual function.
        scaleBackward(scale, reinterpret_cast<realFreqType*>(inout), scratch);
    }

    // Implemented by derived class.
    // Core of the scaled backward transform: the templated `scaleBackward`
    // overloads delegate here after casting `inout` to `realFreqType*`.
    virtual void scaleBackward(const mathType* scale, realFreqType* inout,
                               mathType* scratch) const noexcept = 0;

 protected:
    // Only derived classes can construct an Inplace.
    Inplace() = default;

 public:
    virtual ~Inplace() = default;
    // Neither copyable nor movable.
    Inplace(const Inplace& f) = delete;
    Inplace& operator=(const Inplace& f) = delete;
    Inplace(Inplace&&) = delete;
    Inplace& operator=(Inplace&&) = delete;

    /// Returns the number of mathType real elements needed as scratch space.
    virtual std::size_t scratchSize() const noexcept = 0;

    /// \brief Returns the number of bytes needed as scratch space,
    ///        `sizeof(mathType) * scratchSize()`.
    std::size_t scratchSizeBytes() const noexcept {
        return sizeof(mathType) * scratchSize();
    }

    /// Returns an upper bound on the number of threads that could be used.
    virtual int maxThreads() const = 0;

    /// Returns a short string describing the transform.
    virtual std::string toString() const = 0;
};

/// \brief Abstract class for performing Fast Fourier Transforms out-of-place.
///
/// Instances of this class are returned by the `makeOoplace()` function
/// of a `Factory`.
/// Note that all member functions are `const`-qualified.  An `Ooplace`
/// object is immutable and can be shared in a multithreaded environment.
template<typename fp_t, typename time_t, typename freq_t> class Ooplace {
 public:
    static_assert(!is_complex_v<fp_t>,
                  "The first template parameter must be real, not complex.");

    using mathType = fp_t;    ///< Type of math computations and scale factor
    using timeType = time_t;  ///< Type of data in the time domain
    using freqType = freq_t;  ///< Type of data in the frequency domain
    using realTimeType = remove_complex_t<timeType>;  ///< Real timeType
    using realFreqType = remove_complex_t<freqType>;  ///< Real freqType

    /// \brief Computes an unscaled out-of-place forward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param in    Pointer to the input data.  Its type can be `timeType*`
    ///              or `realTimeType*` and can be `const`-qualified.
    /// \param out   Pointer to the output data to be written.  Its type can
    ///              be `freqType*` or `realFreqType*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename in_t,
            typename out_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void forwardCopy(const in_t* in, out_t* out,
                     Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        forwardCopy(in, out, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes an unscaled out-of-place forward FFT.
    /// \param in      Pointer to the input data.  Its type can be `timeType*`
    ///                or `realTimeType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `freqType*` or `realFreqType*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    template<typename in_t, typename out_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void forwardCopy(const in_t* in, out_t* out,
                     const Smartptr& scratch) const noexcept {
        forwardCopy(in, out, scratch.get());
    }

    /// \brief Computes an unscaled out-of-place forward FFT.
    /// \param in      Pointer to the input data.  Its type can be `timeType*`
    ///                or `realTimeType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `freqType*` or `realFreqType*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename in_t, typename out_t>
    void forwardCopy(const in_t* in, out_t* out,
                     mathType* scratch) const noexcept {
        static_assert(std::is_same_v<in_t, timeType>
                      || std::is_same_v<in_t, realTimeType>);
        static_assert(std::is_same_v<out_t, freqType>
                      || std::is_same_v<out_t, realFreqType>);
        forwardCopy(reinterpret_cast<const realTimeType*>(in),
                    reinterpret_cast<realFreqType*>(out), scratch);
    }

    // Implemented by derived class.
    // Core of the unscaled out-of-place forward transform: the templated
    // `forwardCopy` overloads delegate here after casting `in` to
    // `const realTimeType*` and `out` to `realFreqType*`.  The allocator
    // overload passes a null `scratch` when `scratchSize()` is zero.
    virtual void forwardCopy(const realTimeType* in, realFreqType* out,
                             mathType* scratch) const noexcept = 0;

    /// \brief Computes an unscaled out-of-place backward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param in    Pointer to the input data.  Its type can be `freqType*`
    ///              or `realFreqType*` and can be `const`-qualified.
    /// \param out   Pointer to the output data to be written.  Its type can
    ///              be `timeType*` or `realTimeType*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename in_t,
            typename out_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void backwardCopy(const in_t* in, out_t* out,
                      Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        backwardCopy(in, out, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes an unscaled out-of-place backward FFT.
    /// \param in      Pointer to the input data.  Its type can be `freqType*`
    ///                or `realFreqType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `timeType*` or `realTimeType*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    template<typename in_t, typename out_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void backwardCopy(const in_t* in, out_t* out,
                      const Smartptr& scratch) const noexcept {
        backwardCopy(in, out, scratch.get());
    }

    /// \brief Computes an unscaled out-of-place backward FFT.
    /// \param in      Pointer to the input data.  Its type can be `freqType*`
    ///                or `realFreqType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `timeType*` or `realTimeType*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename in_t, typename out_t>
    void backwardCopy(const in_t* in, out_t* out,
                      mathType* scratch) const noexcept {
        static_assert(std::is_same_v<in_t, freqType>
                      || std::is_same_v<in_t, realFreqType>);
        static_assert(std::is_same_v<out_t, timeType>
                      || std::is_same_v<out_t, realTimeType>);
        backwardCopy(reinterpret_cast<const realFreqType*>(in),
                     reinterpret_cast<realTimeType*>(out), scratch);
    }

    // Implemented by derived class.
    // Core of the unscaled out-of-place backward transform: the templated
    // `backwardCopy` overloads delegate here after casting `in` to
    // `const realFreqType*` and `out` to `realTimeType*`.  The allocator
    // overload passes a null `scratch` when `scratchSize()` is zero.
    virtual void backwardCopy(const realFreqType* in, realTimeType* out,
                              mathType* scratch) const noexcept = 0;

    /// \brief Computes a scaled (by a real value) out-of-place forward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param scale Pointer to a real-valued scaling factor.
    /// \param in    Pointer to the input data.  Its type can be `timeType*`
    ///              or `realTimeType*` and can be `const`-qualified.
    /// \param out   Pointer to the output data to be written.  Its type can
    ///              be `freqType*` or `realFreqType*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename in_t,
            typename out_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void scaleForwardCopy(const mathType* scale, const in_t* in, out_t* out,
                          Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        scaleForwardCopy(scale, in, out, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Computes a scaled (by a real value) out-of-place forward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param in      Pointer to the input data.  Its type can be `timeType*`
    ///                or `realTimeType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `freqType*` or `realFreqType*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    template<typename in_t, typename out_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void scaleForwardCopy(const mathType* scale, const in_t* in, out_t* out,
                          const Smartptr& scratch) const noexcept {
        scaleForwardCopy(scale, in, out, scratch.get());
    }

    /// \brief Computes a scaled (by a real value) out-of-place forward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param in      Pointer to the input data.  Its type can be `timeType*`
    ///                or `realTimeType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `freqType*` or `realFreqType*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename in_t, typename out_t>
    void scaleForwardCopy(const mathType* scale, const in_t* in, out_t* out,
                          mathType* scratch) const noexcept {
        static_assert(std::is_same_v<in_t, timeType>
                      || std::is_same_v<in_t, realTimeType>);
        static_assert(std::is_same_v<out_t, freqType>
                      || std::is_same_v<out_t, realFreqType>);
        scaleForwardCopy(scale, reinterpret_cast<const realTimeType*>(in),
                         reinterpret_cast<realFreqType*>(out), scratch);
    }

    // Implemented by derived class.
    // Core of the scaled out-of-place forward transform: the templated
    // `scaleForwardCopy` overloads delegate here after casting `in` to
    // `const realTimeType*` and `out` to `realFreqType*`.  The allocator
    // overload passes a null `scratch` when `scratchSize()` is zero.
    virtual void scaleForwardCopy(const mathType* scale, const realTimeType* in,
                                  realFreqType* out,
                                  mathType* scratch) const noexcept = 0;

    /// \brief Computes a scaled (by a real value) out-of-place backward FFT.
    ///
    /// When `scratchSize()` is nonzero, that many `mathType` elements are
    /// obtained from `alloc` before the transform and returned to it
    /// afterwards.  When it is zero the allocator is not used and a null
    /// scratch pointer is passed on.
    /// \param scale Pointer to a real-valued scaling factor.
    /// \param in    Pointer to the input data.  Its type can be `freqType*`
    ///              or `realFreqType*` and can be `const`-qualified.
    /// \param out   Pointer to the output data to be written.  Its type can
    ///              be `timeType*` or `realTimeType*`.
    /// \param alloc Optionally provides an Allocator which will be used if
    ///              scratch memory is needed.  If omitted, an instance of
    ///              the function's first template parameter is constructed.
    ///              If that also is not specified, the default is
    ///              `hpk::AlignedAllocator`.
    /// \note This overload is not `noexcept`: an exception thrown by the
    ///       allocator's `allocate()` propagates to the caller.
    template<
            typename Allocator = AlignedAllocator<mathType>, typename in_t,
            typename out_t,
            std::enable_if_t<
                    std::is_same_v<typename std::decay_t<Allocator>::value_type,
                                   mathType>
                            && std::is_same_v<decltype(std::declval<Allocator>()
                                                               .allocate(0)),
                                              mathType*>,
                    bool> = true>
    void scaleBackwardCopy(const mathType* scale, const in_t* in, out_t* out,
                           Allocator&& alloc = Allocator()) const {
        using ATraits = std::allocator_traits<std::decay_t<Allocator>>;
        static_assert(std::is_same_v<typename ATraits::value_type, mathType>,
                      "Allocator::value_type must be mathType");
        mathType* scratch = nullptr;
        const std::size_t n = scratchSize();
        if (n) {
            // Keep exactly the pointer allocate() returned: it is the one
            // that has to be handed back to deallocate().
            scratch = ATraits::allocate(alloc, n);
            // Start the elements' lifetimes one by one.  Array placement-new
            // is avoided because it was historically permitted to reserve an
            // unspecified amount of array overhead in front of the elements.
            std::uninitialized_default_construct_n(scratch, n);
        }
        // noexcept, so the cleanup below always runs
        scaleBackwardCopy(scale, in, out, scratch);
        if (n) {
            std::destroy_n(scratch, n);
            ATraits::deallocate(alloc, scratch, n);
        }
    }

    /// \brief Out-of-place backward FFT whose result is scaled by a real
    ///        value; scratch memory is held by a smart pointer.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param in      Pointer to the input data.  Its type can be `freqType*`
    ///                or `realFreqType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `timeType*` or `realTimeType*`.
    /// \param scratch Smart pointer (e.g., `std::unique_ptr`) owning
    ///                `scratchSize()` elements of `mathType` to be used as
    ///                needed for temporary storage in computing the FFT.
    ///
    /// This overload participates only when `Smartptr::element_type` is
    /// `mathType` and `get()` yields a `mathType*`.  The smart pointer is only
    /// read through `get()`; the call is then forwarded with that raw
    /// pointer as the scratch argument.
    template<typename in_t, typename out_t, typename Smartptr,
             std::enable_if_t<
                     std::is_same_v<typename Smartptr::element_type, mathType>
                             && std::is_same_v<
                                     decltype(std::declval<Smartptr>().get()),
                                     mathType*>,
                     bool> = true>
    void scaleBackwardCopy(const mathType* scale, const in_t* in, out_t* out,
                           const Smartptr& scratch) const noexcept {
        auto rawScratch = scratch.get();
        scaleBackwardCopy(scale, in, out, rawScratch);
    }

    /// \brief Computes a scaled (by a real value) out-of-place backward FFT.
    /// \param scale   Pointer to a real-valued scaling factor.
    /// \param in      Pointer to the input data.  Its type can be `freqType*`
    ///                or `realFreqType*` and can be `const`-qualified.
    /// \param out     Pointer to the output data to be written.  Its type can
    ///                be `timeType*` or `realTimeType*`.
    /// \param scratch Raw pointer to `scratchSize()` elements of `mathType`
    ///                to be used as needed for temporary storage.
    template<typename in_t, typename out_t>
    void scaleBackwardCopy(const mathType* scale, const in_t* in, out_t* out,
                           mathType* scratch) const noexcept {
        static_assert(std::is_same_v<in_t, freqType>
                      || std::is_same_v<in_t, realFreqType>);
        static_assert(std::is_same_v<out_t, timeType>
                      || std::is_same_v<out_t, realTimeType>);
        scaleBackwardCopy(scale, reinterpret_cast<const realFreqType*>(in),
                          reinterpret_cast<realTimeType*>(out), scratch);
    }

    // Implemented by derived class.
    // Scaled out-of-place backward transform expressed purely in terms of the
    // real element types: the public `scaleBackwardCopy` template that takes
    // a raw scratch pointer casts its data pointers to `realFreqType` /
    // `realTimeType` and then calls an overload with exactly this signature.
    // Declared `noexcept`, so an implementation must not let an exception
    // escape.
    virtual void scaleBackwardCopy(const mathType* scale,
                                   const realFreqType* in, realTimeType* out,
                                   mathType* scratch) const noexcept = 0;

 protected:
    // Default constructor, accessible only to derived classes (it sits in
    // the protected section).
    Ooplace() = default;

 public:
    /// Virtual destructor, so that an object can be destroyed through a
    /// pointer to `Ooplace`.
    virtual ~Ooplace() = default;
    // Copying and moving are both disabled: an `Ooplace` can be neither
    // copy/move-constructed nor copy/move-assigned.
    Ooplace(const Ooplace& f) = delete;
    Ooplace& operator=(const Ooplace& f) = delete;
    Ooplace(Ooplace&&) = delete;
    Ooplace& operator=(Ooplace&&) = delete;

    /// Returns the number of mathType real elements needed as scratch space.
    /// \return Element count (not bytes); see `scratchSizeBytes()` for the
    ///         size in bytes.  The overloads that take an allocator request
    ///         exactly this many elements, and none when the value is zero.
    virtual std::size_t scratchSize() const noexcept = 0;

    /// \brief Scratch-space requirement expressed in bytes.
    /// \return `scratchSize()` multiplied by `sizeof(mathType)`.
    std::size_t scratchSizeBytes() const noexcept {
        const std::size_t elementCount = scratchSize();
        return elementCount * sizeof(mathType);
    }

    /// Returns an upper bound on the number of threads that could be used.
    /// \return The bound as an `int`.  The function is pure virtual, so the
    ///         value is supplied by the concrete implementation.
    virtual int maxThreads() const = 0;

    /// Returns a short string describing the transform.
    /// \return The description as a `std::string`.  This is the text that
    ///         the `operator<<` overload for `Ooplace` writes to a stream.
    virtual std::string toString() const = 0;
};

/// Stream-insertion operator for an `Inplace`: writes the text returned by
/// `fft.toString()` to `os`.
/// \related Inplace
/// \return The result of the insertion, so that insertions can be chained.
template<typename fp_t, typename time_t, typename freq_t>
inline std::ostream& operator<<(std::ostream& os,
                                const Inplace<fp_t, time_t, freq_t>& fft) {
    const auto& description = fft.toString();
    return os << description;
}

/// Stream-insertion operator for an `Ooplace`: writes the text returned by
/// `fft.toString()` to `os`.
/// \related Ooplace
/// \return The result of the insertion, so that insertions can be chained.
template<typename fp_t, typename time_t, typename freq_t>
inline std::ostream& operator<<(std::ostream& os,
                                const Ooplace<fp_t, time_t, freq_t>& fft) {
    const auto& description = fft.toString();
    return os << description;
}

/// \brief Allocates the scratch memory required by an `Inplace` transform.
/// \related Inplace
/// \param fft   Transform whose `scratchSize()` determines how many elements
///              are requested.
/// \param alloc Optional allocator, forwarded to `allocateMemory()`.  When
///              omitted, a default-constructed `Allocator` is used, which is
///              `AlignedAllocator<fp_t>` unless specified otherwise.
/// \return `std::unique_ptr` that owns the allocated memory
///
/// Convenience wrapper for use with an instance of `hpk::fft::Inplace`:
/// it queries `fft.scratchSize()` and passes that count, together with the
/// allocator, to `allocateMemory()`, whose result is returned unchanged.
///
/// Example using the default `AlignedAllocator`:
///
///     auto factory = hpk::fft::makeFactory<float>();
///     auto fft = factory->makeInplace({1024});
///     auto scratch = allocateScratch(*fft);  // default 64B-aligned
///
/// Or, using an instance of an `AlignedAllocator`:
///
///     hpk::AlignedAllocator<float, 128> alloc;
///     auto scratch = allocateScratch(*fft, alloc);  // 128B-aligned
///
template<typename fp_t, typename Allocator = AlignedAllocator<fp_t>,
         typename time_t, typename freq_t>
[[nodiscard]] inline auto
allocateScratch(const Inplace<fp_t, time_t, freq_t>& fft,
                Allocator&& alloc = Allocator()) {
    const auto elementCount = fft.scratchSize();
    return ::hpk::allocateMemory<fp_t>(elementCount,
                                       std::forward<Allocator>(alloc));
}

/// \brief Allocates the scratch memory required by an `Ooplace` transform.
/// \related Ooplace
/// \param fft   Transform whose `scratchSize()` determines how many elements
///              are requested.
/// \param alloc Optional allocator, forwarded to `allocateMemory()`.  When
///              omitted, a default-constructed `Allocator` is used, which is
///              `AlignedAllocator<fp_t>` unless specified otherwise.
/// \return `std::unique_ptr` that owns the allocated memory
///
/// Convenience wrapper for use with an instance of `hpk::fft::Ooplace`:
/// it queries `fft.scratchSize()` and passes that count, together with the
/// allocator, to `allocateMemory()`, whose result is returned unchanged.
///
/// Example using the default `AlignedAllocator`:
///
///     auto factory = hpk::fft::makeFactory<float>();
///     auto fft = factory->makeOoplace({1024});
///     auto scratch = allocateScratch(*fft);  // default 64B-aligned
///
/// Or, using an instance of an `AlignedAllocator`:
///
///     hpk::AlignedAllocator<float, 128> alloc;
///     auto scratch = allocateScratch(*fft, alloc);  // 128B-aligned
///
template<typename fp_t, typename Allocator = AlignedAllocator<fp_t>,
         typename time_t, typename freq_t>
[[nodiscard]] inline auto
allocateScratch(const Ooplace<fp_t, time_t, freq_t>& fft,
                Allocator&& alloc = Allocator()) {
    const auto elementCount = fft.scratchSize();
    return ::hpk::allocateMemory<fp_t>(elementCount,
                                       std::forward<Allocator>(alloc));
}

/// \brief Convenience type alias for in-place transforms having complex time
///        and complex frequency domains.
/// \tparam fp_t Type passed as the first template argument of `Inplace`;
///              the time- and frequency-domain arguments are both
///              `std::complex<fp_t>`.
template<typename fp_t>
using InplaceCC = Inplace<fp_t, std::complex<fp_t>, std::complex<fp_t>>;

/// \brief Convenience type alias for out-of-place transforms having complex
///        time and complex frequency domains.
/// \tparam fp_t Type passed as the first template argument of `Ooplace`;
///              the time- and frequency-domain arguments are both
///              `std::complex<fp_t>`.
template<typename fp_t>
using OoplaceCC = Ooplace<fp_t, std::complex<fp_t>, std::complex<fp_t>>;

/// \brief Convenience type alias for in-place transforms having real time
///        domain and complex frequency domain.
/// \tparam fp_t Type passed as the first template argument of `Inplace`;
///              the time-domain argument is `fp_t` itself and the
///              frequency-domain argument is `std::complex<fp_t>`.
template<typename fp_t>
using InplaceRC = Inplace<fp_t, fp_t, std::complex<fp_t>>;

/// \brief Convenience type alias for out-of-place transforms having real time
///        domain and complex frequency domain.
/// \tparam fp_t Type passed as the first template argument of `Ooplace`;
///              the time-domain argument is `fp_t` itself and the
///              frequency-domain argument is `std::complex<fp_t>`.
template<typename fp_t>
using OoplaceRC = Ooplace<fp_t, fp_t, std::complex<fp_t>>;

}  // namespace fft
}  // namespace hpk

#endif  // HPK_FFT_FFT_HPP_INCLUDED
