fft_2x4.cpp#

The file fft_2x4.cpp shows a 2D complex time, complex frequency FFT. It also illustrates passing a Factory (which is owned by a unique pointer) to a function.


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
*  Copyright (C) 2023--2025, High Performance Kernels LLC                     *
*                                                                             *
*    This software and the related documents are provided as is, WITHOUT ANY  *
*  WARRANTY, without even the implied warranty of MERCHANTABILITY or FITNESS  *
*  FOR A PARTICULAR PURPOSE.                                                  *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <cassert>
#include <iostream>
#include <string>

#include <hpk/fft/makeFactory.hpp>

// Prints data formatted into rows and columns, with line continuations if
// necessary.  It assumes that real and imaginary elements (of type 'fp_t')
// are interleaved in memory.
template<typename fp_t>
void printComplexData(std::string label, const fp_t* data, int rows, int cols) {
    std::cout << label << ":\n";
    for (int i = 0; i < rows; ++i) {
        std::cout << "        ";
        for (int j = 0; j < cols; ++j) {
            if (j % 8 == 0 && j > 0) std::cout << " \\\n        ";
            std::cout << data[(2 * cols * i) + (2 * j) + 0] << std::showpos
                      << data[(2 * cols * i) + (2 * j) + 1] << "i  "
                      << std::noshowpos;
        }
        std::cout << '\n';
    }
    std::cout << std::endl;
}

// This demonstrates how a factory can be passed to a function, with the
// function taking ownership of the factory.
void compute2x4(std::unique_ptr<hpk::fft::FactoryCC<float>> factory) {
    // This is just a toy problem.  Data layout is 2x4.
    constexpr long kRows = 2;
    constexpr long kCols = 4;

    // First, do the problem using an out-of-place 2D transform.
    // This is the fastest (and easiest) way to go.

    std::cout << "Example #1: Two dimensional, Out-of-place FFT.\n"
              << "~~~~~~~~~~  Rows=" << kRows << ", Cols=" << kCols << '\n';
    const float in[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                        1.0f, 1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f};
    float out[2 * kRows * kCols];
    printComplexData("in", in, kRows, kCols);

    auto fft_2D = factory->makeOoplace<2>({kRows, kCols});
    assert(fft_2D && "Error: makeOoplace() failed for fft_2D.");
    std::cout << *fft_2D << '\n';
    fft_2D->forwardCopy(in, out);
    printComplexData("out", out, kRows, kCols);

    // Now redo the problem using only 1D unit stride FFTs.

    std::cout << "Example #2: Same problem as in the previous example,\n"
              << "~~~~~~~~~~  but for fun we'll do it the slow way.\n";
    printComplexData("in", in, kRows, kCols);

    auto fft_rows = factory->makeOoplace<1>({kCols}, /*batch=*/kRows);
    assert(fft_rows && "Error: makeOoplace() failed for fft_rows.");
    std::cout << *fft_rows << '\n';
    fft_rows->forwardCopy(in, out);

    std::cout << "Transpose to a temporary array\n";
    float transposed[2 * kRows * kCols];
    for (int i = 0; i < kRows; ++i) {
        for (int j = 0; j < kCols; ++j) {
            transposed[2 * (j * kRows + i) + 0] = out[2 * (i * kCols + j) + 0];
            transposed[2 * (j * kRows + i) + 1] = out[2 * (i * kCols + j) + 1];
        }
    }

    auto fft_colsAsRows = factory->makeInplace<1>({kRows}, /*batch=*/kCols);
    assert(fft_colsAsRows && "Error: makeInplace() failed for fft_colsAsRows.");
    std::cout << *fft_colsAsRows << '\n';
    fft_colsAsRows->forward(transposed);

    std::cout << "Transpose to the out array\n";
    for (int i = 0; i < kRows; ++i) {
        for (int j = 0; j < kCols; ++j) {
            out[2 * (i * kCols + j) + 0] = transposed[2 * (j * kRows + i) + 0];
            out[2 * (i * kCols + j) + 1] = transposed[2 * (j * kRows + i) + 1];
        }
    }

    printComplexData("out", out, kRows, kCols);

    // Now re-do the problem yet again using 1D unit stride FFTs for the rows
    // and 1D strided FFTs for the columns.

    std::cout << "Example #3: Same problem as in the previous example,\n"
              << "~~~~~~~~~~  but avoiding the transposes.\n";
    printComplexData("in", in, kRows, kCols);

    // We will reuse fft_rows, which was made in the previous example.
    std::cout << *fft_rows << '\n';
    fft_rows->forwardCopy(in, out);

    // Now, make a strided 1D FFT where the distance between points in the
    // transform is the length of a row, that is 2 * kCols.
    // The batch here can be thought of as a SIMD vector of complex numbers.
    // The vector length (the batch size) is kCols, and the stride from one
    // batch element to the next is 2.
    // Recall that strides are measured in terms of real values (floats).
    auto fft_cols = factory->makeInplace<1>({{kRows, 2 * kCols}}, {kCols, 2});
    assert(fft_cols && "Error: makeInplace() failed for fft_cols.");
    std::cout << *fft_cols << '\n';
    fft_cols->forward(out);
    printComplexData("out", out, kRows, kCols);
}

int main() {
    // Make a factory for complex single precision time and frequency domains
    // that uses only one thread.
    auto factory = hpk::fft::makeFactory<float>({{hpk::Parameter::threads, 1}});
    if (factory) {
        std::cout << "Using " << *factory << "\n\n";
    } else {
        std::cout << "Error: makeFactory<float>() failed" << std::endl;
        return -1;
    }

    // Transfer ownership of the factory to the compute2x4() function, which
    // will do some FFTs and print the results.
    compute2x4(std::move(factory));
}