
I am a little bit new to Thrust. I am trying to migrate the following code to make use of GPUs, but this one seems a little difficult:

#include <iostream>
#include <complex>
#include <random>
#include <omp.h>
#include <chrono>

using Complex = std::complex<double>;


void fill_random_complex(Complex *array, int n) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<double> dis(0.0, 1.0);

    for (int i = 0; i < n; ++i) {
        double real_part = dis(gen);
        double imag_part = dis(gen);
        array[i] = Complex(real_part, imag_part);
    }
}

void process_loop(
    Complex *ag,
    Complex *bg,
    double *c_distr,
    int *nbnd_loc,
    int *ngk,
    int l1_e, int l2_s, int l2_e,
    int kpt_pool_nloc, int gstart,
    int nimage, int idx, int pert_nglob,
    int ag_dim1, int ag_dim2, int ag_dim3,
    int bg_dim1, int bg_dim2, int bg_dim3,
    int c_distr_dim1) 
{
    auto start_time = std::chrono::high_resolution_clock::now();

    for (int il1 = 0; il1 < l1_e; ++il1) {
        for (int il2 = l2_s - 1; il2 < l2_e; ++il2) {
            int ig1 = nimage * il1 + idx;
            double reduce = 0.0;

            for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                int nbndval = nbnd_loc[iks];
                int npw = ngk[iks];

                for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                    for (int il3 = 0; il3 < npw; ++il3) {
                        // ag and bg are flattened 4D arrays with leading
                        // dimensions ag_dim1/ag_dim2 and bg_dim1/bg_dim2
                        int ag_index = il3 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = il3 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
                                + 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
                    }
                }

                if (gstart == 2) {
                    for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                        int ag_index = 0 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = 0 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
                    }
                }
            }

            int c_distr_index = ig1 * c_distr_dim1 + il2;
            c_distr[c_distr_index] = reduce;
        }
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed_time = end_time - start_time;
    std::cout << "Time for process_loop: " << elapsed_time.count() << " seconds" << std::endl;
}

int main() {
    const int nimg = 1;
    const int idx = 0;
    const int kpt_pool_nloc = 1;
    const int l1_e = 200;
    const int l2_s = 1;
    const int l2_e = 100;
    const int pert_nglob = 200;
    const int gstart = 2;

    const int ag_dim1 = 16984;
    const int ag_dim2 = 128;
    const int ag_dim3 = kpt_pool_nloc;
    const int ag_dim4 = l1_e;

    const int bg_dim1 = 16984;
    const int bg_dim2 = 128;
    const int bg_dim3 = kpt_pool_nloc;
    const int bg_dim4 = l2_e;

    const int c_distr_dim1 = pert_nglob;

    int *nbnd_loc = new int[kpt_pool_nloc];
    int *ngk      = new int[kpt_pool_nloc];

    nbnd_loc[0] = 128;  // Number of bands
    ngk[0] = 16984;     // Number of grid points

    int ag_size = ag_dim1 * ag_dim2 * ag_dim3 * ag_dim4;
    int bg_size = bg_dim1 * bg_dim2 * bg_dim3 * bg_dim4;
    int c_distr_size = c_distr_dim1 * pert_nglob;

    Complex *ag = new Complex[ag_size];
    Complex *bg = new Complex[bg_size];
    double *c_distr   = new double[c_distr_size];

    fill_random_complex(ag, ag_size);
    fill_random_complex(bg, bg_size);

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<double> dis(0.0, 1.0);

    for (int i = 0; i < c_distr_size; ++i) {
        c_distr[i] = dis(gen);
    }

    int nimage = nimg;

    process_loop(ag, bg, c_distr, nbnd_loc, ngk, l1_e, l2_s, l2_e, kpt_pool_nloc, gstart, nimage, idx, pert_nglob, ag_dim1, ag_dim2, ag_dim3, bg_dim1, bg_dim2, bg_dim3, c_distr_dim1);

    delete[] ag;
    delete[] bg;
    delete[] c_distr;
    delete[] nbnd_loc;
    delete[] ngk;

    return 0;
}

What I tried is implementing a functor for each of the two sums. First I restructured the loops, splitting the gstart == 2 correction out of the iks loop so that each sum gets its own loop nest:

            for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                int nbndval = nbnd_loc[iks];
                int npw = ngk[iks];

                for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                    for (int il3 = 0; il3 < npw; ++il3) {
                        int ag_index = il3 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = il3 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
                                + 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
                    }
                }
            }

            if (gstart == 2) {
                for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                    int nbndval = nbnd_loc[iks];
                    for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                        int ag_index = 0 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = 0 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
                    }
                }
            }

Then I defined a functor for each of the operations:

struct calculate_ag_bg {
    Complex* ag; Complex* bg;
    int npw;

    calculate_ag_bg(Complex* _ag, Complex* _bg, int _npw)
    : ag(_ag), bg(_bg), npw(_npw) {}

    // idx enumerates the (lbnd, il3) pairs of one (il1, il2, iks) slice;
    // ag and bg are expected to already point at that slice.
    __host__ __device__
    double operator()(int idx) const {
        int lbnd = idx / npw, il3 = idx % npw;
        double re = ag[il3 + lbnd * npw].real() * bg[il3 + lbnd * npw].real();
        double im = ag[il3 + lbnd * npw].imag() * bg[il3 + lbnd * npw].imag();
        return 2.0 * re + 2.0 * im;
    }
};

struct correct_ag_bg_prod_kernel {
    Complex* ag; Complex* bg; int npw;

    correct_ag_bg_prod_kernel(Complex* _ag, Complex* _bg, int _npw)
    : ag(_ag), bg(_bg), npw(_npw) {}

    // idx enumerates the bands; only the il3 == 0 column of each band
    // contributes to the gstart correction.
    __host__ __device__
    double operator()(int idx) const {
        return ag[idx * npw].real() * bg[idx * npw].real();
    }
};

and used thrust::transform_reduce to perform the reductions inside the (il1, il2) loop nest, but I got bad performance, which is to be expected. While kpt_pool_nloc is 1 in the code above, it can be larger, so the code has to accommodate that. Furthermore, neither nbnd_loc nor ngk is guaranteed to be uniform. Maybe there is a better way of doing this using transform_iterator or something of that sort.
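
To make the counting_iterator idea concrete, here is the direction I have been sketching: fold the gstart correction into the main functor (it only touches the il3 == 0 column), so that a single transform_reduce per (il1, il2, iks) slice replaces the two separate reductions. This is only a sketch; fused_dot and reduce_slice are names I made up, and I use thrust::complex<double> (layout-compatible with std::complex<double>) because std::complex members are not usable in device code:

#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>

using GpuComplex = thrust::complex<double>;

struct fused_dot {
    const GpuComplex* ag;  // points at the (iks, il1) slice of ag
    const GpuComplex* bg;  // points at the (iks, il2) slice of bg
    int npw;               // leading dimension of the slice
    int gstart;

    __host__ __device__
    double operator()(int idx) const {
        int lbnd = idx / npw, il3 = idx % npw;
        GpuComplex a = ag[il3 + lbnd * npw];
        GpuComplex b = bg[il3 + lbnd * npw];
        double r = 2.0 * (a.real() * b.real() + a.imag() * b.imag());
        if (gstart == 2 && il3 == 0)   // fold the correction into the same pass
            r -= a.real() * b.real();
        return r;
    }
};

// One reduction per slice; d_ag and d_bg are device pointers.
double reduce_slice(const GpuComplex* d_ag, const GpuComplex* d_bg,
                    int nbndval, int npw, int gstart) {
    thrust::counting_iterator<int> first(0);
    return thrust::transform_reduce(thrust::device, first, first + nbndval * npw,
                                    fused_dot{d_ag, d_bg, npw, gstart},
                                    0.0, thrust::plus<double>());
}

Even with the fusion, this still launches one reduction per (il1, il2, iks) triple and synchronizes on every call to return the result to the host, so I suspect it is not enough on its own.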

  • Use steady_clock for timing purposes, not high_resolution_clock. Take it from the library author. – Commented Aug 2, 2024 at 10:46
  • In general you have a multidimensional and irregular problem. Thrust is not a good fit to parallelize this code. Independent of Thrust, one would want to include the outer loops in a single kernel for more parallelism, but would then suffer from load imbalance due to the non-uniformity of nbnd_loc and ngk. This is not an insurmountable problem, but certainly not trivial to parallelize efficiently. For inspiration on solving the load imbalance you could check out sparse matrix-vector product kernels, which have a similar irregular transform-reduce structure. – Commented Aug 2, 2024 at 11:12
  • Hi @paleonix, sorry for replying late. nbnd_loc and ngk scale with the problem being solved; this code is part of a much larger code with domain decomposition, which is why it has to be somewhat generic, but for all practical purposes you can assume these values are the maximum (nbnd_loc, though, is always a power of 2). Regarding your second comment: if I understand correctly, yes, I am not sure how to map my loops because of nbnd_loc and ngk. Initially I thought I could collapse the outer two and maybe do a reduce_by_key, but I am not sure how to handle the innermost two. – Commented Aug 2, 2024 at 11:55
  • I would assume that the first big performance limiter is the synchronization done internally by Thrust. Even the thrust::cuda::par_nosync policy will not be able to avoid it, because Thrust reductions return the result to the host. Therefore I would switch to CUB algorithms instead. CUB is used in the backend of Thrust and is asynchronous/stream-aware by default. I.e. replace thrust::transform_reduce with cub::DeviceReduce::TransformReduce. – Commented Aug 2, 2024 at 13:04
  • CUB also has a CSR SpMV, but it is sadly not very flexible (you can't pass functors to customize the reduction op, etc.). For these kinds of irregular problems moderngpu is often cited. I don't know if one should really use it as-is, because it hasn't been updated in years, but it is certainly a good point of reference. – Commented Aug 2, 2024 at 13:09
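
Update: to make the CUB suggestion from the comments concrete, here is a minimal sketch of the substitution as I understand it (assuming CUB 2.x, where cub::DeviceReduce::TransformReduce is available; it reuses the fused_dot functor and GpuComplex alias from the sketch above):

#include <cub/device/device_reduce.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <thrust/functional.h>
#include <cuda_runtime.h>

// Writes the reduction for one (il1, il2, iks) slice into d_out, which points
// at one element of a device-resident c_distr, so no host round-trip is
// needed inside the loop nest.
void reduce_slice_cub(const GpuComplex* d_ag, const GpuComplex* d_bg,
                      int nbndval, int npw, int gstart,
                      double* d_out, cudaStream_t stream)
{
    cub::CountingInputIterator<int> first(0);
    fused_dot op{d_ag, d_bg, npw, gstart};

    // Usual CUB two-phase pattern: the first call only computes the scratch
    // size, the second performs the reduction. In the real loop nest the
    // scratch buffer should be allocated once and reused.
    void*  d_temp     = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceReduce::TransformReduce(d_temp, temp_bytes, first, d_out,
                                       nbndval * npw, thrust::plus<double>{},
                                       op, 0.0, stream);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::TransformReduce(d_temp, temp_bytes, first, d_out,
                                       nbndval * npw, thrust::plus<double>{},
                                       op, 0.0, stream);
    cudaFree(d_temp);
}

Since the result lands in device memory, the per-(il1, il2) reductions can be enqueued on a stream without synchronizing after each call, which seems to address the limiter described in the comments.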
