
I am a little bit new to Thrust. I am trying to migrate the following code to make use of GPUs, but this one seems a little difficult:

#include <iostream>
#include <complex>
#include <random>
#include <omp.h>
#include <chrono>

using Complex = std::complex<double>;


void fill_random_complex(Complex *array, int n) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<double> dis(0.0, 1.0);

    for (int i = 0; i < n; ++i) {
        double real_part = dis(gen);
        double imag_part = dis(gen);
        array[i] = Complex(real_part, imag_part);
    }
}

void process_loop(
    Complex *ag,
    Complex *bg,
    double *c_distr,
    int *nbnd_loc,
    int *ngk,
    int l1_e, int l2_s, int l2_e,
    int kpt_pool_nloc, int gstart,
    int nimage, int idx, int pert_nglob,
    int ag_dim1, int ag_dim2, int ag_dim3,
    int bg_dim1, int bg_dim2, int bg_dim3,
    int c_distr_dim1) 
{
    auto start_time = std::chrono::high_resolution_clock::now();

    for (int il1 = 0; il1 < l1_e; ++il1) {
        for (int il2 = l2_s - 1; il2 < l2_e; ++il2) {
            int ig1 = nimage * il1 + idx;
            double reduce = 0.0;

            for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                int nbndval = nbnd_loc[iks];
                int npw = ngk[iks];

                for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                    for (int il3 = 0; il3 < npw; ++il3) {
                        // ag and bg are flattened 4D arrays with leading
                        // dimensions ag_dim1/ag_dim2 and bg_dim1/bg_dim2
                        int ag_index = il3 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = il3 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
                                + 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
                    }
                }

                if (gstart == 2) {
                    for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                        int ag_index = 0 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = 0 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
                    }
                }
            }

            int c_distr_index = ig1 * c_distr_dim1 + il2;
            c_distr[c_distr_index] = reduce;
        }
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed_time = end_time - start_time;
    std::cout << "Time for process_loop: " << elapsed_time.count() << " seconds" << std::endl;
}

int main() {
    const int nimg = 1;
    const int idx = 0;
    const int kpt_pool_nloc = 1;
    const int l1_e = 200;
    const int l2_s = 1;
    const int l2_e = 100;
    const int pert_nglob = 200;
    const int gstart = 2;

    const int ag_dim1 = 16984;
    const int ag_dim2 = 128;
    const int ag_dim3 = kpt_pool_nloc;
    const int ag_dim4 = l1_e;

    const int bg_dim1 = 16984;
    const int bg_dim2 = 128;
    const int bg_dim3 = kpt_pool_nloc;
    const int bg_dim4 = l2_e;

    const int c_distr_dim1 = pert_nglob;

    int *nbnd_loc = new int[kpt_pool_nloc];
    int *ngk      = new int[kpt_pool_nloc];

    nbnd_loc[0] = 128;  // Number of bands
    ngk[0] = 16984;     // Number of grid points

    int ag_size = ag_dim1 * ag_dim2 * ag_dim3 * ag_dim4;
    int bg_size = bg_dim1 * bg_dim2 * bg_dim3 * bg_dim4;
    int c_distr_size = c_distr_dim1 * pert_nglob;

    Complex *ag = new Complex[ag_size];
    Complex *bg = new Complex[bg_size];
    double *c_distr   = new double[c_distr_size];

    fill_random_complex(ag, ag_size);
    fill_random_complex(bg, bg_size);

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<double> dis(0.0, 1.0);

    for (int i = 0; i < c_distr_size; ++i) {
        c_distr[i] = dis(gen);
    }

    int nimage = nimg;

    process_loop(ag, bg, c_distr, nbnd_loc, ngk, l1_e, l2_s, l2_e, kpt_pool_nloc, gstart, nimage, idx, pert_nglob, ag_dim1, ag_dim2, ag_dim3, bg_dim1, bg_dim2, bg_dim3, c_distr_dim1);

    delete[] ag;
    delete[] bg;
    delete[] c_distr;
    delete[] nbnd_loc;
    delete[] ngk;

    return 0;
}

What I tried is implementing a functor for each of the two sums. First I restructured the loops, splitting the gstart == 2 correction out of the iks loop so that each sum gets its own loop nest:

            for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                int nbndval = nbnd_loc[iks];
                int npw = ngk[iks];

                for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                    for (int il3 = 0; il3 < npw; ++il3) {
                        int ag_index = il3 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = il3 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce += 2.0 * std::real(ag[ag_index]) * std::real(bg[bg_index])
                                + 2.0 * std::imag(ag[ag_index]) * std::imag(bg[bg_index]);
                    }
                }
            }

            if (gstart == 2) {
                for (int iks = 0; iks < kpt_pool_nloc; ++iks) {
                    int nbndval = nbnd_loc[iks];
                    for (int lbnd = 0; lbnd < nbndval; ++lbnd) {
                        int ag_index = 0 + ag_dim1 * (lbnd + ag_dim2 * (iks + ag_dim3 * il1));
                        int bg_index = 0 + bg_dim1 * (lbnd + bg_dim2 * (iks + bg_dim3 * il2));

                        reduce -= std::real(ag[ag_index]) * std::real(bg[bg_index]);
                    }
                }
            }

Then I defined a functor for each of the operations:

struct calculate_ag_bg {
    Complex* ag; Complex* bg;
    int npw;

    calculate_ag_bg(Complex* _ag, Complex* _bg, int _npw)
    : ag(_ag), bg(_bg), npw(_npw) {}

    // idx enumerates the (lbnd, il3) pairs of one (il1, il2, iks) slice;
    // ag and bg are expected to already point at that slice.
    __host__ __device__
    double operator()(int idx) const {
        int lbnd = idx / npw, il3 = idx % npw;
        double re = ag[il3 + lbnd * npw].real() * bg[il3 + lbnd * npw].real();
        double im = ag[il3 + lbnd * npw].imag() * bg[il3 + lbnd * npw].imag();
        return 2.0 * re + 2.0 * im;
    }
};

struct correct_ag_bg_prod_kernel {
    Complex* ag; Complex* bg; int npw;

    correct_ag_bg_prod_kernel(Complex* _ag, Complex* _bg, int _npw)
    : ag(_ag), bg(_bg), npw(_npw) {}

    // idx enumerates the bands; only the il3 == 0 column of each band
    // contributes to the gstart correction.
    __host__ __device__
    double operator()(int idx) const {
        return ag[idx * npw].real() * bg[idx * npw].real();
    }
};

and used thrust::transform_reduce to perform the reductions inside the (il1, il2) loop nest, but I got bad performance, which is to be expected. While kpt_pool_nloc is 1 in the code above, it can be larger, so the code has to accommodate that. Furthermore, neither nbnd_loc nor ngk is guaranteed to be uniform. Maybe there is a better way of doing this using transform_iterator or something of that sort.
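
To make the counting_iterator idea concrete, here is the direction I have been sketching: fold the gstart correction into the main functor (it only touches the il3 == 0 column), so that a single transform_reduce per (il1, il2, iks) slice replaces the two separate reductions. This is only a sketch; fused_dot and reduce_slice are names I made up, and I use thrust::complex<double> (layout-compatible with std::complex<double>) because std::complex members are not usable in device code:

#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>

using GpuComplex = thrust::complex<double>;

struct fused_dot {
    const GpuComplex* ag;  // points at the (iks, il1) slice of ag
    const GpuComplex* bg;  // points at the (iks, il2) slice of bg
    int npw;               // leading dimension of the slice
    int gstart;

    __host__ __device__
    double operator()(int idx) const {
        int lbnd = idx / npw, il3 = idx % npw;
        GpuComplex a = ag[il3 + lbnd * npw];
        GpuComplex b = bg[il3 + lbnd * npw];
        double r = 2.0 * (a.real() * b.real() + a.imag() * b.imag());
        if (gstart == 2 && il3 == 0)   // fold the correction into the same pass
            r -= a.real() * b.real();
        return r;
    }
};

// One reduction per slice; d_ag and d_bg are device pointers.
double reduce_slice(const GpuComplex* d_ag, const GpuComplex* d_bg,
                    int nbndval, int npw, int gstart) {
    thrust::counting_iterator<int> first(0);
    return thrust::transform_reduce(thrust::device, first, first + nbndval * npw,
                                    fused_dot{d_ag, d_bg, npw, gstart},
                                    0.0, thrust::plus<double>());
}

Even with the fusion, this still launches one reduction per (il1, il2, iks) triple and synchronizes on every call to return the result to the host, so I suspect it is not enough on its own.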

  • Use steady_clock for timing purposes, not high_resolution_clock. Take it from the library author. – Commented Aug 2, 2024 at 10:46
  • In general you have a multidimensional and irregular problem. Thrust is not a good fit to parallelize this code. Independent of Thrust, one would want to include the outer loops in a single kernel for more parallelism, but would then suffer from load imbalance due to the non-uniformity of nbnd_loc and ngk. This is not an insurmountable problem, but certainly not trivial to parallelize efficiently. For inspiration on solving the load imbalance you could check out sparse matrix-vector product kernels, which have a similar irregular transform-reduce structure. – Commented Aug 2, 2024 at 11:12
  • Hi @paleonix, sorry for replying late. nbnd_loc and ngk scale with the problem being solved; this code is part of a much larger code with domain decomposition, which is why it has to be somewhat generic, but for all practical purposes you can assume these values are the maximum (nbnd_loc, though, is always a power of 2). Regarding your second comment: if I understand correctly, yes, I am not sure how to map my loops because of nbnd_loc and ngk. Initially I thought I could collapse the outer two and maybe do a reduce_by_key, but I am not sure how to handle the innermost two. – Commented Aug 2, 2024 at 11:55
  • I would assume that the first big performance limiter is the synchronization done internally by Thrust. Even the thrust::cuda::par_nosync policy will not be able to avoid it, because Thrust reductions return the result to the host. Therefore I would switch to CUB algorithms instead. CUB is used in the backend of Thrust and is asynchronous/stream-aware by default. I.e. replace thrust::transform_reduce with cub::DeviceReduce::TransformReduce. – Commented Aug 2, 2024 at 13:04
  • CUB also has a CSR SpMV, but it is sadly not very flexible (you can't pass functors to customize the reduction op, etc.). For these kinds of irregular problems moderngpu is often cited. I don't know if one should really use it as-is, because it hasn't been updated in years, but it is certainly a good point of reference. – Commented Aug 2, 2024 at 13:09
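
Update: to make the CUB suggestion from the comments concrete, here is a minimal sketch of the substitution as I understand it (assuming CUB 2.x, where cub::DeviceReduce::TransformReduce is available; it reuses the fused_dot functor and GpuComplex alias from the sketch above):

#include <cub/device/device_reduce.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <thrust/functional.h>
#include <cuda_runtime.h>

// Writes the reduction for one (il1, il2, iks) slice into d_out, which points
// at one element of a device-resident c_distr, so no host round-trip is
// needed inside the loop nest.
void reduce_slice_cub(const GpuComplex* d_ag, const GpuComplex* d_bg,
                      int nbndval, int npw, int gstart,
                      double* d_out, cudaStream_t stream)
{
    cub::CountingInputIterator<int> first(0);
    fused_dot op{d_ag, d_bg, npw, gstart};

    // Usual CUB two-phase pattern: the first call only computes the scratch
    // size, the second performs the reduction. In the real loop nest the
    // scratch buffer should be allocated once and reused.
    void*  d_temp     = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceReduce::TransformReduce(d_temp, temp_bytes, first, d_out,
                                       nbndval * npw, thrust::plus<double>{},
                                       op, 0.0, stream);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::TransformReduce(d_temp, temp_bytes, first, d_out,
                                       nbndval * npw, thrust::plus<double>{},
                                       op, 0.0, stream);
    cudaFree(d_temp);
}

Since the result lands in device memory, the per-(il1, il2) reductions can be enqueued on a stream without synchronizing after each call, which seems to address the limiter described in the comments.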
