
Is there a way in Thrust to perform a reduce_by_key operation and a reduce (or, ideally, another reduce_by_key) in only one kernel call? Besides gaining computational speed, suppose I want to do this because the number of output values from the first reduce_by_key is too large to be stored in memory.

I have been wondering if transform_output_iterator could help here but have not found a solution.

A simple demonstration, but not my real use case, could be to find the minimum of the maximums of each row in a matrix, where that matrix is flattened and stored in a device_vector.
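For concreteness, here is a minimal sketch of the two-call baseline for that example (the names RowIndex and minOfRowMaxima are made up for illustration, assuming an R x C row-major matrix of ints):

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <limits>

// maps a flat element index to its row index
struct RowIndex {
    int C;
    __host__ __device__ int operator()(int i) const { return i / C; }
};

int minOfRowMaxima(const thrust::device_vector<int>& matrix, int R, int C){
    // keys are generated on the fly: element i belongs to row i / C
    auto keys = thrust::make_transform_iterator(
        thrust::make_counting_iterator<int>(0), RowIndex{C});

    // first kernel call: per-row maxima (R temporaries must be stored)
    thrust::device_vector<int> rowMaxima(R);
    thrust::reduce_by_key(
        keys, keys + R * C, matrix.begin(),
        thrust::make_discard_iterator(), rowMaxima.begin(),
        thrust::equal_to<int>{}, thrust::maximum<int>{});

    // second kernel call: minimum over the row maxima
    return thrust::reduce(rowMaxima.begin(), rowMaxima.end(),
                          std::numeric_limits<int>::max(), thrust::minimum<int>{});
}

The rowMaxima temporary is exactly what I would like to avoid.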

  • Thank you for your comment. I'm not sure your solution solves my example problem; I have to think about it more. Also, in general my segments will not all be of the same length. But in the meantime, regarding my comment about memory: you can imagine the matrix values are generated from a counting iterator and a transform iterator, and there are several billion of them (see the sketch after these comments). Commented May 18, 2023 at 12:57
  • Sorry, I misread. My description is for finding both the minima and the maxima, not the minima of the maxima. Commented May 18, 2023 at 13:31
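A sketch of the lazily generated input mentioned in the first comment (the GenerateValue functor and its formula are placeholders; the real generator would come from the application):

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

// placeholder generator; computes a value from its flat index
struct GenerateValue {
    __host__ __device__ int operator()(size_t i) const {
        return static_cast<int>((i * 2654435761ull) % 1000003);
    }
};

// values_begin can be passed to reduce_by_key as the values input;
// each of the billions of elements is computed on the fly, never stored
auto values_begin = thrust::make_transform_iterator(
    thrust::make_counting_iterator<size_t>(0), GenerateValue{});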

1 Answer


The following code computes the minimum of all row maxima with a fixed amount of temporary storage that holds a limited number of per-slot minima. Afterwards, a min reduce is performed to find the global minimum.

The idea is to update the minimum values directly via transform_output_iterator. This can be done via atomics (when the temporary minima are accessed through raw pointers) or via locks (when they are accessed through iterators; not shown in this answer).

To avoid atomic contention, the number of temporary minima should not be too small.

For 1G segments of size 1, i.e. one atomic operation per input element, I observe the following timings on an A100 GPU.

time approach 1 (standard): 13.2674 ms.
time approach 2 (fused): 38.0479 ms. (minimaSlots = 1)
time approach 2 (fused): 23.9251 ms. (minimaSlots = 1024)
time approach 2 (fused): 10.1109 ms. (minimaSlots = 1024 * 1024)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <iostream>
#include <limits>

template<size_t size>
struct UpdateMinimumOp{
    int* minPtr;

    UpdateMinimumOp(int* ptr):minPtr(ptr){}
    __device__
    int operator()(int value){
        // select the output slot for the minimum based on the global thread id
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t minPos = pos % size;

        atomicMin(minPtr + minPos, value);
        return value;
    }
};

int main(){
    cudaEvent_t a; cudaEventCreate(&a);
    cudaEvent_t b; cudaEventCreate(&b);
    float t; 

    size_t N =  1ull << 30;
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::sequence(keys.begin(), keys.end(), 0);
    thrust::sequence(values.begin(), values.end(), 1);

    //Approach 1 (for timing comparison): max reduce_by_key, then min reduce
    thrust::device_vector<int> maxima(N);

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        maxima.begin(),
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach1 = thrust::reduce(maxima.begin(), maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 1 (standard): " << t << " ms. minimum: " <<minimumApproach1 << "\n";


    //Approach 2: fuse the max reduce_by_key with the computation of per-slot minima of the maxima in minimaSlots temporaries, then min reduce the stored minima
    //constexpr size_t minimaSlots = 1; 
    //constexpr size_t minimaSlots = 1024; 
    constexpr size_t minimaSlots = 1024*1024;
   
    thrust::device_vector<int> minima_of_maxima(minimaSlots);
    thrust::fill(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max());

    auto minimaOfMaximaIterator = thrust::make_transform_output_iterator(
        thrust::make_discard_iterator(),
        UpdateMinimumOp<minimaSlots>{minima_of_maxima.data().get()}
    );

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        minimaOfMaximaIterator,
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach2 = thrust::reduce(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 2 (fused): " << t << " ms. minimum: " << minimumApproach2 << "\n";

    cudaEventDestroy(a);
    cudaEventDestroy(b);
}
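
For completeness, a rough sketch of the lock-based variant mentioned above (not part of the measured code; UpdateMinimumWithLockOp is an illustrative name, int* is kept for brevity, and the spinlock pattern assumes independent thread scheduling, i.e. Volta or newer):

template<size_t size>
struct UpdateMinimumWithLockOp{
    int* minPtr;
    int* lockPtr; // one lock word per slot, pre-initialized to 0

    __device__
    int operator()(int value){
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t slot = pos % size;

        // busy-wait until this slot's lock is acquired
        while(atomicCAS(lockPtr + slot, 0, 1) != 0){}
        __threadfence();
        if(value < minPtr[slot]){
            minPtr[slot] = value;
        }
        __threadfence();
        atomicExch(lockPtr + slot, 0); // release
        return value;
    }
};

With an iterator type in place of minPtr, the same pattern works for value types that atomicMin does not support, at the cost of per-slot serialization.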

5 Comments

Unless I'm misunderstanding, the first reduce_by_key is unnecessary, so this does solve the problem in one kernel call? Although the reliance on atomics is too restrictive for me; an ideal solution would avoid both atomics and raw memory pointers.
@Frimousse You can simply update the callback to store an iterator instead of a pointer and use a lock for serialization. But there has to be some serialization if you want to stick with that approach, be it atomics or something else.
@Frimousse You could also implement your own reduce by key: compute per-block minima and use a simple device-wide reduction to compute the global minimum. I am not going to write that for you. (A rough sketch of this idea follows after the comments.)
I see. The need for serialization makes this solution unusable for me, unfortunately.
@Frimousse I added some timings. The fused approach is faster if atomic contention can be reduced.
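
A rough sketch of the custom-kernel idea from the last comments (not the answerer's code; it assumes all rows have the same length and a block size of 256, and minOfRowMaximaKernel is an illustrative name):

#include <climits>

// each block strides over rows, computes the row maximum via a shared
// memory reduction, and keeps a running minimum of the maxima it has seen
__global__ void minOfRowMaximaKernel(const int* data, int* blockMinima,
                                     size_t numRows, size_t rowLength){
    __shared__ int smem[256];
    int runningMin = INT_MAX;

    for(size_t row = blockIdx.x; row < numRows; row += gridDim.x){
        int localMax = INT_MIN;
        for(size_t i = threadIdx.x; i < rowLength; i += blockDim.x){
            localMax = max(localMax, data[row * rowLength + i]);
        }
        smem[threadIdx.x] = localMax;
        __syncthreads();
        // block reduction for the row maximum
        for(int s = blockDim.x / 2; s > 0; s /= 2){
            if(threadIdx.x < s){
                smem[threadIdx.x] = max(smem[threadIdx.x], smem[threadIdx.x + s]);
            }
            __syncthreads();
        }
        if(threadIdx.x == 0) runningMin = min(runningMin, smem[0]);
        __syncthreads(); // smem is reused in the next iteration
    }
    // one write per block, no atomics anywhere
    if(threadIdx.x == 0) blockMinima[blockIdx.x] = runningMin;
}

Launched as minOfRowMaximaKernel<<<numBlocks, 256>>>(...), followed by a thrust::reduce over the numBlocks entries of blockMinima with thrust::minimum<int>{}, this yields the global minimum without any atomics or per-slot serialization.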
