0

I am trying to compare how faster adding vectors in parallel is than not non-parallel adding. In order for parallel-adding to be faster, I need big numbers. When my N is 10000, I still can run it. However, when my N is 100000, I get

Unhandled exception at 0x00D25B89 in basicCuda.exe: 0xC00000FD: Stack overflow (parameters: 0x00000000, 0x002F2000)

How do I fix this issue?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <chrono>
cudaError_t addWithCuda(int *c, const int *a, const int *b, uint64_t N);

__global__ void addKernel(int *c, const int *a, const int *b, uint64_t N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N) {
        c[i] = a[i] * b[i];
    }
}


void randomizeArr(int arr[],  int size) {

    unsigned int randNum;
    srand(time(NULL));
    for (int i = 0; i < size; i++) {
        randNum = rand() % 100 + 1;
        arr[i] = randNum;
    }
}

void addWithCPU(int c[], int a[], int b[], int size) {
    for (int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
}

#define N (10000) // Number of elements each array has
#define M 1024 // 512 Threads Per Block
int main()
{
    const uint64_t arSize = N;
    int a[arSize] = { 0 };
    int b[arSize] = { 0 };
    randomizeArr(a, arSize);
    randomizeArr(b, arSize);
    int c[arSize] = { 0 };
    int d[arSize] = { 0 };

    // Add vectors in parallel.
    int iterations = 100;
    cudaError cudaStatus;
    auto begin = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < iterations; ++i)
    {
        cudaStatus = addWithCuda(c, a, b, arSize);
    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
    std::cout << "Parallel : " << duration / iterations << "ns." << std::endl;

    // Add vectors NOT in parallel.
    auto begin2 = std::chrono::high_resolution_clock::now();
    for (uint32_t i = 0; i < iterations; ++i)
    {
        addWithCPU(d, a, b, arSize);
    }
    auto end2 = std::chrono::high_resolution_clock::now();
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end2 - begin2).count();
    std::cout << "Not Parallel : " << duration2 / iterations << "ns." << std::endl;


    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        std::getchar();
        return 1;
    }


    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    std::getchar();

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, uint64_t size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<(N + M - 1)/ M, M>>>(dev_c, dev_a, dev_b, N);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
0

1 Answer 1

2

All arrays:

int a[arSize]
int b[arSize]
int c[arSize]
int d[arSize]

are created on stack in function "main". Given arSize = 100000 and sizeof(int) = 4, you ask to allocate 1600000 bytes (1.5 MB), which may require tweaking with compiler or OS parameters to allow a stack that big.

Instead, you could allocate memory with new:

int* a = new int[arSize]();

(note that all array values will be initialized to 0, see also c++ initial value of dynamic array)

Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.