0

I need to transpose a sparse matrix (stored in CSR format) using cuSPARSE, but I get an “internal error”. I wrote my code by referring to “How to transpose a sparse matrix in cuSparse?” and https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2. To be clearer, I am trying to perform the transpose by converting the matrix from CSR format to CSC format.

I am running on Nvidia GeForce GTX 1080, with driver cuda_11.1.0. I am using Windows 10.

The following is my code. You can download the folder from https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/sparse2dense, and replace sparse2dense_example.c with my code. Then configure and build with CMake; this way you should be able to reproduce my problem.

#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSparseToDense
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE

// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the line number, the error string and the numeric
// code, then returns EXIT_FAILURE from the enclosing function.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    const cudaError_t cuda_rc = (func);                                        \
    if (cudaSuccess != cuda_rc) {                                              \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(cuda_rc), cuda_rc);                \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Evaluates a cuSPARSE call exactly once. If the result is anything other than
// CUSPARSE_STATUS_SUCCESS, prints the line number, the error string and the
// numeric code, then returns EXIT_FAILURE from the enclosing function.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    const cusparseStatus_t sparse_rc = (func);                                 \
    if (CUSPARSE_STATUS_SUCCESS != sparse_rc) {                                \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(sparse_rc), sparse_rc);        \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

int main(void) {
    // CUSPARSE APIs
    cusparseHandle_t     handle = NULL;
    cusparseStatus_t status = (cusparseCreate(&handle));
    if (status != CUSPARSE_STATUS_SUCCESS) {
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, cusparseGetErrorString(status), status);
    }
    
    // Initialize matrix A
    // this matrix is the same as https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/sparse2dense/sparse2dense_example.c
    int   num_rows = 5;
    int   num_cols = 4;
    int   nnz = 11;
    int   h_csr_offsets[] = { 0, 3, 4, 7, 9, 11 };
    int   h_csr_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3, 1, 2 };
    float h_csr_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
                               7.0f, 8.0f, 9.0f, 10.0f, 11.0f };
    // Device memory management
    int* d_csr_offsets, * d_csr_columns;
    float* d_csr_values;
    CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets, (num_rows + 1) * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_columns, nnz * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_values, nnz * sizeof(float)))

    CHECK_CUDA(cudaMemcpy(d_csr_offsets, h_csr_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice))
    CHECK_CUDA(cudaMemcpy(d_csr_columns, h_csr_columns, nnz * sizeof(int), cudaMemcpyHostToDevice))
    CHECK_CUDA(cudaMemcpy(d_csr_values, h_csr_values, nnz * sizeof(float), cudaMemcpyHostToDevice))

    // Memory allocation of transpose A
    int* d_csr_offsets_AT, * d_csr_columns_AT;
    float* d_csr_values_AT;
    //first allocate memory to ATT
    CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets_AT, (num_cols + 1) * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_columns_AT, nnz * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_values_AT, nnz * sizeof(float)))

    size_t buffer_temp_size;
    cusparseCsr2cscEx2_bufferSize(
        handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
        d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_temp_size);
    void* buffer_temp = NULL;
    printf("buffer_temp_size is %zd\n", buffer_temp_size);
    CHECK_CUDA(cudaMalloc(&buffer_temp, buffer_temp_size))
    CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
        d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, buffer_temp))
}

1 Answer 1

1

The error is due to the fact that you are passing pointers to host data, to a routine that intends to work on device data:

cusparseCsr2cscEx2_bufferSize(
    handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
                                     ^             ^              ^

and

CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
                                                                   ^             ^              ^

When I change those instances to your allocated device data:

d_csr_values, d_csr_offsets, d_csr_columns

the "internal error" that you are asking about goes away, according to my testing.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.