function cusparseScsr2csc in cuSPARSE library return strange result

Question

I want to test the cusparseScsr2csc which is a function used to convert a csr format matrix to a csc format matrix (or just say transpose a csr format matrix), so I write the code below to test it.

the wrapper:

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    
    // malloc space on video card and copy data
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    
    // use the API from cuSPARSE
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    
    // copy the data from device (video card) to host (CPU)
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    
    // return the answer
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

this is the CSR class:

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

and this is the code in the main:

int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();

the CSR format in the main is derived from the matrix below which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds):

the result I got (definitely wrong):

My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2

Full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
#include <vector>
#include <complex>

using namespace std;

template<class T>
struct CSR {
    vector<T> values;
    vector<int> row_ptrs;
    vector<int> col_inds;
    CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
    void out() {
        cout << "valuse = ";
        for (auto &t : values) cout << t << ' ';
        cout << "\nrow_ptrs = ";
        for (auto &t : row_ptrs) cout << t << ' ';
        cout << "\ncol_inds = ";
        for (auto &t : col_inds) cout << t << ' ';
        cout << endl;
    }
};

CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
    cusparseHandle_t handle;
    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseCreate(&handle);
    float *csr_values;
    int *csr_row_ptrs;
    int *csr_col_inds;
    float *csc_values;
    int *csc_col_ptrs;
    int *csc_row_inds;
    cudaMalloc(&csr_values, sizeof(float) * nnz);
    cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
    cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
    cudaMalloc(&csc_values, sizeof(float) * nnz);
    cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
    cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
    cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
    cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
    cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
    st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
        csr_col_inds, csc_values, csc_row_inds,
        csc_col_ptrs, copyValues, idxBase);
    vector<float> res_values;
    vector<int> res_row_ptrs, res_col_inds;
    res_row_ptrs.resize(n + 1);
    res_col_inds.resize(nnz);
    res_values.resize(nnz);
    cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
    return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}

int main()
{
    int m = 4, n = 6, nnz = 8;
    float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
    int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
    int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };

    cusparseStatus_t st;
    auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);

    if (st == CUSPARSE_STATUS_SUCCESS) {
        cout << "success" << endl;
        res.out();
    }
    return 0;
}

this is the documentation page, the function cusparseScsr2csc is located in ch. 9. And I find the text below, it says the function executes asynchronously, maybe this is the problem, but I still don't know how to deal with it.

Edit:

I tried the solution mentioned by paleonix (add cudaDeviceSynchronize() right after the cusparseScsr2csc(...)), but still got the exact same wrong answer.

cudaDeviceSynchronize() is the function you need to use to get the results of any asynchronous operation in CUDA. — paleonix
– paleonix, Commented Jul 18, 2022 at 8:39
@paleonix Thanks for your reply. I have added cudaDeviceSynchronize() function right after the cusparseScsr2csc(...), but still got the same output... — yys_c
– yys_c, Commented Jul 18, 2022 at 8:45
Maybe you will find the problem using proper CUDA error checking. See also the Conjugate Gradient CUDA sample and this header which is used for error checking in the samples. — paleonix
– paleonix, Commented Jul 18, 2022 at 9:50
It might not make a difference for cusparseScsr2csc, but there are newer versions of the documentation you linked. For CUDA 9.2, see here. — paleonix
– paleonix, Commented Jul 18, 2022 at 13:09

Robert Crovella · Accepted Answer · 2022-07-18 14:56:32Z

2

The main problem is here:

cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

That should be:

cudaMemcpy(csr_col_inds, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);

A few other notes:

The function cusparseScsr2csc is no longer available in recent versions of CUDA (it was evidently deprecated and later removed). I assume this might be one reason you are using CUDA 9.2. One possible replacement would be cusparseCsr2cscEx2()
There is no particular need for an additional cudaDeviceSynchronize() here. The cudaMemcpy functions after the cusparse function call serve the same purpose.

answered Jul 18, 2022 at 14:56

Robert Crovella

154k12 gold badges254 silver badges300 bronze badges

Sign up to request clarification or add additional context in comments.

Collectives™ on Stack Overflow

function cusparseScsr2csc in cuSPARSE library return strange result

Edit:

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

Edit:

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related