I want to test the cusparseScsr2csc which is a function used to convert a csr format matrix to a csc format matrix (or just say transpose a csr format matrix), so I write the code below to test it.
the wrapper:
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
cusparseHandle_t handle;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseCreate(&handle);
// malloc space on video card and copy data
float *csr_values;
int *csr_row_ptrs;
int *csr_col_inds;
float *csc_values;
int *csc_col_ptrs;
int *csc_row_inds;
cudaMalloc(&csr_values, sizeof(float) * nnz);
cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
cudaMalloc(&csc_values, sizeof(float) * nnz);
cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
// use the API from cuSPARSE
st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
csr_col_inds, csc_values, csc_row_inds,
csc_col_ptrs, copyValues, idxBase);
// copy the data from device (video card) to host (CPU)
vector<float> res_values;
vector<int> res_row_ptrs, res_col_inds;
res_row_ptrs.resize(n + 1);
res_col_inds.resize(nnz);
res_values.resize(nnz);
cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
// return the answer
return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
this is the CSR class:
template<class T>
struct CSR {
vector<T> values;
vector<int> row_ptrs;
vector<int> col_inds;
CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
void out() {
cout << "valuse = ";
for (auto &t : values) cout << t << ' ';
cout << "\nrow_ptrs = ";
for (auto &t : row_ptrs) cout << t << ' ';
cout << "\ncol_inds = ";
for (auto &t : col_inds) cout << t << ' ';
cout << endl;
}
};
and this is the code in the main:
int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
res.out();
the CSR format in the main is derived from the matrix below which I want to transpose (A <=> values, IA <=> row_ptrs, JA <=> col_inds):

the result I got (definitely wrong):

My video card is Geforce MX150, and I use Visual Studio 15 2017 with CUDA 9.2
Full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
#include <vector>
#include <complex>
using namespace std;
template<class T>
struct CSR {
vector<T> values;
vector<int> row_ptrs;
vector<int> col_inds;
CSR(vector<T> &a, vector<int> &b, vector<int> &c) :values(a), row_ptrs(b), col_inds(c) {}
void out() {
cout << "valuse = ";
for (auto &t : values) cout << t << ' ';
cout << "\nrow_ptrs = ";
for (auto &t : row_ptrs) cout << t << ' ';
cout << "\ncol_inds = ";
for (auto &t : col_inds) cout << t << ' ';
cout << endl;
}
};
CSR<float> Csr2csc(int m, int n, int nnz, float *values, int *row_ptrs, int *col_inds, cusparseStatus_t &st) {
cusparseHandle_t handle;
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
cusparseCreate(&handle);
float *csr_values;
int *csr_row_ptrs;
int *csr_col_inds;
float *csc_values;
int *csc_col_ptrs;
int *csc_row_inds;
cudaMalloc(&csr_values, sizeof(float) * nnz);
cudaMalloc(&csr_row_ptrs, sizeof(int) * (m + 1));
cudaMalloc(&csr_col_inds, sizeof(int) * (nnz));
cudaMalloc(&csc_values, sizeof(float) * nnz);
cudaMalloc(&csc_col_ptrs, sizeof(int) * (n + 1));
cudaMalloc(&csc_row_inds, sizeof(int) * (nnz));
cudaMemcpy(csr_values, values, sizeof(float) * nnz, cudaMemcpyHostToDevice);
cudaMemcpy(csr_row_ptrs, row_ptrs, sizeof(int) * (m + 1), cudaMemcpyHostToDevice);
cudaMemcpy(csr_values, col_inds, sizeof(int) * nnz, cudaMemcpyHostToDevice);
st = cusparseScsr2csc(handle, m, n, nnz, csr_values, csr_row_ptrs,
csr_col_inds, csc_values, csc_row_inds,
csc_col_ptrs, copyValues, idxBase);
vector<float> res_values;
vector<int> res_row_ptrs, res_col_inds;
res_row_ptrs.resize(n + 1);
res_col_inds.resize(nnz);
res_values.resize(nnz);
cudaMemcpy(res_row_ptrs.data(), csc_col_ptrs, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(res_values.data(), csc_values, nnz * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(res_col_inds.data(), csc_row_inds, nnz * sizeof(int), cudaMemcpyDeviceToHost);
return CSR<float>(res_values, res_row_ptrs, res_col_inds);
}
int main()
{
int m = 4, n = 6, nnz = 8;
float values[] = { 10,20 ,30 ,4, 50, 60 ,70 ,80 };
int row_ptrs[] = { 0, 2 ,4 ,7, 8 };
int col_inds[] = { 0 ,1, 1, 3 ,2, 3, 4, 5 };
cusparseStatus_t st;
auto res = Csr2csc(m, n, nnz, values, row_ptrs, col_inds, st);
if (st == CUSPARSE_STATUS_SUCCESS) {
cout << "success" << endl;
res.out();
}
return 0;
}
this is the documentation page, the function cusparseScsr2csc is located in ch. 9.
And I find the text below, it says the function executes asynchronously, maybe this is the problem, but I still don't know how to deal with it.
Edit:
I tried the solution mentioned by paleonix (add cudaDeviceSynchronize() right after the cusparseScsr2csc(...)), but still got the exact same wrong answer.

cudaDeviceSynchronize()is the function you need to use to get the results of any asynchronous operation in CUDA.cudaDeviceSynchronize()function right after thecusparseScsr2csc(...), but still got the same output...cusparseScsr2csc, but there are newer versions of the documentation you linked. For CUDA 9.2, see here.