I am trying to pass a float4 as argument to my cuda kernel (by value) using PyCUDA’s make_float4(). But there seems to be some misalignment when the data is transferred to the kernel. If I read the output for an input (1,2,3,4) I instead get (3,4,0,0). This happens with int4 as well, but int3 and float3 work just fine.
Minimal code to reproduce error in Google Colab:
# --- Minimal PyCUDA Test ---
import pycuda.driver as drv
import pycuda.compiler
import pycuda.gpuarray as gpa
import numpy as np
import pycuda.autoinit
minimal_kernel_code = """
__global__ void write_constant(
int* output,
const int4 test
) {
output[0] = test.x;
output[1] = test.y;
output[2] = test.z;
output[3] = test.w;
}
"""
module_test = pycuda.compiler.SourceModule(minimal_kernel_code)
write_constant_kernel = module_test.get_function("write_constant")
test_gpu_mem = drv.mem_alloc(4 * np.int32().nbytes)
write_constant_kernel(
test_gpu_mem,
gpa.vec.make_int4(1,2,3,4), # Constant value to write
block=(1, 1, 1),
grid=(1, 1)
)
test_cpu_mem = np.empty(4, dtype=np.int32)
drv.memcpy_dtoh(test_cpu_mem, test_gpu_mem)
print(test_cpu_mem)
The expected output would be [1,2,3,4] but it is [3,4,0,0].