I implemented a N-body simulation that combines gravity and SPH (smoothed particle hydrodynamics). I want to optimize the SPH part. I use spacial hashing for the neighborhood search. On the host side (the actual calculations are done on the GPU with OpenCL) I use this C++ code (calculate hashes and sort the particles by it - this part uses basically no CPU time, so no optimization necessary here):
std::vector<glm::dvec3> SPH_GPUf2(double t, const std::vector<glm::dvec3>* Pos, const std::vector<glm::dvec3>* Vel, const std::vector<double>* Mass, const std::vector<char>* Charges)
{
auto t_start = tic();
int NumParticles = Pos->size();
static std::vector<int> NeighborhoodCount;
std::vector<glm::vec3> Positions(NumParticles);
std::vector<glm::vec3> Velocities(NumParticles);
std::vector<float> Masses(NumParticles);
cl_bool AutoSmoothingRadius = Settings.Simulation.SPH.AutoSmoothingRadius;
for (int n = 0; n < NumParticles; n++)
{
Positions[n] = (*Pos)[n];
Velocities[n] = (*Vel)[n];
Masses[n] = (*Mass)[n];
}
float r0 = Settings.Simulation.SPH.smoothing_radius;
float r0inv = 1.0f / r0;
int NumBuckets = Settings.Simulation.SPH.NumBuckets;
std::vector<glm::vec3> acc(NumParticles);
// Calc bounding box
glm::vec3 bb_min = Positions[0], bb_max = Positions[0];
for (int n = 1; n < NumParticles; n++)
{
bb_min.x = std::min(bb_min.x, Positions[n].x);
bb_max.x = std::max(bb_max.x, Positions[n].x);
bb_min.y = std::min(bb_min.y, Positions[n].y);
bb_max.y = std::max(bb_max.y, Positions[n].y);
bb_min.z = std::min(bb_min.z, Positions[n].z);
bb_max.z = std::max(bb_max.z, Positions[n].z);
}
// Calculate and sort particles by hash value
glm::ivec3 gridDim;
gridDim.x = static_cast<int>(std::ceil((bb_max.x - bb_min.x) * r0inv));
gridDim.y = static_cast<int>(std::ceil((bb_max.y - bb_min.y) * r0inv));
gridDim.z = static_cast<int>(std::ceil((bb_max.z - bb_min.z) * r0inv));
std::vector<unsigned int> HashValues(NumParticles);
for (size_t i = 0; i < NumParticles; ++i)
{
const glm::vec3& pos = Positions[i];
unsigned int ix = static_cast<unsigned int>(std::floor((pos.x - bb_min.x) * r0inv));
unsigned int iy = static_cast<unsigned int>(std::floor((pos.y - bb_min.y) * r0inv));
unsigned int iz = static_cast<unsigned int>(std::floor((pos.z - bb_min.z) * r0inv));
HashValues[i] = (ix * 73856093u ^ iy * 19349663u ^ iz * 83492791u) % (unsigned int)NumBuckets;
}
// Index array for sorting
std::vector<int> Indices = sort_with_indices(HashValues);
// Sort particles
std::vector<glm::vec3> Temp(NumParticles), Temp3(NumParticles);
std::vector<float> Temp2(NumParticles);
for (size_t i = 0; i < NumParticles; i++)
{
Temp[i] = Positions[Indices[i]];
Temp3[i] = Velocities[Indices[i]];
Temp2[i] = Masses[Indices[i]];
}
Positions = std::move(Temp);
Masses = std::move(Temp2);
Velocities = std::move(Temp3);
// Calculate bucket ranges
std::vector<int> bucketRanges(2 * NumBuckets, -1);
for (int i = 0; i < NumParticles; i++)
{
uint32_t bucket = HashValues[i];
// If this is the first entry, store it
if (bucketRanges[2 * bucket] == -1)
bucketRanges[2 * bucket] = i;
// Update end index
bucketRanges[2 * bucket + 1] = i + 1;
}
Statistics.add("SPH prepare neighbor search", toc(t_start), "time");
t_start = tic();
// Calc density
cl_kernel kernel = Interface.getKernel("SPH_calc_density_kernelf2");
Interface.checkBufferSize("Positions", sizeof(glm::vec3) * NumParticles);
Interface.checkBufferSize("Velocities", sizeof(glm::vec3) * NumParticles);
Interface.checkBufferSize("Masses", sizeof(float) * NumParticles);
Interface.checkBufferSize("Acc", sizeof(glm::vec3) * NumParticles);
Interface.checkBufferSize("BucketRanges", sizeof(int) * NumBuckets * 2);
Interface.checkBufferSize("Densities", sizeof(float) * NumParticles);
Interface.checkBufferSize("SurfaceNormals", sizeof(glm::vec3) * NumParticles);
Interface.checkBufferSize("SizeNeighborhood", sizeof(int) * NumParticles);
Interface.writeBufferOCL("Positions", Positions.data(), sizeof(glm::vec3) * NumParticles);
Interface.writeBufferOCL("Velocities", Velocities.data(), sizeof(glm::vec3) * NumParticles);
Interface.writeBufferOCL("Masses", Masses.data(), sizeof(float) * NumParticles);
int zero = 0;
Interface.writeBufferOCL("BucketRanges", bucketRanges.data(), sizeof(int) * NumBuckets * 2);
cl_mem bufferPositions = Interface.getBufferOCL("Positions");
cl_mem bufferVelocities = Interface.getBufferOCL("Velocities");
cl_mem bufferMasses = Interface.getBufferOCL("Masses");
cl_mem bufferAcc = Interface.getBufferOCL("Acc");
cl_mem bufferDensities = Interface.getBufferOCL("Densities");
cl_mem bufferBucketRanges = Interface.getBufferOCL("BucketRanges");
cl_mem bufferSurfaceNormals = Interface.getBufferOCL("SurfaceNormals");
cl_mem bufferSizeNeighborhood = Interface.getBufferOCL("SizeNeighborhood");
cl_float3 bb_min_;
bb_min_.x = bb_min.x;
bb_min_.y = bb_min.y;
bb_min_.z = bb_min.z;
cl_int3 gridDim_;
gridDim_.x = gridDim.x;
gridDim_.y = gridDim.y;
gridDim_.z = gridDim.z;
Interface.setKernelArg(kernel, 0, sizeof(cl_mem), &bufferPositions);
Interface.setKernelArg(kernel, 1, sizeof(cl_mem), &bufferMasses);
Interface.setKernelArg(kernel, 2, sizeof(cl_mem), &bufferBucketRanges);
Interface.setKernelArg(kernel, 3, sizeof(cl_mem), &bufferDensities);
Interface.setKernelArg(kernel, 4, sizeof(cl_mem), &bufferSizeNeighborhood);
Interface.setKernelArg(kernel, 5, sizeof(cl_mem), &bufferSurfaceNormals);
Interface.setKernelArg(kernel, 6, sizeof(float), &r0);
Interface.setKernelArg(kernel, 7, sizeof(cl_float3), &bb_min_);
Interface.setKernelArg(kernel, 8, sizeof(cl_uint), &NumBuckets);
Interface.setKernelArg(kernel, 9, sizeof(cl_int3), &gridDim_);
Interface.setKernelArg(kernel, 10, sizeof(int), &NumParticles);
Statistics.add("SPH buffer stuff", toc(t_start), "time");
t_start = tic();
Interface.runKernel(kernel, cl_int2({ (NumParticles / Settings.Simulation.SPH.local_size + 1) * Settings.Simulation.SPH.local_size, 1 }), cl_int2({ Settings.Simulation.SPH.local_size, 1 }));
kernel = Interface.getKernel("SPH_calc_forcef2");
Interface.setKernelArg(kernel, 0, sizeof(cl_mem), &bufferAcc);
Interface.setKernelArg(kernel, 1, sizeof(cl_mem), &bufferPositions);
Interface.setKernelArg(kernel, 2, sizeof(cl_mem), &bufferVelocities);
Interface.setKernelArg(kernel, 3, sizeof(cl_mem), &bufferMasses);
Interface.setKernelArg(kernel, 4, sizeof(cl_mem), &bufferBucketRanges);
Interface.setKernelArg(kernel, 5, sizeof(cl_mem), &bufferDensities);
Interface.setKernelArg(kernel, 6, sizeof(float), &r0);
Interface.setKernelArg(kernel, 7, sizeof(cl_float3), &bb_min_);
Interface.setKernelArg(kernel, 8, sizeof(cl_uint), &NumBuckets);
Interface.setKernelArg(kernel, 9, sizeof(cl_int3), &gridDim_);
float viscosity = Settings.Simulation.SPH.viscosity, target_density = Settings.Simulation.SPH.target_density, stiffness = Settings.Simulation.SPH.stiffness;
Interface.setKernelArg(kernel, 10, sizeof(float), &viscosity);
Interface.setKernelArg(kernel, 11, sizeof(float), &target_density);
Interface.setKernelArg(kernel, 12, sizeof(float), &stiffness);
Interface.setKernelArg(kernel, 13, sizeof(int), &NumParticles);
clFinish(Interface.OCL_commandQueue);
Statistics.add("SPH density kernel", toc(t_start), "time");
t_start = tic();
Interface.runKernel(kernel, cl_int2({ (NumParticles / Settings.Simulation.SPH.local_size + 1) * Settings.Simulation.SPH.local_size, 1 }), cl_int2({ Settings.Simulation.SPH.local_size, 1 }));
clFinish(Interface.OCL_commandQueue);
Statistics.add("SPH force kernel", toc(t_start), "time");
t_start = tic();
Interface.readBufferOCL("Acc", acc.data(), sizeof(glm::vec3) * NumParticles);
NeighborhoodCount.resize(NumParticles);
Interface.readBufferOCL("SizeNeighborhood", NeighborhoodCount.data(), sizeof(int) * NumParticles);
std::vector<glm::vec3> SurfaceNormals(NumParticles);
Interface.readBufferOCL("SurfaceNormals", SurfaceNormals.data(), sizeof(glm::vec3) * NumParticles);
if (AutoSmoothingRadius)
{
int sum = NeighborhoodCount[0];
for (int n = 1; n < NumParticles; n++)
sum += NeighborhoodCount[n];
double meanNeighbors = double(sum) / NumParticles;
if (meanNeighbors < 0.8 * Settings.Simulation.SPH.TargetNeighbors || meanNeighbors > 1.2 * Settings.Simulation.SPH.TargetNeighbors)
{
double factor = meanNeighbors < Settings.Simulation.SPH.TargetNeighbors ? 1.1 : 0.9;
Settings.Simulation.SPH.smoothing_radius *= factor;
}
}
// Bring acc in original order
static std::vector<glm::dvec3> Tempd(NumParticles);
Tempd.resize(NumParticles);
Particles.SurfaceNormals.resize(NumParticles);
for (int n = 0; n < NumParticles; n++)
{
Tempd[Indices[n]] = acc[n];
if (Settings.View.ThresholdParticleInBody * Settings.Simulation.SPH.TargetNeighbors > NeighborhoodCount[n]
&& Settings.View.ThresholdParticleOnSurface * Settings.Simulation.SPH.TargetNeighbors < NeighborhoodCount[n])
Particles.SurfaceNormals[Indices[n]] = SurfaceNormals[n];
else
Particles.SurfaceNormals[Indices[n]] = glm::vec3(0.0f, 0.0f, 0.0f);
}
Statistics.add("SPH the rest", toc(t_start), "time");
return Tempd;
}
On the GPU side I use the following two OpenCL kernels
__kernel void SPH_calc_density_kernelf2(__global const float* Pos, __global const float* Mass, __global const int* BucketRanges,
__global float* density, __global int* SizeNeighborhood, __global float* SurfaceNormals, float r0, float3 bb_min, uint NumBuckets, int3 gridDim, int NumParticles)
{
if (get_global_id(0) > NumParticles)
return;
int particle = get_global_id(0), NeighborhoodCount = 0;
float3 Pos_particle = (float3)(Pos[particle * 3 + 0], Pos[particle * 3 + 1], Pos[particle * 3 + 2]);
int3 idx = convert_int3((Pos_particle - bb_min) / r0);
float current_density = 0, r0_sq = r0 * r0;
int visited_buckets[27] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
float3 centerOfMass_neighbors = { 0.0f, 0.0f, 0.0f };
// Iterate only over neighboring grid cells
for (int n = 0; n < 27; n++)
{
int3 idx2 = idx + neighborOffsets[n];
if (!(idx2.x >= 0 && idx2.x < gridDim.x && idx2.y >= 0 && idx2.y < gridDim.y && idx2.z >= 0 && idx2.z < gridDim.z)) continue;
int Bucket = (idx2.x * 73856093 ^ idx2.y * 19349663 ^ idx2.z * 83492791) % NumBuckets;
bool already_visited = false;
for (int m = 0; m < n; m++)
{
if (visited_buckets[m] == Bucket)
{
already_visited = true;
break;
}
}
if (already_visited) continue;
visited_buckets[n] = Bucket;
int BucketStart = BucketRanges[2 * Bucket], BucketEnd = BucketRanges[2 * Bucket + 1];
for (int neighbor = BucketStart; neighbor < BucketEnd; neighbor++)
{
float3 Pos_neighbor = (float3)(Pos[neighbor * 3 + 0], Pos[neighbor * 3 + 1], Pos[neighbor * 3 + 2]);
float3 delta = Pos_particle - Pos_neighbor;
float r_sq = dot(delta, delta);
current_density += Mass[neighbor] * kernelFunf(r_sq, r0);
if (r_sq < r0_sq && r_sq > 0.0f)
{
NeighborhoodCount++;
centerOfMass_neighbors += Pos_neighbor;
}
}
}
density[particle] = current_density;
SizeNeighborhood[particle] = NeighborhoodCount;
float3 Normal = (Pos_particle - (centerOfMass_neighbors / (float)NeighborhoodCount));
SurfaceNormals[3 * particle + 0] = Normal.x;
SurfaceNormals[3 * particle + 1] = Normal.y;
SurfaceNormals[3 * particle + 2] = Normal.z;
}
__kernel void SPH_calc_forcef2(__global float* Acc, __global const float* Pos, __global const float* Vel, __global const float* Mass, __global const int* BucketRanges,
__global float* density, float r0, float3 bb_min, uint NumBuckets, int3 gridDim, float viscosity, float target_density, float stiffness, int NumParticles)
{
if (get_global_id(0) > NumParticles)
return;
int particle = get_global_id(0);
float3 Pos_particle = (float3)(Pos[particle * 3 + 0], Pos[particle * 3 + 1], Pos[particle * 3 + 2]);
float3 Vel_particle = (float3)(Vel[particle * 3 + 0], Vel[particle * 3 + 1], Vel[particle * 3 + 2]);
int3 idx = convert_int3((Pos_particle - bb_min) / r0);
float3 current_acc = -0.0f;
float density_particle = density[particle];
float pressure_particle = max(0.0f, stiffness * (density_particle - target_density));
float Mass_particle_inv = 1.0f / Mass[particle];
int visited_buckets[27] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
// Iterate only over neighboring grid cells
for (int n = 0; n < 27; n++)
{
int3 idx2 = idx + neighborOffsets[n];
if (!(idx2.x >= 0 && idx2.x < gridDim.x && idx2.y >= 0 && idx2.y < gridDim.y && idx2.z >= 0 && idx2.z < gridDim.z)) continue;
int Bucket = (idx2.x * 73856093 ^ idx2.y * 19349663 ^ idx2.z * 83492791) % NumBuckets;
bool already_visited = false;
for (int m = 0; m < n; m++)
{
if (visited_buckets[m] == Bucket)
{
already_visited = true;
break;
}
}
if (already_visited) continue;
visited_buckets[n] = Bucket;
int BucketStart = BucketRanges[2 * Bucket], BucketEnd = BucketRanges[2 * Bucket + 1];
for (int neighbor = BucketStart; neighbor < BucketEnd; neighbor++)
{
float3 Pos_neighbor = (float3)(Pos[neighbor * 3 + 0], Pos[neighbor * 3 + 1], Pos[neighbor * 3 + 2]);
float3 diff = Pos_particle - Pos_neighbor;
float r_sq = dot(diff, diff);
if (r_sq == 0.0f || r_sq > r0 * r0) continue;
float3 Vel_neighbor = (float3)(Vel[neighbor * 3 + 0], Vel[neighbor * 3 + 1], Vel[neighbor * 3 + 2]);
float density_neighbor = density[neighbor];
float Mass_neighbor = Mass[neighbor];
float pressure_neighbor = max(0.0f, stiffness * (density_neighbor - target_density));
float r = sqrt(r_sq);
float pressure_force = -Mass_neighbor * (pressure_particle + pressure_neighbor) / (2.0 * density_neighbor) * kernelFun_derivativef(r, r0);
float3 visc_force = viscosity * Mass_neighbor * (Vel_neighbor - Vel_particle) / density_neighbor * kernelFun_laplacianf(r, r0);
current_acc += /*Mass_particle_inv * */(pressure_force * diff / r + visc_force);
}
}
Acc[particle * 3 + 0] = current_acc.x;
Acc[particle * 3 + 1] = current_acc.y;
Acc[particle * 3 + 2] = current_acc.z;
}
with these helper functions:
inline float kernelFunf(float r, float r0)
{
float r0_pow9 = r0 * r0;
r0_pow9 *= r0_pow9;
r0_pow9 *= r0_pow9;
r0_pow9 *= r0;
float factor = (315.0 / (64.0 * M_PI * r0_pow9));
float temp = r0 * r0 - r;
float term = temp * temp * temp;
return max(0.0f, factor * term);
}
inline float kernelFun_derivativef(float r, float r0)
{
float r_pow6 = r0 * r0 * r0;
r_pow6 *= r_pow6;
float factor = (-45.0 / (M_PI * r_pow6));
return factor * (r0 - r) * (r0 - r);
}
inline float kernelFun_laplacianf(float r, float r0)
{
float r_pow6 = r0 * r0 * r0;
r_pow6 *= r_pow6;
float factor = (45.0 / (M_PI * r_pow6));
return factor * (r0 - r);
}
__constant int3 neighborOffsets[27] =
{
{ -1, -1, -1 }, { 0, -1, -1 }, { 1, -1, -1 },
{ -1, 0, -1 }, { 0, 0, -1 }, { 1, 0, -1 },
{ -1, 1, -1 }, { 0, 1, -1 }, { 1, 1, -1 },
{ -1, -1, 0 }, { 0, -1, 0 }, { 1, -1, 0 },
{ -1, 0, 0 }, { 0, 0, 0 }, { 1, 0, 0 },
{ -1, 1, 0 }, { 0, 1, 0 }, { 1, 1, 0 },
{ -1, -1, 1 }, { 0, -1, 1 }, { 1, -1, 1 },
{ -1, 0, 1 }, { 0, 0, 1 }, { 1, 0, 1 },
{ -1, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 }
};
inline int getBucket(int3 idx, int* numel, uint NumBuckets, uint BucketSize, __global const int* BucketCount)
{
uint Bucket = (idx.x * 73856093 ^ idx.y * 19349663 ^ idx.z * 83492791) % NumBuckets;
*numel = BucketCount[Bucket];
return Bucket * BucketSize;
}
Can anybody give me any hints on how to optimize this? Ideally I need a performance boost of >4x. Any help is appreaceated.
Edit: Added full C++ function for OpenCL call
Posare stridded. Generally GPUs support packed loads for better performance (see an example here) but it will certainly not results in a x4 boost. There are also some minor math improvement you can do. I guess the key is to use SHM or make accesses more contiguous to reduce the memory overhead.