1

I implemented a N-body simulation that combines gravity and SPH (smoothed particle hydrodynamics). I want to optimize the SPH part. I use spacial hashing for the neighborhood search. On the host side (the actual calculations are done on the GPU with OpenCL) I use this C++ code (calculate hashes and sort the particles by it - this part uses basically no CPU time, so no optimization necessary here):

std::vector<glm::dvec3> SPH_GPUf2(double t, const std::vector<glm::dvec3>* Pos, const std::vector<glm::dvec3>* Vel, const std::vector<double>* Mass, const std::vector<char>* Charges)
{
    auto t_start = tic();
    int NumParticles = Pos->size();
    static std::vector<int> NeighborhoodCount;
    std::vector<glm::vec3> Positions(NumParticles);
    std::vector<glm::vec3> Velocities(NumParticles);
    std::vector<float> Masses(NumParticles);
    cl_bool AutoSmoothingRadius = Settings.Simulation.SPH.AutoSmoothingRadius;

    for (int n = 0; n < NumParticles; n++)
    {
        Positions[n] = (*Pos)[n];
        Velocities[n] = (*Vel)[n];
        Masses[n] = (*Mass)[n];
    }

    float r0 = Settings.Simulation.SPH.smoothing_radius;
    float r0inv = 1.0f / r0;
    int NumBuckets = Settings.Simulation.SPH.NumBuckets;

    std::vector<glm::vec3> acc(NumParticles);

    // Calc bounding box
    glm::vec3 bb_min = Positions[0], bb_max = Positions[0];
    for (int n = 1; n < NumParticles; n++)
    {
        bb_min.x = std::min(bb_min.x, Positions[n].x);
        bb_max.x = std::max(bb_max.x, Positions[n].x);
        bb_min.y = std::min(bb_min.y, Positions[n].y);
        bb_max.y = std::max(bb_max.y, Positions[n].y);
        bb_min.z = std::min(bb_min.z, Positions[n].z);
        bb_max.z = std::max(bb_max.z, Positions[n].z);
    }

    // Calculate and sort particles by hash value
    glm::ivec3 gridDim;
    gridDim.x = static_cast<int>(std::ceil((bb_max.x - bb_min.x) * r0inv));
    gridDim.y = static_cast<int>(std::ceil((bb_max.y - bb_min.y) * r0inv));
    gridDim.z = static_cast<int>(std::ceil((bb_max.z - bb_min.z) * r0inv));

    std::vector<unsigned int> HashValues(NumParticles);
    for (size_t i = 0; i < NumParticles; ++i)
    {
        const glm::vec3& pos = Positions[i];

        unsigned int ix = static_cast<unsigned int>(std::floor((pos.x - bb_min.x) * r0inv));
        unsigned int iy = static_cast<unsigned int>(std::floor((pos.y - bb_min.y) * r0inv));
        unsigned int iz = static_cast<unsigned int>(std::floor((pos.z - bb_min.z) * r0inv));

        HashValues[i] = (ix * 73856093u ^ iy * 19349663u ^ iz * 83492791u) % (unsigned int)NumBuckets;
    }

    // Index array for sorting
    std::vector<int> Indices = sort_with_indices(HashValues);

    // Sort particles
    std::vector<glm::vec3> Temp(NumParticles), Temp3(NumParticles);
    std::vector<float> Temp2(NumParticles);

    for (size_t i = 0; i < NumParticles; i++)
    {
        Temp[i] = Positions[Indices[i]];
        Temp3[i] = Velocities[Indices[i]];
        Temp2[i] = Masses[Indices[i]];
    }
    Positions = std::move(Temp);
    Masses = std::move(Temp2);
    Velocities = std::move(Temp3);

    // Calculate bucket ranges
    std::vector<int> bucketRanges(2 * NumBuckets, -1);

    for (int i = 0; i < NumParticles; i++)
    {
        uint32_t bucket = HashValues[i];

        // If this is the first entry, store it
        if (bucketRanges[2 * bucket] == -1)
            bucketRanges[2 * bucket] = i;

        // Update end index
        bucketRanges[2 * bucket + 1] = i + 1;
    }
    Statistics.add("SPH prepare neighbor search", toc(t_start), "time");
    t_start = tic();

    // Calc density
    cl_kernel kernel = Interface.getKernel("SPH_calc_density_kernelf2");

    Interface.checkBufferSize("Positions", sizeof(glm::vec3) * NumParticles);
    Interface.checkBufferSize("Velocities", sizeof(glm::vec3) * NumParticles);
    Interface.checkBufferSize("Masses", sizeof(float) * NumParticles);
    Interface.checkBufferSize("Acc", sizeof(glm::vec3) * NumParticles);
    Interface.checkBufferSize("BucketRanges", sizeof(int) * NumBuckets * 2);
    Interface.checkBufferSize("Densities", sizeof(float) * NumParticles);
    Interface.checkBufferSize("SurfaceNormals", sizeof(glm::vec3) * NumParticles);
    Interface.checkBufferSize("SizeNeighborhood", sizeof(int) * NumParticles);
    Interface.writeBufferOCL("Positions", Positions.data(), sizeof(glm::vec3) * NumParticles);
    Interface.writeBufferOCL("Velocities", Velocities.data(), sizeof(glm::vec3) * NumParticles);
    Interface.writeBufferOCL("Masses", Masses.data(), sizeof(float) * NumParticles);
    int zero = 0;

    Interface.writeBufferOCL("BucketRanges", bucketRanges.data(), sizeof(int) * NumBuckets * 2);

    cl_mem bufferPositions = Interface.getBufferOCL("Positions");
    cl_mem bufferVelocities = Interface.getBufferOCL("Velocities");
    cl_mem bufferMasses = Interface.getBufferOCL("Masses");
    cl_mem bufferAcc = Interface.getBufferOCL("Acc");
    cl_mem bufferDensities = Interface.getBufferOCL("Densities");
    cl_mem bufferBucketRanges = Interface.getBufferOCL("BucketRanges");
    cl_mem bufferSurfaceNormals = Interface.getBufferOCL("SurfaceNormals");
    cl_mem bufferSizeNeighborhood = Interface.getBufferOCL("SizeNeighborhood");

    cl_float3 bb_min_;
    bb_min_.x = bb_min.x;
    bb_min_.y = bb_min.y;
    bb_min_.z = bb_min.z;
    cl_int3 gridDim_;
    gridDim_.x = gridDim.x;
    gridDim_.y = gridDim.y;
    gridDim_.z = gridDim.z;

    Interface.setKernelArg(kernel, 0, sizeof(cl_mem), &bufferPositions);
    Interface.setKernelArg(kernel, 1, sizeof(cl_mem), &bufferMasses);
    Interface.setKernelArg(kernel, 2, sizeof(cl_mem), &bufferBucketRanges);
    Interface.setKernelArg(kernel, 3, sizeof(cl_mem), &bufferDensities);
    Interface.setKernelArg(kernel, 4, sizeof(cl_mem), &bufferSizeNeighborhood);
    Interface.setKernelArg(kernel, 5, sizeof(cl_mem), &bufferSurfaceNormals);
    Interface.setKernelArg(kernel, 6, sizeof(float), &r0);
    Interface.setKernelArg(kernel, 7, sizeof(cl_float3), &bb_min_);
    Interface.setKernelArg(kernel, 8, sizeof(cl_uint), &NumBuckets);
    Interface.setKernelArg(kernel, 9, sizeof(cl_int3), &gridDim_);
    Interface.setKernelArg(kernel, 10, sizeof(int), &NumParticles);

    Statistics.add("SPH buffer stuff", toc(t_start), "time");
    t_start = tic();
    Interface.runKernel(kernel, cl_int2({ (NumParticles / Settings.Simulation.SPH.local_size + 1) * Settings.Simulation.SPH.local_size, 1 }), cl_int2({ Settings.Simulation.SPH.local_size, 1 }));

    kernel = Interface.getKernel("SPH_calc_forcef2");

    Interface.setKernelArg(kernel, 0, sizeof(cl_mem), &bufferAcc);
    Interface.setKernelArg(kernel, 1, sizeof(cl_mem), &bufferPositions);
    Interface.setKernelArg(kernel, 2, sizeof(cl_mem), &bufferVelocities);
    Interface.setKernelArg(kernel, 3, sizeof(cl_mem), &bufferMasses);
    Interface.setKernelArg(kernel, 4, sizeof(cl_mem), &bufferBucketRanges);
    Interface.setKernelArg(kernel, 5, sizeof(cl_mem), &bufferDensities);
    Interface.setKernelArg(kernel, 6, sizeof(float), &r0);
    Interface.setKernelArg(kernel, 7, sizeof(cl_float3), &bb_min_);
    Interface.setKernelArg(kernel, 8, sizeof(cl_uint), &NumBuckets);
    Interface.setKernelArg(kernel, 9, sizeof(cl_int3), &gridDim_);
    float viscosity = Settings.Simulation.SPH.viscosity, target_density = Settings.Simulation.SPH.target_density, stiffness = Settings.Simulation.SPH.stiffness;
    Interface.setKernelArg(kernel, 10, sizeof(float), &viscosity);
    Interface.setKernelArg(kernel, 11, sizeof(float), &target_density);
    Interface.setKernelArg(kernel, 12, sizeof(float), &stiffness);
    Interface.setKernelArg(kernel, 13, sizeof(int), &NumParticles);

    clFinish(Interface.OCL_commandQueue);
    Statistics.add("SPH density kernel", toc(t_start), "time");
    t_start = tic();
    Interface.runKernel(kernel, cl_int2({ (NumParticles / Settings.Simulation.SPH.local_size + 1) * Settings.Simulation.SPH.local_size, 1 }), cl_int2({ Settings.Simulation.SPH.local_size, 1 }));

    clFinish(Interface.OCL_commandQueue);
    Statistics.add("SPH force kernel", toc(t_start), "time");
    t_start = tic();

    Interface.readBufferOCL("Acc", acc.data(), sizeof(glm::vec3) * NumParticles);

    NeighborhoodCount.resize(NumParticles);
    Interface.readBufferOCL("SizeNeighborhood", NeighborhoodCount.data(), sizeof(int) * NumParticles);
    std::vector<glm::vec3> SurfaceNormals(NumParticles);
    Interface.readBufferOCL("SurfaceNormals", SurfaceNormals.data(), sizeof(glm::vec3) * NumParticles);

    if (AutoSmoothingRadius)
    {
        int sum = NeighborhoodCount[0];
        for (int n = 1; n < NumParticles; n++)
            sum += NeighborhoodCount[n];
        double meanNeighbors = double(sum) / NumParticles;
        if (meanNeighbors < 0.8 * Settings.Simulation.SPH.TargetNeighbors || meanNeighbors > 1.2 * Settings.Simulation.SPH.TargetNeighbors)
        {
            double factor = meanNeighbors < Settings.Simulation.SPH.TargetNeighbors ? 1.1 : 0.9;

            Settings.Simulation.SPH.smoothing_radius *= factor;
        }
    }

    // Bring acc in original order
    static std::vector<glm::dvec3> Tempd(NumParticles);
    Tempd.resize(NumParticles);
    Particles.SurfaceNormals.resize(NumParticles);
    for (int n = 0; n < NumParticles; n++)
    {
        Tempd[Indices[n]] = acc[n];
        if (Settings.View.ThresholdParticleInBody * Settings.Simulation.SPH.TargetNeighbors > NeighborhoodCount[n]
            && Settings.View.ThresholdParticleOnSurface * Settings.Simulation.SPH.TargetNeighbors < NeighborhoodCount[n])
            Particles.SurfaceNormals[Indices[n]] = SurfaceNormals[n];
        else
            Particles.SurfaceNormals[Indices[n]] = glm::vec3(0.0f, 0.0f, 0.0f);
    }

    Statistics.add("SPH the rest", toc(t_start), "time");
    return Tempd;
}

On the GPU side I use the following two OpenCL kernels

__kernel void SPH_calc_density_kernelf2(__global const float* Pos, __global const float* Mass, __global const int* BucketRanges,
    __global float* density, __global int* SizeNeighborhood, __global float* SurfaceNormals, float r0, float3 bb_min, uint NumBuckets, int3 gridDim, int NumParticles)
{
    if (get_global_id(0) > NumParticles)
        return;

    int particle = get_global_id(0), NeighborhoodCount = 0;
    float3 Pos_particle = (float3)(Pos[particle * 3 + 0], Pos[particle * 3 + 1], Pos[particle * 3 + 2]);
    int3 idx = convert_int3((Pos_particle - bb_min) / r0);
    float current_density = 0, r0_sq = r0 * r0;
    int visited_buckets[27] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
    float3 centerOfMass_neighbors = { 0.0f, 0.0f, 0.0f };

    // Iterate only over neighboring grid cells
    for (int n = 0; n < 27; n++)
    {
        int3 idx2 = idx + neighborOffsets[n];
        if (!(idx2.x >= 0 && idx2.x < gridDim.x && idx2.y >= 0 && idx2.y < gridDim.y && idx2.z >= 0 && idx2.z < gridDim.z)) continue;

        int Bucket = (idx2.x * 73856093 ^ idx2.y * 19349663 ^ idx2.z * 83492791) % NumBuckets;
        bool already_visited = false;
        for (int m = 0; m < n; m++)
        {
            if (visited_buckets[m] == Bucket)
            {
                already_visited = true;
                break;
            }
        }
        if (already_visited) continue;
        visited_buckets[n] = Bucket;
        int BucketStart = BucketRanges[2 * Bucket], BucketEnd = BucketRanges[2 * Bucket + 1];

        for (int neighbor = BucketStart; neighbor < BucketEnd; neighbor++)
        {
            float3 Pos_neighbor = (float3)(Pos[neighbor * 3 + 0], Pos[neighbor * 3 + 1], Pos[neighbor * 3 + 2]);
            float3 delta = Pos_particle - Pos_neighbor;
            float r_sq = dot(delta, delta);
            current_density += Mass[neighbor] * kernelFunf(r_sq, r0);
            if (r_sq < r0_sq && r_sq > 0.0f)
            {
                NeighborhoodCount++;
                centerOfMass_neighbors += Pos_neighbor;
            }
        }
    }

    density[particle] = current_density;
    SizeNeighborhood[particle] = NeighborhoodCount;
    float3 Normal = (Pos_particle - (centerOfMass_neighbors / (float)NeighborhoodCount));
    SurfaceNormals[3 * particle + 0] = Normal.x;
    SurfaceNormals[3 * particle + 1] = Normal.y;
    SurfaceNormals[3 * particle + 2] = Normal.z;
}

__kernel void SPH_calc_forcef2(__global float* Acc, __global const float* Pos, __global const float* Vel, __global const float* Mass, __global const int* BucketRanges,
    __global float* density, float r0, float3 bb_min, uint NumBuckets, int3 gridDim, float viscosity, float target_density, float stiffness, int NumParticles)
{
    if (get_global_id(0) > NumParticles)
        return;

    int particle = get_global_id(0);
    float3 Pos_particle = (float3)(Pos[particle * 3 + 0], Pos[particle * 3 + 1], Pos[particle * 3 + 2]);
    float3 Vel_particle = (float3)(Vel[particle * 3 + 0], Vel[particle * 3 + 1], Vel[particle * 3 + 2]);
    int3 idx = convert_int3((Pos_particle - bb_min) / r0);
    float3 current_acc = -0.0f;
    float density_particle = density[particle];
    float pressure_particle = max(0.0f, stiffness * (density_particle - target_density));
    float Mass_particle_inv = 1.0f / Mass[particle];
    int visited_buckets[27] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };

    // Iterate only over neighboring grid cells
    for (int n = 0; n < 27; n++)
    {
        int3 idx2 = idx + neighborOffsets[n];
        if (!(idx2.x >= 0 && idx2.x < gridDim.x && idx2.y >= 0 && idx2.y < gridDim.y && idx2.z >= 0 && idx2.z < gridDim.z)) continue;

        int Bucket = (idx2.x * 73856093 ^ idx2.y * 19349663 ^ idx2.z * 83492791) % NumBuckets;
        bool already_visited = false;
        for (int m = 0; m < n; m++)
        {
            if (visited_buckets[m] == Bucket)
            {
                already_visited = true;
                break;
            }
        }
        if (already_visited) continue;
        visited_buckets[n] = Bucket;
        int BucketStart = BucketRanges[2 * Bucket], BucketEnd = BucketRanges[2 * Bucket + 1];

        for (int neighbor = BucketStart; neighbor < BucketEnd; neighbor++)
        {
            float3 Pos_neighbor = (float3)(Pos[neighbor * 3 + 0], Pos[neighbor * 3 + 1], Pos[neighbor * 3 + 2]);

            float3 diff = Pos_particle - Pos_neighbor;
            float r_sq = dot(diff, diff);

            if (r_sq == 0.0f || r_sq > r0 * r0) continue;

            float3 Vel_neighbor = (float3)(Vel[neighbor * 3 + 0], Vel[neighbor * 3 + 1], Vel[neighbor * 3 + 2]);
            float density_neighbor = density[neighbor];
            float Mass_neighbor = Mass[neighbor];
            float pressure_neighbor = max(0.0f, stiffness * (density_neighbor - target_density));

            float r = sqrt(r_sq);

            float pressure_force = -Mass_neighbor * (pressure_particle + pressure_neighbor) / (2.0 * density_neighbor) * kernelFun_derivativef(r, r0);
            float3 visc_force = viscosity * Mass_neighbor * (Vel_neighbor - Vel_particle) / density_neighbor * kernelFun_laplacianf(r, r0);

            current_acc += /*Mass_particle_inv * */(pressure_force * diff / r + visc_force);
        }
    }

    Acc[particle * 3 + 0] = current_acc.x;
    Acc[particle * 3 + 1] = current_acc.y;
    Acc[particle * 3 + 2] = current_acc.z;
}

with these helper functions:

inline float kernelFunf(float r, float r0)
{
    float r0_pow9 = r0 * r0;
    r0_pow9 *= r0_pow9;
    r0_pow9 *= r0_pow9;
    r0_pow9 *= r0;
    float factor = (315.0 / (64.0 * M_PI * r0_pow9));
    float temp = r0 * r0 - r;
    float term = temp * temp * temp;
    return max(0.0f, factor * term);
}

inline float kernelFun_derivativef(float r, float r0)
{
    float r_pow6 = r0 * r0 * r0;
    r_pow6 *= r_pow6;
    float factor = (-45.0 / (M_PI * r_pow6));
    return factor * (r0 - r) * (r0 - r);
}

inline float kernelFun_laplacianf(float r, float r0)
{
    float r_pow6 = r0 * r0 * r0;
    r_pow6 *= r_pow6;
    float factor = (45.0 / (M_PI * r_pow6));
    return factor * (r0 - r);
}

__constant int3 neighborOffsets[27] =
{
    { -1, -1, -1 }, {  0, -1, -1 }, {  1, -1, -1 },
    { -1,  0, -1 }, {  0,  0, -1 }, {  1,  0, -1 },
    { -1,  1, -1 }, {  0,  1, -1 }, {  1,  1, -1 },

    { -1, -1,  0 }, {  0, -1,  0 }, {  1, -1,  0 },
    { -1,  0,  0 }, {  0,  0,  0 }, {  1,  0,  0 },
    { -1,  1,  0 }, {  0,  1,  0 }, {  1,  1,  0 },

    { -1, -1,  1 }, {  0, -1,  1 }, {  1, -1,  1 },
    { -1,  0,  1 }, {  0,  0,  1 }, {  1,  0,  1 },
    { -1,  1,  1 }, {  0,  1,  1 }, {  1,  1,  1 }
};

inline int getBucket(int3 idx, int* numel, uint NumBuckets, uint BucketSize, __global const int* BucketCount)
{
    uint Bucket = (idx.x * 73856093 ^ idx.y * 19349663 ^ idx.z * 83492791) % NumBuckets;
    *numel = BucketCount[Bucket];

    return Bucket * BucketSize;
}

Can anybody give me any hints on how to optimize this? Ideally I need a performance boost of >4x. Any help is appreaceated.

Edit: Added full C++ function for OpenCL call

14
  • 2
    It is hard to help you without a complete code (something we can run). We do not know which parts are the most expensive. We also do not know what is the target hardware nor the actual size of the dataset(s). We also do not have any profiling information so we can only make guesses. Commented May 14 at 18:09
  • 1
    Overall, the memory access pattern in global memory seem inefficient. For example the ones to Pos are stridded. Generally GPUs support packed loads for better performance (see an example here) but it will certainly not results in a x4 boost. There are also some minor math improvement you can do. I guess the key is to use SHM or make accesses more contiguous to reduce the memory overhead. Commented May 14 at 18:17
  • 2
    But, first thing fist: please profile the code and report the result here so to better know the overheads (and guide the optimisation to perform), or please provide a minimal reproducible example. Commented May 14 at 18:18
  • I included the full C++-Function for the OpenCL call(s). The goal is a simulation with as many particles as possible. >1Mio particles work fine (depending on parameters like error tolerance, target density, etc.). The target hardware should not matter (it should run on more or less any computer), but right now I'm running this on an AMD Ryzen 9 7940HS w/ Radeon 780M iGPU. Unfortunately I don't know how to profile OpenCL code. When I time the OpenCL kernels, the force-kernel takes about twice as long as the density-kernel (and together they take 99% of the time). That is all I can tell you... Commented May 14 at 18:57
  • Ok it help a bit more but profiling information should help much more. Regarding the code, we need something we can run to profile it or it is not much useful otherwise. I am not very familiar with AMD but I think you can use AMD Radeon GPU profiler for profiling. Commented May 14 at 22:34

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.