/* all force pair computations Device code. force computations are based on GEMS3 implimentation by Nyland and trimmed version of nbody in SDK The integrator has been modified to be leapfrog. I have also added a kick step routine useful for symplectic integrators. */ #ifndef _NBODY_KERNEL_H_ #define _NBODY_KERNEL_H_ #include #include #define LOOP_UNROLL 2 // crashes if larger than 2 //__constant__ float softeningSquared; // currently not used /////////////////////////////////////////////////////////// // soft2 is softening squared // add acceleration to ai on particle bi from particle bj // mass for bj is stored in bj.w __device__ float3 bodyBodyInteraction(float3 ai, float4 bi, float4 bj, float soft2) { float3 r; // r_ij [3 FLOPS] I switched sign here from that in SDK r.x = bj.x - bi.x; r.y = bj.y - bi.y; r.z = bj.z - bi.z; // distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS] float distSqr = r.x * r.x + r.y * r.y + r.z * r.z; distSqr += soft2; // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] float invDist = rsqrtf(distSqr); float invDistCube = invDist * invDist * invDist; // m_j * invDistCube [1 FLOP] float s = bj.w * invDistCube; // a_i = a_i + s * r_ij [6 FLOPS] ai.x += r.x * s; ai.y += r.y * s; ai.z += r.z * s; return ai; } ///////////////////////////////////////////////////////// // add forces from every particle looping through the blockdim // This is the "tile_calculation" function from the GPUG3 article. // howerver SDK version had arguments flipped ///////////////////////////////////////////////////////// __device__ float3 gravitation(float4 myPos, float3 accel, float soft2) { extern __shared__ float4 sharedPos[]; int i=0; // Here we unroll the loop // what if blockDim.x is not divisible by LOOP_UNROLL? // typically blockDim.x is 256 threads and no problem // for reasons I can't figure out this routine crashes when looproll >2 for (i = 0; i < blockDim.x; ) { accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); // I think flipped in SDK? #if LOOP_UNROLL > 1 accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); #endif #if LOOP_UNROLL > 2 // always crashes if this is true! accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); #endif #if LOOP_UNROLL > 4 accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); accel = bodyBodyInteraction(accel,myPos,sharedPos[i++],soft2); #endif } return accel; } // WRAP is used to force each block to start working on a different // chunk (and wrap around back to the beginning of the array) so that // not all multiprocessors try to read the same memory locations at // once. #define WRAP(x,m) (((x) numBodiesMassive. // the computeBodyAccel does not use the blockIdx // Let // numBodiestotal = blockDim.x*gridDim.x which is larger than numBodiesMassive // then all particles with index > numBodiesMassive will not be included // in force computation if (index ==0){ // don't ever kick the central mass accel.x = 0.0; accel.y = 0.0; accel.z = 0.0; } // update velocity vel.x += accel.x * deltaTime; vel.y += accel.y * deltaTime; vel.z += accel.z * deltaTime; // should be 1.0 if no damping // vel.x *= damping; vel.y *= damping; vel.z *= damping; // store position and new velocity newPos[index] = pos; newVel[index] = vel; } /////////////////////////////////////////////////////////// // change the central mass in device global mem // useful for bary centric computations that leave the first coordinate /////////////////////////////////////////////////////////// __global__ void changem0(float4* Pos, float m0) { Pos[0].w = m0; } #endif // #ifndef _NBODY_KERNEL_H_