/* All pairwise force computations.
   Device code. The force computations are based on the GPU Gems 3
   implementation by Nyland and on a trimmed version of the nbody sample
   in the SDK.
   The integrator has been modified to be leapfrog.
   I have also added a kick step routine useful for symplectic integrators.
 */

#ifndef _NBODY_KERNEL_H_
#define _NBODY_KERNEL_H_

#include <math.h>
#include <stdio.h>

// Unroll factor for the inner loop of gravitation().
#define LOOP_UNROLL 2
// The original author reported crashes for values larger than 2;
// the cause was not established.

//__constant__ float softeningSquared;  // currently not used

///////////////////////////////////////////////////////////
// Accumulate onto ai the softened acceleration that body bj exerts on
// body bi, and return the updated accumulator.
//   ai    - running acceleration of body i
//   bi    - body i; only x,y,z (position) are read
//   bj    - body j; position in x,y,z, mass in w
//   soft2 - softening squared, added to the squared separation
// A pair whose softened squared distance is not positive (self-interaction
// or coincident bodies with soft2 == 0) contributes nothing; without this
// guard rsqrtf(0) = inf and 0 * inf = NaN would poison the accumulator.
///////////////////////////////////////////////////////////
__device__ float3
bodyBodyInteraction(float3 ai, float4 bi, float4 bj, float soft2)
{
    // r_ij points from body i to body j (sign flipped relative to the SDK)
    const float dx = bj.x - bi.x;
    const float dy = bj.y - bi.y;
    const float dz = bj.z - bi.z;

    // softened squared distance: dot(r_ij, r_ij) + soft2
    const float distSqr = dx * dx + dy * dy + dz * dz + soft2;

    // 1/dist, or 0 when the softened distance vanishes so the pair is skipped
    const float invDist = (distSqr > 0.0f) ? rsqrtf(distSqr) : 0.0f;
    const float invDistCube = invDist * invDist * invDist;

    // m_j / dist^3
    const float s = bj.w * invDistCube;

    // a_i += s * r_ij
    ai.x += dx * s;
    ai.y += dy * s;
    ai.z += dz * s;

    return ai;
}

/////////////////////////////////////////////////////////
// Add to accel the acceleration on the body at myPos from every body held
// in the shared-memory tile sharedPos[0 .. blockDim.x-1], and return the sum.
// This is the "tile_calculation" function from the GPU Gems 3 article;
// the SDK version had the arguments flipped.
//   myPos - body being accelerated (position in x,y,z)
//   accel - running acceleration to add to
//   soft2 - softening squared, forwarded to bodyBodyInteraction
// Requires the dynamic shared-memory array to hold at least blockDim.x
// float4 entries, all written before the call.
// The main loop is unrolled LOOP_UNROLL times; a remainder loop handles
// block sizes that are not a multiple of LOOP_UNROLL, so the tile is never
// read past its end. Bodies are accumulated in index order.
/////////////////////////////////////////////////////////
__device__ float3 gravitation(float4 myPos, float3 accel, float soft2)
{
    extern __shared__ float4 sharedPos[];

    // guard against a non-positive LOOP_UNROLL so the modulo below is defined
    const int unroll = (LOOP_UNROLL > 0) ? LOOP_UNROLL : 1;
    const int tileSize = blockDim.x;
    const int unrolledEnd = tileSize - (tileSize % unroll);

    int j = 0;

    // unrolled part: exactly `unroll` interactions per pass
    for (; j < unrolledEnd; j += unroll)
    {
#pragma unroll
        for (int k = 0; k < unroll; ++k)
        {
            accel = bodyBodyInteraction(accel, myPos, sharedPos[j + k], soft2);
        }
    }

    // remainder when blockDim.x is not a multiple of the unroll factor
    for (; j < tileSize; ++j)
    {
        accel = bodyBodyInteraction(accel, myPos, sharedPos[j], soft2);
    }

    return accel;
}

// WRAP is used to force each block to start working on a different
// chunk (and wrap around back to the beginning of the array) so that
// not all multiprocessors try to read the same memory locations at
// once.
// Mod without divide: yields x when x < m and x - m otherwise, so it is a
// true modulo only for 0 <= x < 2m. Both arguments are fully parenthesized
// so expression arguments expand correctly. Arguments may be evaluated more
// than once, so do not pass expressions with side effects.
#define WRAP(x,m) (((x)<(m))?(x):((x)-(m)))

////////////////////////////////////////////////////////////////
// Sum the acceleration on the body at bodyPos from the first numBodies
// entries of positions (derived from Nyland's code in the CUDA 1.1 SDK).
//   bodyPos   - body being accelerated (position in x,y,z)
//   positions - global array of bodies (position in x,y,z, mass in w)
//   numBodies - number of leading entries of positions to include
//   soft2     - softening squared
// The bodies are processed in tiles of blockDim.x: each thread copies one
// body into shared memory, the block synchronizes, and every thread sums
// over the tile. Because of the barriers this must be called by every
// thread of the block, and the kernel must be launched with at least
// blockDim.x * sizeof(float4) bytes of dynamic shared memory.
// Each block starts on a different tile (blockIdx.x modulo the tile count)
// and wraps around, so that blocks do not all read the same memory at once.
// The wrap is over the tiles that cover numBodies, not over gridDim.x, so
// the sum covers exactly bodies [0, numBodies) even when the grid holds
// more bodies than numBodies.
////////////////////////////////////////////////////////////////
__device__ float3
computeBodyAccel(float4 bodyPos, float4* positions, int numBodies, float soft2)
{
    extern __shared__ float4 sharedPos[];

    float3 acc = {0.0f, 0.0f, 0.0f}; // zero initialize

    const int p = blockDim.x;
    const int n = numBodies;

    // tiles needed to cover n bodies (the last one may be partial)
    const int numTiles = (n > 0) ? (n + p - 1) / p : 0;

    // per-block starting tile; reduced modulo numTiles so WRAP's argument
    // stays below 2*numTiles
    const int startTile =
        (numTiles > 0) ? (int)(blockIdx.x % (unsigned int)numTiles) : 0;

    __syncthreads();

    for (int tile = 0; tile < numTiles; tile++)
    {
        const int srcTile = WRAP(startTile + tile, numTiles);
        const int idx = srcTile * p + (int)threadIdx.x;

        // Slots past the end of the body list are filled with a zero-mass
        // body, which adds no acceleration as long as the softened distance
        // to it is non-zero, and avoids reading beyond numBodies.
        float4 body = {0.0f, 0.0f, 0.0f, 0.0f};
        if (idx < n)
        {
            body = positions[idx];
        }
        sharedPos[threadIdx.x] = body;

        // tile fully loaded before anyone reads it
        __syncthreads();
        // This is the "tile_calculation" function from the GPU Gems 3 article.
        acc = gravitation(bodyPos, acc, soft2);
        // everyone done reading before the tile is overwritten
        __syncthreads();
    }

    return acc;
}

////////////////////////////////////////////////////////////////
// Does one timestep, updating both velocities and positions (leapfrog
// style: the position update uses the freshly updated velocity).
//   newPos, newVel - output positions / velocities
//   oldPos, oldVel - input positions (mass in w) / velocities
//   deltaTime      - timestep
//   damping        - currently unused (the damping code is commented out)
//   soft2          - softening squared
//   numBodies      - number of bodies included in the force sum
// One thread per body, 1D grid and block. Every launched thread loads and
// stores element blockIdx.x*blockDim.x + threadIdx.x without a bounds
// check, so all four arrays must hold at least gridDim.x*blockDim.x
// elements. Needs blockDim.x * sizeof(float4) bytes of dynamic shared
// memory for computeBodyAccel.
////////////////////////////////////////////////////////////////
__global__ void integrateBodies(float4* newPos, float4* newVel, 
                float4* oldPos, float4* oldVel,
                float deltaTime, float damping, float soft2,
                int numBodies)
{
    // plain 32-bit multiply: __mul24 only uses the low 24 bits of its operands
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    float4 vel = oldVel[index];
    float4 pos = oldPos[index];

    // here force == acceleration
    const float3 accel = computeBodyAccel(pos, oldPos, numBodies, soft2);

    // first update velocity: new velocity = old velocity + acceleration * deltaTime
    vel.x += accel.x * deltaTime;
    vel.y += accel.y * deltaTime;
    vel.z += accel.z * deltaTime;
// could add here a central force law or halo term
// float r = sqrt(pos.x*pos.x + pos.y*pos.y + pos.z*pos.z);
// if adding kep term use bodyBodyInteraction to calculate accel

// should be 1.0 if no damping
//    vel.x *= damping;  vel.y *= damping;  vel.z *= damping;

    // This barrier only synchronizes the threads of this block; it does not
    // order this block's stores against reads of oldPos by other blocks, so
    // newPos must not alias oldPos.
    __syncthreads();

    // new position = old position + new velocity * deltaTime
    pos.x += vel.x * deltaTime;
    pos.y += vel.y * deltaTime;
    pos.z += vel.z * deltaTime;

    // store new position and velocity
    newPos[index] = pos;
    newVel[index] = vel;
}

///////////////////////////////////////////////////////////
// Does velocity kicks only (interaction step): positions are copied
// unchanged to newPos, velocities plus kicks are written to newVel.
// Note: if working in helio/bary coords we must not compute interaction
// terms from the central mass, so its mass must be zeroed before the call.
//   newPos, newVel   - output positions / velocities
//   oldPos, oldVel   - input positions (mass in w) / velocities
//   deltaTime        - kick duration
//   damping          - currently unused (the damping code is commented out)
//   soft2            - softening squared
//   numBodiesMassive - number of bodies passed to computeBodyAccel as the
//                      sources of the force sum
// One thread per body, 1D grid and block. Every launched thread loads and
// stores element blockIdx.x*blockDim.x + threadIdx.x without a bounds
// check, so all four arrays must hold at least gridDim.x*blockDim.x
// elements. Needs blockDim.x * sizeof(float4) bytes of dynamic shared
// memory for computeBodyAccel.
// Intended use for massless particles (author's notes): oldPos holds the
// massive particles first, numBodiesMassive gives their count, and the grid
// covers numBodiestotal = blockDim.x*gridDim.x > numBodiesMassive bodies;
// the bodies past numBodiesMassive are then kicked but are not meant to
// act as force sources.
///////////////////////////////////////////////////////////
__global__ void interactionStep(float4* newPos, float4* newVel, 
                float4* oldPos, float4* oldVel,
                float deltaTime, float damping, float soft2,
                int numBodiesMassive)
{
    // plain 32-bit multiply: __mul24 only uses the low 24 bits of its operands
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    float4 vel = oldVel[index];
    const float4 pos = oldPos[index];    // mass (w) is carried along unchanged

    // compute acceleration with the n^2 force step
    float3 accel = computeBodyAccel(pos, oldPos, numBodiesMassive, soft2);

    // don't ever kick the central mass (body 0)
    if (index == 0)
    {
        accel.x = 0.0f;
        accel.y = 0.0f;
        accel.z = 0.0f;
    }

    // update velocity
    vel.x += accel.x * deltaTime;
    vel.y += accel.y * deltaTime;
    vel.z += accel.z * deltaTime;

// should be 1.0 if no damping
//    vel.x *= damping;  vel.y *= damping;  vel.z *= damping;

    // store position and new velocity
    newPos[index] = pos;
    newVel[index] = vel;
}

///////////////////////////////////////////////////////////
// Overwrite the mass (w component) of the first body in device global
// memory with m0. Useful for barycentric computations, where the first
// entry holds the central mass.
// Every launched thread performs the same store.
///////////////////////////////////////////////////////////
__global__ void changem0(float4* Pos, float m0)
{
    float4& centralBody = *Pos;
    centralBody.w = m0;
}

#endif // #ifndef _NBODY_KERNEL_H_