#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N         50		/* grid resolution, h = 1/N      */
#define TIMESPAN  0.01		/* total simulated timespan      */
#define OUTPUTS   5		/* number of output intervals    */
#define DT_COEFF  0.1		/* timestep size in units of h^2 */


/* global array pointers that store our various fields */
float *host_field, *field, *ftemp, *dfdt0, *dfdt1;


/* some function prototypes that we will use */
void set_ics(float *f);
__global__ void get_derivative(float *f, float *d2f);
__global__ void add(float *res, float *a, float *b, float s);
void evolve(float *f, float dt);
void output_diagonal(int nr);


int main(int argc, char **argv)
{
  int i, nr;

  cudaGetDeviceCount(&i);
  if(i == 0)
    {
      printf("No usable CUDA device found, exiting.\n");
      exit(1);
    }

  /* allocate the temperature field on the host */
  host_field = (float *) malloc(N * N * sizeof(float));

  /* allocate on the device the temperature field, 
     and auxiliary arrays used during the computation */
  cudaMalloc(&field, N * N * sizeof(float));
  cudaMalloc(&ftemp, N * N * sizeof(float));
  cudaMalloc(&dfdt0, N * N * sizeof(float));
  cudaMalloc(&dfdt1, N * N * sizeof(float));

  /* initialze our initial value problem */
  set_ics(host_field);

  /* copy the temperature field over to the device */
  cudaMemcpy(field, host_field, N * N * sizeof(float), cudaMemcpyHostToDevice);


  /* determinate a suitable timestep size, and the number of steps needed to reach next output time after TIMESPAN/OUTPUTS */
  float dtmax = DT_COEFF * pow(1.0 / N, 2);
  int steps = (TIMESPAN / OUTPUTS) / dtmax + 1;
  float dt = (TIMESPAN / OUTPUTS) / steps;


  /* now evolve the temperature field for the required number of steps */
  for(nr = 0; nr < OUTPUTS; nr++)
    {
      /* produce some output, allowing us to plot the result */
      output_diagonal(nr);

      for(i = 0; i < steps; i++)
	{
	  if(i % ((steps / 100) + 1) == 0)
	    printf("i=%d of %d steps for output %d\n", i, steps, nr);	/* produce a progress message every once in a while */

	  evolve(field, dt);
	}
    }

  /* produce an output at the final time */
  output_diagonal(nr);

  /* free our buffers */
  cudaFree(dfdt1);
  cudaFree(dfdt0);
  cudaFree(ftemp);
  cudaFree(field);
  free(host_field);

  return 0;
}

/* advance the heat conduction equation by one timestep based on a 2nd-order Runge-Kutta scheme */
void evolve(float *f, float dt)
{
  dim3 threads(16, 16);
  int nblocks = (N + 15) / 16;
  dim3 blocks(nblocks, nblocks);

  get_derivative <<< blocks, threads >>> (f, dfdt0);

  add <<< blocks, threads >>> (ftemp, f, dfdt0, dt);

  get_derivative <<< blocks, threads >>> (ftemp, dfdt1);

  add <<< blocks, threads >>> (f, f, dfdt0, 0.5 * dt);
  add <<< blocks, threads >>> (f, f, dfdt1, 0.5 * dt);
}


/* calculate the Laplacian of input field f, and store it in output field d2f */
__global__ void get_derivative(float *f, float *d2f)
{
  int ip, im, jp, jm;

  float prefac = N * N;		/* this is 1/h^2, where h=1/N is the grid spacing */

  int j = threadIdx.y + blockIdx.y * blockDim.y;

  while(j < N)
    {
      int i = threadIdx.x + blockIdx.x * blockDim.x;

      while(i < N)
	{
	  ip = i + 1;
	  jp = j + 1;
	  im = i - 1;
	  jm = j - 1;

	  /* deal with periodic wrap around */
	  if(ip >= N)
	    ip -= N;
	  if(jp >= N)
	    jp -= N;
	  if(im < 0)
	    im += N;
	  if(jm < 0)
	    jm += N;

	  /* now apply discretized Laplacian in 2D */
	  d2f[i + j * N] =
	    prefac * (f[ip + j * N] + f[im + j * N] + f[i + jp * N] + f[i + jm * N] - 4 * f[i + j * N]);

	  i += blockDim.x * gridDim.x;
	}

      j += blockDim.y * gridDim.y;
    }
}


/* calculate res = a + s * b, where a and b are 2D fields, and s is a scalar */
__global__ void add(float *res, float *a, float *b, float s)
{
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  while(j < N)
    {
      int i = threadIdx.x + blockIdx.x * blockDim.x;

      while(i < N)
	{
	  int offset = i + j * N;
	  res[offset] = a[offset] + b[offset] * s;
	  i += blockDim.x * gridDim.x;
	}
      j += blockDim.y * gridDim.y;
    }
}


/* create initial conditions for out test problem */
void set_ics(float *f)
{
  int i, j;

  for(i = 0; i < N; i++)
    for(j = 0; j < N; j++)
      {
	float x = (i + 0.5) / N;
	float y = (j + 0.5) / N;

	if(fabs(x - 0.5) <= 0.125 && fabs(y - 0.5) <= 0.125)
	  f[i + j * N] = 4 * fabs(x + y - 1);
      }
}


/* create an output with number 'nr' containing the diagonal of our temperature field */
void output_diagonal(int nr)
{
  int i;
  char buf[1000];

  /* copy the temperature field from the device to the host */
  cudaMemcpy(host_field, field, N * N * sizeof(float), cudaMemcpyDeviceToHost);

  sprintf(buf, "result_%d.txt", nr);

  FILE *fd = fopen(buf, "w");

  fprintf(fd, "%d\n", N);

  for(i = 0; i < N; i++)
    fprintf(fd, "%f  %f\n", (i + 0.5) / N, host_field[i + i * N]);

  fclose(fd);
}