#include #include #include #define N 50 /* grid resolution, h = 1/N */ #define TIMESPAN 0.01 /* total simulated timespan */ #define OUTPUTS 5 /* number of output intervals */ #define DT_COEFF 0.1 /* timestep size in units of h^2 */ /* global array pointers that store our various fields */ float *host_field, *field, *ftemp, *dfdt0, *dfdt1; /* some function prototypes that we will use */ void set_ics(float *f); __global__ void get_derivative(float *f, float *d2f); __global__ void add(float *res, float *a, float *b, float s); void evolve(float *f, float dt); void output_diagonal(int nr); int main(int argc, char **argv) { int i, nr; cudaGetDeviceCount(&i); if(i == 0) { printf("No usable CUDA device found, exiting.\n"); exit(1); } /* allocate the temperature field on the host */ host_field = (float *) malloc(N * N * sizeof(float)); /* allocate on the device the temperature field, and auxiliary arrays used during the computation */ cudaMalloc(&field, N * N * sizeof(float)); cudaMalloc(&ftemp, N * N * sizeof(float)); cudaMalloc(&dfdt0, N * N * sizeof(float)); cudaMalloc(&dfdt1, N * N * sizeof(float)); /* initialze our initial value problem */ set_ics(host_field); /* copy the temperature field over to the device */ cudaMemcpy(field, host_field, N * N * sizeof(float), cudaMemcpyHostToDevice); /* determinate a suitable timestep size, and the number of steps needed to reach next output time after TIMESPAN/OUTPUTS */ float dtmax = DT_COEFF * pow(1.0 / N, 2); int steps = (TIMESPAN / OUTPUTS) / dtmax + 1; float dt = (TIMESPAN / OUTPUTS) / steps; /* now evolve the temperature field for the required number of steps */ for(nr = 0; nr < OUTPUTS; nr++) { /* produce some output, allowing us to plot the result */ output_diagonal(nr); for(i = 0; i < steps; i++) { if(i % ((steps / 100) + 1) == 0) printf("i=%d of %d steps for output %d\n", i, steps, nr); /* produce a progress message every once in a while */ evolve(field, dt); } } /* produce an output at the final time */ output_diagonal(nr); /* free our buffers */ cudaFree(dfdt1); cudaFree(dfdt0); cudaFree(ftemp); cudaFree(field); free(host_field); return 0; } /* advance the heat conduction equation by one timestep based on a 2nd-order Runge-Kutta scheme */ void evolve(float *f, float dt) { dim3 threads(16, 16); int nblocks = (N + 15) / 16; dim3 blocks(nblocks, nblocks); get_derivative <<< blocks, threads >>> (f, dfdt0); add <<< blocks, threads >>> (ftemp, f, dfdt0, dt); get_derivative <<< blocks, threads >>> (ftemp, dfdt1); add <<< blocks, threads >>> (f, f, dfdt0, 0.5 * dt); add <<< blocks, threads >>> (f, f, dfdt1, 0.5 * dt); } /* calculate the Laplacian of input field f, and store it in output field d2f */ __global__ void get_derivative(float *f, float *d2f) { int ip, im, jp, jm; float prefac = N * N; /* this is 1/h^2, where h=1/N is the grid spacing */ int j = threadIdx.y + blockIdx.y * blockDim.y; while(j < N) { int i = threadIdx.x + blockIdx.x * blockDim.x; while(i < N) { ip = i + 1; jp = j + 1; im = i - 1; jm = j - 1; /* deal with periodic wrap around */ if(ip >= N) ip -= N; if(jp >= N) jp -= N; if(im < 0) im += N; if(jm < 0) jm += N; /* now apply discretized Laplacian in 2D */ d2f[i + j * N] = prefac * (f[ip + j * N] + f[im + j * N] + f[i + jp * N] + f[i + jm * N] - 4 * f[i + j * N]); i += blockDim.x * gridDim.x; } j += blockDim.y * gridDim.y; } } /* calculate res = a + s * b, where a and b are 2D fields, and s is a scalar */ __global__ void add(float *res, float *a, float *b, float s) { int j = threadIdx.y + blockIdx.y * blockDim.y; while(j < N) { int i = threadIdx.x + blockIdx.x * blockDim.x; while(i < N) { int offset = i + j * N; res[offset] = a[offset] + b[offset] * s; i += blockDim.x * gridDim.x; } j += blockDim.y * gridDim.y; } } /* create initial conditions for out test problem */ void set_ics(float *f) { int i, j; for(i = 0; i < N; i++) for(j = 0; j < N; j++) { float x = (i + 0.5) / N; float y = (j + 0.5) / N; if(fabs(x - 0.5) <= 0.125 && fabs(y - 0.5) <= 0.125) f[i + j * N] = 4 * fabs(x + y - 1); } } /* create an output with number 'nr' containing the diagonal of our temperature field */ void output_diagonal(int nr) { int i; char buf[1000]; /* copy the temperature field from the device to the host */ cudaMemcpy(host_field, field, N * N * sizeof(float), cudaMemcpyDeviceToHost); sprintf(buf, "result_%d.txt", nr); FILE *fd = fopen(buf, "w"); fprintf(fd, "%d\n", N); for(i = 0; i < N; i++) fprintf(fd, "%f %f\n", (i + 0.5) / N, host_field[i + i * N]); fclose(fd); }