// Dense N x N single-precision matrix multiply on the GPU: C = A * B,
// timed with CUDA events and verified on the host.
//
// Reconstruction notes (source was mangled — angle brackets / line breaks lost):
//  - the kernel launch configuration <<<dimGrid, dimBlock>>> was missing,
//  - the store to C was outside the bounds guard (out-of-bounds write),
//  - the init/verify loops were truncated; they are restored deterministically.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Inline error-check helper replacing the fragile absolute-path include of
// book.h ("/home/Tit5/lecture27/gpu-lecture/book.h"), which only provided
// HANDLE_ERROR. Aborts with file/line context on any CUDA API failure.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s in %s at line %d\n",
                cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))

#define BLOCK_SIZE 16

// Kernel: one thread per output element.
// Expects a 2D grid of 2D blocks; thread (COL, ROW) computes
// C[ROW][COL] = dot(row ROW of A, column COL of B) for N x N row-major
// matrices. Threads outside the matrix do nothing (guard covers the store
// too — the original wrote C[ROW * N + COL] unguarded, an OOB write for
// tail threads when N is not a multiple of the block size).
__global__ void Muld(float *A, float *B, float *C, int N) {
    int ROW = blockIdx.y * blockDim.y + threadIdx.y;
    int COL = blockIdx.x * blockDim.x + threadIdx.x;

    if (ROW < N && COL < N) {
        // Each thread computes one element of the output matrix;
        // the running dot product is accumulated in a register.
        float tmpSum = 0.0f;
        for (int i = 0; i < N; i++) {
            tmpSum += A[ROW * N + i] * B[i * N + COL];
        }
        C[ROW * N + COL] = tmpSum;
    }
}

// Host-side problem dimensions.
// hA = height of A, wA = width of A, wB = width of B (square here).
#define hA 1024
#define wA 1024
#define wB 1024
// Total number of elements per matrix.
#define N (1024 * 1024)

int main(void) {
    // Events bracket the whole transfer + compute + copy-back sequence.
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // size_t, not int: N * sizeof(float) is 4 MiB here but the int form
    // overflows silently for larger N.
    size_t size = (size_t)N * sizeof(float);

    float *A = (float *)malloc(size);
    float *B = (float *)malloc(size);
    float *C = (float *)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Deterministic fill (the original init loop was truncated in the
    // mangled source): A = all ones, B = all twos, so every element of
    // C = A * B must equal 2 * wA — trivially checkable below.
    for (int i = 0; i < N; i++) {
        A[i] = 1.0f;
        B[i] = 2.0f;
    }

    // Device buffers and host -> device transfers, all error-checked.
    float *Ad, *Bd, *Cd;
    HANDLE_ERROR(cudaMalloc((void **)&Ad, size));
    HANDLE_ERROR(cudaMalloc((void **)&Bd, size));
    HANDLE_ERROR(cudaMalloc((void **)&Cd, size));
    HANDLE_ERROR(cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice));

    /************************
      Each BLOCK_SIZE x BLOCK_SIZE block of the grid is executed
      concurrently by BLOCK_SIZE^2 GPU threads.
     ************************/
    // dim3 is an integer vector type usable in CUDA launch configs.
    // Ceil-divide so the grid covers the matrix even if the dimensions
    // were not multiples of BLOCK_SIZE (the kernel guard handles the tail).
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((wB + dimBlock.x - 1) / dimBlock.x,
                 (hA + dimBlock.y - 1) / dimBlock.y);
    printf("# of blocks %d %d \n", dimGrid.x, dimGrid.y);

    // Launch the device computation (the mangled source had Muld<<>> with
    // the execution configuration eaten). hA is the matrix order here
    // since all three matrices are square 1024 x 1024.
    Muld<<<dimGrid, dimBlock>>>(Ad, Bd, Cd, hA);
    // Launch-configuration errors only surface via cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());

    // Read C from the device; cudaMemcpy is blocking, so the kernel has
    // finished by the time it returns.
    HANDLE_ERROR(cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost));

    // Get stop time and display the timing results.
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time : %3.1f ms\n", elapsedTime);

    // Verify: with A = 1 and B = 2, every C element is the length-wA dot
    // product 1*2 summed wA times, i.e. exactly 2 * wA in float.
    float expected = 2.0f * (float)wA;
    int errors = 0;
    for (int i = 0; i < N; i++) {
        if (C[i] != expected) {
            errors++;
        }
    }
    printf("verification: %s (%d mismatches)\n",
           errors == 0 ? "PASSED" : "FAILED", errors);

    // Release everything the original leaked: device buffers, host
    // buffers, and the timing events.
    HANDLE_ERROR(cudaFree(Ad));
    HANDLE_ERROR(cudaFree(Bd));
    HANDLE_ERROR(cudaFree(Cd));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    free(A);
    free(B);
    free(C);

    return errors == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}