This workshop explores the nuances of optimizing reduction algorithms using CUDA and OpenCL. Key topics include terminology, the dot product test problem, execution configurations, and minimizing divergence in kernel execution. The workshop covers naive reduction methods and implementations for both CUDA and OpenCL, detailing the use of shared and local memory, as well as strategies for effective management of register storage. Participants will learn practical coding techniques applicable to real-world scenarios, with a focus on maximizing performance during parallel computation tasks.
Optimizing Simple CUDA and OpenCL Kernels • Chris Szalwinski • December 5, 2013 • Workshop on Reduction
Overview • Reduction Algorithm • Terminology – CUDA and OpenCL • Test Problem – Dot Product • Test Parameters • Execution Configuration • Divergence • Register Storage
Reduction Algorithm • Example: select the largest value (combine many input values into a single result)
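The serial version of this reduction is a single pass that keeps one running value; the kernels that follow parallelize the same idea by combining pairs of values in a tree. A minimal host-side sketch, not taken from the slides (the function name reduceMax is illustrative):

// serial reduction baseline: combine n values into one,
// here using max() as the combining operator
float reduceMax(const float* a, int n) {
    float largest = a[0];
    for (int i = 1; i < n; i++)
        if (a[i] > largest) largest = a[i];
    return largest;
}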
Terminology - Configuration • CUDA: Grid, Blocks, Threads • OpenCL: Workspace, Workgroups, Work Items
Terminology - Memory • CUDA: Global, Shared, Local • OpenCL: Global, Local, Private
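The same correspondences show up directly in kernel code. A minimal CUDA sketch (not from the slides; the kernel name ids is illustrative) with the OpenCL equivalents noted in comments:

__global__ void ids(float* out) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x; // OpenCL: get_global_id(0)
    int tid = threadIdx.x;                           // OpenCL: get_local_id(0)
    int dim = blockDim.x;                            // OpenCL: get_local_size(0)
    int blk = blockIdx.x;                            // OpenCL: get_group_id(0)
    __shared__ float s[128];                         // OpenCL: __local memory
    // per-thread automatic variables live in registers (OpenCL: private memory)
    s[tid] = (float)gid;
    __syncthreads();                                 // OpenCL: barrier(CLK_LOCAL_MEM_FENCE)
    if (tid == 0) out[blk] = s[dim - 1];
}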
Dot Product • Vector a – n elements: a[0], a[1], …, a[n-1] • Vector b – n elements: b[0], b[1], …, b[n-1] • Dot product = a[0]*b[0] + a[1]*b[1] + … + a[n-1]*b[n-1] • Two stages: multiply matching elements, then sum their products
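A serial reference makes the two stages explicit and gives a result to validate the kernels against. A sketch, not from the slides (dotSerial is an illustrative name):

// serial reference for the dot product
float dotSerial(const float* a, const float* b, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += a[i] * b[i];   // stage 1: multiply matching elements
                              // stage 2: accumulate their products
    return sum;
}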
CUDA – Naive Dot Product

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_D(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
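The slides show only the device code. A hedged host-side sketch of how dot_D might be driven (dotOnDevice, h_a, and h_b are illustrative names): the grid is sized to cover all n elements, and the per-block partial sums left in c still need a final reduction, done here on the host.

#include <cuda_runtime.h>
#include <vector>

float dotOnDevice(const float* h_a, const float* h_b, int n) {
    int nblks = (n + ntpb - 1) / ntpb;            // enough blocks to cover n elements
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, n * sizeof(float));
    cudaMalloc(&d_b, n * sizeof(float));
    cudaMalloc(&d_c, nblks * sizeof(float));
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);
    dot_D<<<nblks, ntpb>>>(d_a, d_b, d_c, n);     // one partial sum per block
    std::vector<float> h_c(nblks);
    cudaMemcpy(h_c.data(), d_c, nblks * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0.0f;
    for (int i = 0; i < nblks; i++)
        sum += h_c[i];                            // final reduction of the partials on the host
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return sum;
}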
OpenCL – Naive Dot Product

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_D(__global const float* a, __global const float* b,
                    __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = 1; stride < size; stride *= 2) {
        if (i % (2 * stride) == 0)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
CUDA – Minimize Divergence

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_DM(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
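Why this helps: in the naive loop the test tid % (2 * stride) == 0 keeps every 2*stride-th thread active, so active and idle threads are interleaved within each warp and the warp executes both paths. The descending-stride loop keeps the first stride threads active, so warps stay either fully active or fully idle until stride drops below the warp size. An illustrative trace for an 8-thread block:

// naive:      stride = 1 -> active threads 0, 2, 4, 6   (interleaved)
//             stride = 2 -> active threads 0, 4
//             stride = 4 -> active thread  0
// minimized:  stride = 4 -> active threads 0, 1, 2, 3   (contiguous)
//             stride = 2 -> active threads 0, 1
//             stride = 1 -> active thread  0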
OpenCL – Minimize Divergence

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DM(__global const float* a, __global const float* b,
                     __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
CUDA – Register Accumulator

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_DMR(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    float x = 0;
    // store product in shared memory
    if (gid < n)
        x = s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;    // pad the tail of the last block with zeros
    __syncthreads();
    // reduce shared memory entries
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            x += s[tid + stride];
            s[tid] = x;
        }
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = x;
}
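Compared with dot_DM, each active thread now keeps its running sum in the register x: the loop body reads only the partner's entry s[tid + stride], the store s[tid] = x republishes the sum for lower-numbered threads, and the final write to c comes straight from the register instead of re-reading s[0]. This trades one shared-memory read per iteration for a register read; the structure of the loop is otherwise unchanged.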
OpenCL – Register Accumulator

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DMR(__global const float* a, __global const float* b,
                      __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    float x = 0;
    // store product in local memory
    if (gid < n)
        x = s[i] = a[gid] * b[gid];
    else
        s[i] = 0;    // pad the tail of the last workgroup with zeros
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride) {
            x += s[i + stride];
            s[i] = x;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = x;
}