This workshop explores the nuances of optimizing reduction algorithms using CUDA and OpenCL. Key topics include terminology, the dot product test problem, execution configurations, and minimizing divergence in kernel execution. The workshop covers naive reduction methods and implementations for both CUDA and OpenCL, detailing the use of shared and local memory, as well as strategies for effective management of register storage. Participants will learn practical coding techniques applicable to real-world scenarios, with a focus on maximizing performance during parallel computation tasks.
Optimizing Simple CUDA and OpenCL Kernels • Chris Szalwinski • December 5, 2013 • Workshop on Reduction
Overview • Reduction Algorithm • Terminology – CUDA and OpenCL • Test Problem – Dot Product • Test Parameters • Execution Configuration • Divergence • Register Storage
Reduction Algorithm • Example: select the largest value (combine many input values into a single result)
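The serial version of this reduction is a single pass that keeps one running value; the kernels that follow parallelize the same idea by combining pairs of values in a tree. A minimal host-side sketch, not taken from the slides (the function name reduceMax is illustrative):

// serial reduction baseline: combine n values into one,
// here using max() as the combining operator
float reduceMax(const float* a, int n) {
    float largest = a[0];
    for (int i = 1; i < n; i++)
        if (a[i] > largest) largest = a[i];
    return largest;
}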
Terminology - Configuration • CUDA: Grid, Blocks, Threads • OpenCL: Workspace, Workgroups, Work Items
Terminology - Memory • CUDA: Global, Shared, Local • OpenCL: Global, Local, Private
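The same correspondences show up directly in kernel code. A minimal CUDA sketch (not from the slides; the kernel name ids is illustrative) with the OpenCL equivalents noted in comments:

__global__ void ids(float* out) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x; // OpenCL: get_global_id(0)
    int tid = threadIdx.x;                           // OpenCL: get_local_id(0)
    int dim = blockDim.x;                            // OpenCL: get_local_size(0)
    int blk = blockIdx.x;                            // OpenCL: get_group_id(0)
    __shared__ float s[128];                         // OpenCL: __local memory
    // per-thread automatic variables live in registers (OpenCL: private memory)
    s[tid] = (float)gid;
    __syncthreads();                                 // OpenCL: barrier(CLK_LOCAL_MEM_FENCE)
    if (tid == 0) out[blk] = s[dim - 1];
}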
Dot Product • Vector a – n elements: a[0], a[1], …, a[n-1] • Vector b – n elements: b[0], b[1], …, b[n-1] • Dot product = a[0]*b[0] + a[1]*b[1] + … + a[n-1]*b[n-1] • Two stages: multiply matching elements, then sum their products
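A serial reference makes the two stages explicit and gives a result to validate the kernels against. A sketch, not from the slides (dotSerial is an illustrative name):

// serial reference for the dot product
float dotSerial(const float* a, const float* b, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += a[i] * b[i];   // stage 1: multiply matching elements
                              // stage 2: accumulate their products
    return sum;
}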
CUDA – Naive Dot Product

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_D(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
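The slides show only the device code. A hedged host-side sketch of how dot_D might be driven (dotOnDevice, h_a, and h_b are illustrative names): the grid is sized to cover all n elements, and the per-block partial sums left in c still need a final reduction, done here on the host.

#include <cuda_runtime.h>
#include <vector>

float dotOnDevice(const float* h_a, const float* h_b, int n) {
    int nblks = (n + ntpb - 1) / ntpb;            // enough blocks to cover n elements
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, n * sizeof(float));
    cudaMalloc(&d_b, n * sizeof(float));
    cudaMalloc(&d_c, nblks * sizeof(float));
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);
    dot_D<<<nblks, ntpb>>>(d_a, d_b, d_c, n);     // one partial sum per block
    std::vector<float> h_c(nblks);
    cudaMemcpy(h_c.data(), d_c, nblks * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0.0f;
    for (int i = 0; i < nblks; i++)
        sum += h_c[i];                            // final reduction of the partials on the host
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return sum;
}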
OpenCL – Naive Dot Product

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_D(__global const float* a, __global const float* b,
                    __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = 1; stride < size; stride *= 2) {
        if (i % (2 * stride) == 0)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
CUDA – Minimize Divergence

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_DM(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
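Why this helps: in the naive loop the test tid % (2 * stride) == 0 keeps every 2*stride-th thread active, so active and idle threads are interleaved within each warp and the warp executes both paths. The descending-stride loop keeps the first stride threads active, so warps stay either fully active or fully idle until stride drops below the warp size. An illustrative trace for an 8-thread block:

// naive:      stride = 1 -> active threads 0, 2, 4, 6   (interleaved)
//             stride = 2 -> active threads 0, 4
//             stride = 4 -> active thread  0
// minimized:  stride = 4 -> active threads 0, 1, 2, 3   (contiguous)
//             stride = 2 -> active threads 0, 1
//             stride = 1 -> active thread  0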
OpenCL – Minimize Divergence

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DM(__global const float* a, __global const float* b,
                     __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
CUDA – Register Accumulator

const int ntpb = 128;    // number of threads per block

// Shared Memory
//
__global__ void dot_DMR(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    float x = 0;
    // store product in shared memory
    if (gid < n)
        x = s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;    // pad the tail of the last block with zeros
    __syncthreads();
    // reduce shared memory entries
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            x += s[tid + stride];
            s[tid] = x;
        }
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = x;
}
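Compared with dot_DM, each active thread now keeps its running sum in the register x: the loop body reads only the partner's entry s[tid + stride], the store s[tid] = x republishes the sum for lower-numbered threads, and the final write to c comes straight from the register instead of re-reading s[0]. This trades one shared-memory read per iteration for a register read; the structure of the loop is otherwise unchanged.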
OpenCL – Register Accumulator

#define NWUPWG 128    // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DMR(__global const float* a, __global const float* b,
                      __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    float x = 0;
    // store product in local memory
    if (gid < n)
        x = s[i] = a[gid] * b[gid];
    else
        s[i] = 0;    // pad the tail of the last workgroup with zeros
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride) {
            x += s[i + stride];
            s[i] = x;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = x;
}