Lecture 4: Introduction to Parallel Computing Using CUDA

Lecture 4: Introduction to Parallel Computing Using CUDA IEEE Boston Continuing Education Program Ken Domino, Domem Technologies May 23, 2011

Even/Odd sort • Very similar to Bubble Sort • Easily parallelizable

Even/Odd sort 1 7 4 0 9 4 8 8 2 4 5 5 1 7 1 1 for K = 1 to do for I = 1, 3, 5, ..., -2 do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for for I = 0, 2, 4, 6, ..., do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for end for http://en.wikipedia.org/wiki/Odd-even_sort => http://www.eli.sdsu.edu/courses/spring96/cs662/notes/assRelated/assRelated.html

Even/Odd sort 1 7 4 0 9 4 8 8 2 4 5 5 1 7 1 1 1 4 7 0 9 4 8 2 8 4 5 1 5 1 7 1 for K = 1 to do for I = 1, 3, 5, ..., -2 do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for for I = 0, 2, 4, 6, ..., do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for end for http://en.wikipedia.org/wiki/Odd-even_sort => http://www.eli.sdsu.edu/courses/spring96/cs662/notes/assRelated/assRelated.html

Even/Odd sort 1 7 4 0 9 4 8 8 2 4 5 5 1 7 1 1 1 4 7 0 9 4 8 2 8 4 5 1 5 1 7 1 1 4 0 7 4 9 2 8 4 8 1 5 1 5 1 7 for K = 1 to do for I = 1, 3, 5, ..., -2 do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for for I = 0, 2, 4, 6, ..., do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for end for http://en.wikipedia.org/wiki/Odd-even_sort => http://www.eli.sdsu.edu/courses/spring96/cs662/notes/assRelated/assRelated.html

Even/Odd sort 1 7 4 0 9 4 8 8 2 4 5 5 1 7 1 1 1 4 7 0 9 4 8 2 8 4 5 1 5 1 7 1 1 4 0 7 4 9 2 8 4 8 1 5 1 5 1 7 1 4 0 7 4 9 2 8 4 8 1 5 1 5 1 7 1 0 4 4 7 2 9 4 8 1 8 1 5 1 5 7 0 1 4 4 2 7 4 9 1 8 1 8 1 5 5 7 0 1 4 4 2 7 4 9 1 8 1 8 1 5 5 7 0 1 4 2 4 4 7 1 9 1 8 1 8 5 5 7 0 1 2 4 4 4 1 7 1 9 1 8 5 8 5 7 0 1 2 4 4 4 1 7 1 9 1 8 5 8 5 7 0 1 2 4 4 1 4 1 7 1 9 5 8 5 8 7 0 1 2 4 1 4 1 4 1 7 5 9 5 8 7 8 0 1 2 4 1 4 1 4 1 7 5 9 5 8 7 8 0 1 2 1 4 1 4 1 4 5 7 5 9 7 8 8 0 1 1 2 1 4 1 4 4 5 5 7 7 9 8 8 0 1 1 2 1 4 1 4 4 5 5 7 7 9 8 8 0 1 1 1 2 1 4 4 4 5 5 7 7 8 9 8 0 1 1 1 1 2 4 4 4 5 5 7 7 8 8 9 0 1 1 1 1 2 4 4 4 5 5 7 7 8 8 9 0 1 1 1 1 2 4 4 4 5 5 7 7 8 8 9 0 1 1 1 1 2 4 4 4 5 5 7 7 8 8 9 for K = 1 to do for I = 1, 3, 5, ..., -2 do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for for I = 0, 2, 4, 6, ..., do if x[I] > x[I+1] then swap(x[I], x[I+1]) end for end for http://en.wikipedia.org/wiki/Odd-even_sort => http://www.eli.sdsu.edu/courses/spring96/cs662/notes/assRelated/assRelated.html

Even/Odd sort for K = 1 to do for I = 1, 3, 5, ..., -2 do in parallel if x[I] > x[I+1] then swap(x[I], x[I+1]) end for for I = 0, 2, 4, 6, ..., do in parallel if x[I] > x[I+1] then swap(x[I], x[I+1]) end for end for http://en.wikipedia.org/wiki/Odd-even_sort => http://www.eli.sdsu.edu/courses/spring96/cs662/notes/assRelated/assRelated.html

Even/Odd sort void cpuSort(int * _a, int _length) { a = _a; length = _length; bool sorted = false; while (!sorted) { sorted=true; for (int i = 1; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } for(int i = 0; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } } } void EvenOddSorter::gpuSort(int * _a, int _length) { a = _a; length = _length; int * d_a; cudaMalloc((void**)&d_a, sizeof(int) * length); cudaMemcpy(d_a, a, sizeof(int) * length, cudaMemcpyHostToDevice); int sorted = 0; int * d_sorted; cudaMalloc((void**)&d_sorted, sizeof(int)); while (sorted == 0) { sorted = 1; cudaMemcpy(d_sorted, &sorted, sizeof(int), cudaMemcpyHostToDevice); kernel1<<<length / 32 + 1, 32, 33 * sizeof(int)>>>(d_a, length, d_sorted); kernel2<<<length / 32 + 1, 32, 33 * sizeof(int)>>>(d_a, length, d_sorted); cudaMemcpy(a, d_a, length * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&sorted, d_sorted, sizeof(int), cudaMemcpyDeviceToHost); } cudaFree(d_a); }

Even/Odd sort void EvenOddSorter::gpuSort(int * _a, int _length) { a = _a; length = _length; int * d_a; cudaMalloc((void**)&d_a, sizeof(int) * length); cudaMemcpy(d_a, a, sizeof(int) * length, cudaMemcpyHostToDevice); int sorted = 0; int * d_sorted; cudaMalloc((void**)&d_sorted, sizeof(int)); while (sorted == 0) { sorted = 1; cudaMemcpy(d_sorted, &sorted, sizeof(int), cudaMemcpyHostToDevice); kernel1<<<length / 32 + 1, 32, 65 * sizeof(int)>>>(d_a, length, d_sorted); kernel2<<<length / 32 + 1, 32, 65 * sizeof(int)>>>(d_a, length, d_sorted); cudaMemcpy(a, d_a, length * sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&sorted, d_sorted, sizeof(int), cudaMemcpyDeviceToHost); } cudaFree(d_a); } Work in blocks of 32 threads. Thread 0 works with a[0..1] (or a[1..2]) Thread 1 works with a[2..3] (or a[3..4]) … Thread 31 works with a[62..63] (or a[63..64] => No gaps in thread id’s => Allocate shared memory to contain 65 contiguous elements of input

Even/Odd sort void cpuSort(int * _a, int _length) { a = _a; length = _length; bool sorted = false; while (!sorted) { sorted=true; for (int i = 1; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } for(int i = 0; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } } } __global__ void kernel1(int * a, int length, int * sorted) { const unsigned int i = 1 + 2 * (threadIdx.x + blockIdx.x * blockDim.x); if (i+1 >= length) return; // Copy input to shared mem. extern __shared__ int shared[]; int * sa = &shared[-(1 + 2 * blockIdx.x * blockDim.x)]; sa[i] = a[i]; sa[i+1] = a[i+1]; __syncthreads(); if (EvenOddSorter::compare(sa, i, i+1, EvenOddSorter::ASCENDING)) { *sorted = 0; // Write result. a[i] = sa[i]; a[i+1] = sa[i+1]; } __syncthreads(); }

Even/Odd sort void cpuSort(int * _a, int _length) { a = _a; length = _length; bool sorted = false; while (!sorted) { sorted=true; for (int i = 1; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } for(int i = 0; i < length-1; i += 2) { if (EvenOddSorter::compare(a, i, i+1, ASCENDING)) sorted = false; } } } __global__ void kernel2(int * a, int length, int * sorted) { const unsigned int i = 2 * (threadIdx.x + blockIdx.x * blockDim.x); if (i+1 >= length) return; // Copy input to shared mem. extern __shared__ int shared[]; int * sa = &shared[-(2 * blockIdx.x * blockDim.x)]; sa[i] = a[i]; sa[i+1] = a[i+1]; __syncthreads(); if (EvenOddSorter::compare(sa, i, i+1, EvenOddSorter::ASCENDING)) { *sorted = 0; // Write result. a[i] = sa[i]; a[i+1] = sa[i+1]; } __syncthreads(); }

Even/Odd sort • Easy to implement • But, VERY slow • Annoying CUDA notes: • void EvenOddSorter::gpuSort(int * _a, int _length) is in a class; • __global__ void kernel1(int * a, int length, int * sorted) (i.e., “kernel”) routines cannot be contained in a class!

Bitonic Sort • Fast and easy to implement in CUDA • Based on merging of sorted lists

Bitonic Sort • A sorted sequence is a monotonically non-decreasing (or non-increasing) sequence. • Abitonic sequence is a sequence with x[0] ≤ x[1] ≤ x[2] … ≤ x[k] ≥ x[k+1] … ≥ x[n-1] for some k, where 0 ≤ k < n, or a circular shift of such a sequence. • [1, 2, 5, 7, 4, 3], [3, 1, 2, 5, 7, 4] are bitonic sequences. • Bitonic sequences are graphically represented with lines.

Bitonic Split • Consider a specialbitonicsequence: x[0] ≤ x[1] ≤ x[2] ≤ … ≤ x[ - 1], and x[] ≥ x[ + 1] ≥ … ≥ x[n - 1]. • A bitonic split are two subsequences: s1 = {min(x[0], x[]), min(x[1], x[]), …, min(x[ - 1], x[n - 1])) s2 = {max(x[0], x[]), max(x[1], x[]), …, max(x[ - 1], x[n - 1])) • s = [1, 2, 5, 7, 4, 3] => s1 = [1, 2, 3], s2 = [7, 4, 5]

Bitonic Merge • With bitonicsequence, a merge results in a sorted bitonic sequence. We reorder if the two halves to make sure each half is bitonic, then sort each half using divide and conquer. (Note: direction!) • Append s1 and s2, each length = n. Then: Merge(int lo, int n) for (int i = lo; i < lo + ; ++i) if (s[i] > s[i+]) exchange(s[i], s[I + ]) Merge(lo, ) Merge(, ) • s1 = [1, 3], s2 = [7, 0] => [1, 3, 7, 0] => [1, 0, 7, 3] => [0, 1, 7, 3] => [0, 1, 3, 7]

Bitonic Sort 1 7 4 0 9 4 8 8 2 4 5 5 1 7 1 1 sort lo = 0 n = 16 dir = 1 sort lo = 0 n = 8 dir = 1 sort lo = 0 n = 4 dir = 1 sort lo = 0 n = 2 dir = 1 merge lo = 0 n = 2 dir = 1 compare i = 0, i+m = 1 a[0] = 1, a[1] = 7 sort lo = 2 n = 2 dir = 0 merge lo = 2 n = 2 dir = 0 compare i = 2, i+m = 3 a[2] = 4, a[3] = 0 merge lo = 0 n = 4 dir = 1 compare i = 0, i+m = 2 a[0] = 1, a[2] = 4 compare i = 1, i+m = 3 a[1] = 7, a[3] = 0 exchanged 1 0 4 7 9 4 8 8 2 4 5 5 1 7 1 1 merge lo = 0 n = 2 dir = 1 compare i = 0, i+m = 1 a[0] = 1, a[1] = 0 exchanged 0 1 4 7 9 4 8 8 2 4 5 5 1 7 1 1 merge lo = 2 n = 2 dir = 1 compare i = 2, i+m = 3 a[2] = 4, a[3] = 7 sort lo = 4 n = 4 dir = 0 ... 0 1 4 7 4 9 8 8 2 4 5 5 1 7 1 1 0 1 4 7 8 9 4 8 2 4 5 5 1 7 1 1 0 1 4 7 9 8 4 8 2 4 5 5 1 7 1 1 0 1 4 7 9 8 8 4 2 4 5 5 1 7 1 1 void sort(int lo, int n, booldir) { if (n>1) { int m=n/2; sort(lo, m, ASCENDING); sort(lo+m, m, DESCENDING); merge(lo, n, dir); } } void merge(int lo, int n, booldir) { if (n>1) { int m=n/2; for (int i=lo; i<lo+m; i++) compare(i, i+m, dir); merge(lo, m, dir); merge(lo+m, m, dir); } }

Bitonic Sort • Known as a sorting network. http://www.inf.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm

Bitonic Sort in CUDA void cpuSortNonrecursive(int * _a, int _length) { a = _a; length = _length; inti,j,k; // Parallel bitonic sort for (k = 2; k <= length; k = 2*k) { // Bitonic merge for (j = k >> 1; j > 0; j = j>>1) { for (i=0; i < length; i++) { intixj = i ^ j; if (ixj > i) { if ((i & k) == 0 && a[i] > a[ixj]) exchange(i, ixj); if ((i & k) != 0 && a[i] < a[ixj]) exchange(i, ixj); } } } } } __global__ void bitonicSort(int * a, int length) { const unsigned int i = threadIdx.x; // Copy input to shared mem. __shared__ extern intsa[]; sa[i] = a[i]; __syncthreads(); for (unsigned int k = 2; k <= length; k *= 2) { for (unsigned int j = k / 2; j>0; j /= 2) { unsigned intixj = i ^ j; if (ixj > i) { if ((i & k) == 0 && sa[i] > sa[ixj]) swap(sa[i], sa[ixj]); if ((i & k) != 0 && sa[i] < sa[ixj]) swap(sa[i], sa[ixj]); } __syncthreads(); } } // Write result. a[i] = sa[i]; }

Bitonic Sort in CUDA • Driver not scalable for length > 1024. • (Scaling an exercise. Answer in CUDA SDK Examples. Hint: need to factor sort and merge steps.) void gpuSort(int * _a, int _length) { a = _a; length = _length; int * d_a; cudaMalloc((void**)&d_a, sizeof(int) * length); cudaMemcpy(d_a, a, sizeof(int) * length, cudaMemcpyHostToDevice); bitonicSort<<<1, length, sizeof(int) * length>>>(d_a, length); cudaMemcpy(a, d_a, sizeof(int) * length, cudaMemcpyDeviceToHost); cudaFree(d_a); }

K-Means Clustering • Given a set of observations (x1, x2, …, xn), where each observation is a d-dimensional real vector, k-means clustering aims to partition the n observations into k sets (k ≤ n) S = {S1, S2, …, Sk} so as to minimize the within-cluster sum of squares (WCSS): whereμi is the mean of points in Si. D

K-Means Clustering • Example • Xi = [1, 2, 3, 7, 8, 9]; • K = 2 • S = {S1, S2} where S1 = [1, 2, 3], S2 = [7, 8, 9] • μ1 = 2, μ2 = 8 • D = (1-2)2 + (2-2)2 + (3-2)2 + (7-8)2 + (8-8)2 + (9-8)2= 4

K-Means Clustering sequential void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } }

K-Means Clustering sequential void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } } • points= [1, 2, 3, 7, 8, 9] • clusters = [0, 1, 0, 1, 0, 1] • means = [4.0, 6.0]

K-Means Clustering sequential void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } } • points= [1, 2, 3, 7, 8, 9] • clusters = [0, 0, 0, 1, 1, 1] • means = [4.0, 6.0] • change = true

K-Means Clustering sequential void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } } • points= [1, 2, 3, 7, 8, 9] • clusters = [0, 0, 0, 1, 1, 1] • means = [2.0, 8.0] • change = false

K-Means Clustering sequential void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } } • points= [1, 2, 3, 7, 8, 9] • clusters = [0, 0, 0, 1, 1, 1] • means = [2.0, 8.0] • change = false •  break

K-Means Clustering parallel? void cpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); for (;;) { // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } // For all points, get minimum distance from point to all cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; for (int i = 0; i < length; ++i) { float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { changed = true; clusters[i] = min_cluster; } } if (! changed) break; } } • How to implement nested for-loops?

K-Means Clustering parallel? // Compute means of all clusters. for (int i = 0; i < k; ++i) { // means[i] = Point::Zero; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } Naïve implementation: one thread for each cluster, used to figure out mean. __global__ void compute_means_orig(Point * means, int length, Point * points, int * clusters) { int i = threadIdx.x; means[i].ZeroOut(); int count = 0; for (int j = 0; j < length; ++j) { if (i == clusters[j]) { count++; means[i] = means[i] + points[j]; } } if (count != 0) means[i] = means[i] / count; } It works, but does not use shared memory, and accounts for 98% of overall runtime.

K-Means Clustering parallel void gpu_kmeans(int k, int length, Point * points, int * clusters) { Point * means = (Point*)malloc(k * sizeof(Point)); Point * d_means; cudaMalloc(&d_means, k * sizeof(Point)); Point * d_points; cudaMalloc(&d_points, length * sizeof(Point)); cudaMemcpy(d_points, points, length * sizeof(Point), cudaMemcpyHostToDevice); int * d_clusters; cudaMalloc(&d_clusters, length * sizeof(int)); cudaMemcpy(d_clusters, clusters, length * sizeof(Point), cudaMemcpyHostToDevice); bool * d_changed; cudaMalloc(&d_changed, sizeof(bool)); int * count = (int*)malloc(k * sizeof(int)); for (int i = 0; i < k; ++i) count[i] = 0; int * d_count; cudaMalloc(&d_count, k * sizeof(int)); int zero = 0; for (;;) { // Compute means of all clusters. compute_means0<<<1,k>>>(d_means, d_count, k); compute_means1<<<length / 32 + 1, 32, k * sizeof(Point) + k * sizeof(int)>>>( d_means, d_count, k, length, d_points, d_clusters); compute_means2<<<1,k>>>(d_means, d_count, k); // For every point, compute minimum distance from point to all // cluster means, // and reassign point to cluster if the minimum distance has changed. bool changed = false; cudaMemcpy(d_changed, &changed, sizeof(bool), cudaMemcpyHostToDevice); compute_clusters<<<length / 32 + 1, 32, k * sizeof(Point)>>>( d_means, k, length, d_points, d_clusters, d_changed); cudaMemcpy(&changed, d_changed, sizeof(bool), cudaMemcpyDeviceToHost); if (! changed) break; } cudaMemcpy(clusters, d_clusters, length * sizeof(Point), cudaMemcpyDeviceToHost); } Have three kernels for calculating means: (1) compute_means0 to initialize means to 0; (2) compute_means1 to compute sum of points and counts; (3) compute_means2 to divide each mean by count.

K-Means Clustering parallel • Kernel compute_means0: • k threads per block, one block 0.1 2.0 -1.3 4.3 means[]

K-Means Clustering parallel • Initialize counts and means (k threads for k means) __global__ void compute_means0(Point * means, int * count, int k) { int i = threadIdx.x; count[i] = 0; means[i].ZeroOut(); } • Note: CUDA annoyance. You cannot define static device data in a class, e.g., for singleton or constants: • static __device__ Point Point::Zero; • means[i] = Zero; => error : memory qualifier on data member is not allowed

K-Means Clustering parallel • Kernel compute_means1: • 32 threads per block, length / 32 + 1 blocks 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 -1.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 points[]

K-Means Clustering parallel • Compute counts and means (32 threads per block, for all points) __global__ void compute_means1(Point * means, int * count, int k, int length, Point * points, int * clusters) { extern __shared__ int buffer[]; Point * smeans = (Point*)buffer; int * scount = (int*)&smeans[k]; int i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= length) return; if (threadIdx.x == 0) { for (int j = 0; j < k; ++j) { smeans[j].ZeroOut(); scount[j] = 0; } } __syncthreads(); int c = clusters[i]; for (int j = 0; j < k; ++j) { if (j == c) { atomicAdd(&scount[j], 1); for (int d = 0; d < NUM_COMPONENTS; ++d) atomicAdd(&smeans[j].x[d], points[i].x[d]); } } __syncthreads(); if (threadIdx.x == 0) { for (int j = 0; j < k; ++j) { count[j] += scount[j]; for (int d = 0; d < NUM_COMPONENTS; ++d) means[j].x[d] += smeans[j].x[d]; } } __syncthreads(); }

K-Means Clustering parallel • Kernel compute_means2: • k threads per block, one block 0.91 11 20.0 44 -1.93 19 4.63 65 means[] count[]

K-Means Clustering parallel • Compute means = sum divided by count for that cluster. __global__ void compute_means2(Point * means, int * count, int k) { int i = threadIdx.x; if (count[i] != 0) means[i] = means[i] / count[i]; } Note: updating global variable “means” to be used in final kernel.

K-Means Clustering parallel __global__ void compute_clusters(Point * means, int k, int length, Point * points, int * clusters, bool * changed) { //extern __shared__ int buffer[]; //Point * smeans = (Point*)buffer; Point * smeans = means; int i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= length) return; // Preload means. if (threadIdx.x == 0) { for (int j = 0; j < k; ++j) smeans[j] = means[j]; } __syncthreads(); float min_delta = FLT_MAX; intmin_cluster = clusters[i]; for (int j = 0; j < k; ++j) { float delta = Point::distance(points[i], means[j]); if (min_delta > delta) { min_cluster = j; min_delta = delta; } } if (min_cluster != clusters[i]) { *changed = true; clusters[i] = min_cluster; } }

Breath-First Search • A graph is a collection of nodes (V) and edges (E). They can be directed or undirected. 13 20 18 0 3 7 11 5 15 2 9 6

Breath-First Search • A graph can be represented a number of ways. But, if it is large, it is very important to choose a representation carefully because the GPU has limited memory.

Breadth-First Search • Breadth-first search is a fundamental algorithm for graphs. • The idea is to keep track of a set of frontier nodes, mark them as such, and navigate to the next set of nodes along edges from each node in the frontier.

Breadth-First Search Cormen, T., Leiserson, C., Rivest, R. and Stein, C. Introduction to Algorithms. MIT Press McGraw-Hill Book Company, 2001.

Parallel Breadth-First Search • Parallel BFS found by keeping a set of nodes for the frontier (F), update frontier (Fu), visited (X), and level (C). Harish, P., Vineet, V. and Narayanan, P. Large Graph Algorithms for Massively Multithreaded Architectures, Large Graph Algorithms for Massively Multithreaded Architectures, Hyderabad, INDIA, 2009.

Parallel Breadth-First Search • Kernel1 picks a node that is in the frontier (F), and marks it no longer in the frontier. • For all neighbor nodes that are not visited, get the depth for the neighbor, and mark it as frontier.

Parallel Breadth-First Search • Kernel2 updates the frontier and visited sets. • If the frontier changed, then the algorithm must pass over the new frontier.

Breadth-First Search in CUDA signed char * BFS::gpuBFS(intblock_side) { // Copy graph into GPU memory. int * V_d; int * E_d; … memset(F_h, 0, Vs*sizeof(unsigned char)); memset(C_h, -1, Vs*sizeof(signed char)); memset(X_h, 0, Vs*sizeof(unsigned char)); F_h[S] = 1; C_h[S] = 0; … int * any_change_d; if (cudaMalloc((void**)&any_change_d, sizeof(int))) return 0; … int N = graph->Vs; double side = sqrt((double)N); side = ceil(side); intnside = (int)side; intn_blocks = nside / block_side + (nside % block_side == 0 ? 0:1); int count = 0; for (;;) { *any_change_h = 0; cudaMemcpy(any_change_d, any_change_h, sizeof(int), cudaMemcpyHostToDevice); dim3 Dg(n_blocks, n_blocks, 1); dim3 Db(block_side, block_side, 1); count++; do_kernel_bfs1<<<Dg, Db>>>(any_change_d, Vs, V_d, Es, E_d, EIs, EI_d, F_d, newF_d, C_d, X_d); cudaMemcpy(any_change_h, any_change_d, sizeof(int), cudaMemcpyDeviceToHost); if (*any_change_h == 0) break; do_kernel_bfs2<<<Dg, Db>>>(any_change_d, Vs, V_d, Es, E_d, EIs, EI_d, F_d, newF_d, C_d, X_d); cudaMemcpy(any_change_h, any_change_d, sizeof(int), cudaMemcpyDeviceToHost); if (*any_change_h == 0) break; } cudaMemcpy(C_h, C_d, Vs * sizeof(unsigned char), cudaMemcpyDeviceToHost); … return C_h; } Two kernels with same dimensions. Is there a way to do global synchronization so only one kernel could be called? NOT KNOWN

Matrix Multiplication • MM is used everywhere!

Matrix Multiplication • A simple sequential CPU implementation: static boolMultiply_Host(Matrix<T> * C, Matrix<T> * A, Matrix<T> * B) { inthA = A->height; intwA = A->width; intwB = B->width; for (int i = 0; i < hA; ++i) for (int j = 0; j < wB; ++j) { T sum = 0; for (int k = 0; k < wA; ++k) { T a = A->data[i * wA + k]; T b = B->data[k * wB + j]; sum += a * b; } C->data[i * wB + j] = sum; } return true; };

Simple Parallel Matrix Multiplication • Idea: Compute C(i, j) in parallel, each in it’s own thread.

Simple Parallel Matrix Multiplication // setup execution parameters dim3 threads(wC, hC); dim3 grid(1,1); Kernel_Matrix_Multiply_Simple<T> <<< grid, threads >>>(d_C, d_A, d_B); template <class T> __global__ void Kernel_Matrix_Multiply_Simple(Matrix<T> * C, Matrix<T> * A, Matrix<T> * B) { intwA = A->width; intwB = B->width; intwC = C->width; // 2D Thread ID int col = threadIdx.x; int row = threadIdx.y; // Pvalue stores the Pd element that is computed by the thread T Pvalue = 0; for (int k = 0; k < wA; ++k) { T Aelement = A->data[row * wA + k]; T Belement = B->data[k * wB + col]; Pvalue += Aelement * Belement; } // Write the matrix to device memory each thread writes one element C->data[row * wC + col] = Pvalue; }

Simple Parallel Matrix Multiplication • Problems: • Slow • Not scalable because (r, c) maximum is (1024, 1024)

Lecture 4: Introduction to Parallel Computing Using CUDA