1
__kernel void multiply(__global float *res, __global float *a, __global float *b, unsigned long local_size, __global float *scratch, __local float *shmem) {
5
int threads = get_global_size(0);
6
int groups = get_num_groups(0);
7
int group = get_group_id(0);
8
int pos = get_global_id(0);
9
int tid = get_local_id(0);
11
for(i = 0; i < local_size; i++) {
12
sum += i;//a[i * threads + pos] * b[i * threads + pos];
17
barrier(CLK_LOCAL_MEM_FENCE);
19
for(i = get_local_size(0)/2; i>0; i >>= 1) {
20
if (tid < i) shmem[tid] += shmem[tid + i];
21
barrier(CLK_LOCAL_MEM_FENCE);
25
scratch[group] = shmem[0];
28
barrier(CLK_GLOBAL_MEM_FENCE);
32
for (i = 0; i < groups; i++)