1
/*
 * A(i, j): element (i, j) of the first BLOCK_SIZE x BLOCK_SIZE tile stored
 * row-major at the start of `shmem`. Expands to an lvalue, so it can be both
 * read and assigned. `shmem` and BLOCK_SIZE must be visible at the point of
 * use. Arguments and BLOCK_SIZE are parenthesized so that expression
 * arguments (e.g. A(r + 1, c)) index the intended element.
 */
#define A(i, j) shmem[(i) * (BLOCK_SIZE) + (j)]
2
/*
 * B(i, j): element (i, j) of the second BLOCK_SIZE x BLOCK_SIZE tile, stored
 * row-major in `shmem` starting at offset BLOCK_SIZE * BLOCK_SIZE (i.e.
 * directly after the first tile). Expands to an lvalue. `shmem` and
 * BLOCK_SIZE must be visible at the point of use. Arguments and BLOCK_SIZE
 * are parenthesized so that expression arguments (e.g. B(r + 1, c)) index
 * the intended element.
 */
#define B(i, j) shmem[(BLOCK_SIZE) * (BLOCK_SIZE) + (i) * (BLOCK_SIZE) + (j)]
4
__kernel void multiply(__global float *res, __global float *a, __global float *b, unsigned long size, __local float *shmem) {
7
int tx = get_local_id(0);
8
int ty = get_local_id(1);
10
int i = get_global_id(1);
11
int j = get_global_id(0);
16
for(k = 0; k < size; k += BLOCK_SIZE) {
17
A(ty, tx) = a[i * size + (k + tx)];
18
B(ty, tx) = b[(k + ty) * size + j];
20
barrier(CLK_LOCAL_MEM_FENCE);
22
#pragma unroll BLOCK_SIZE
23
for (l = 0; l < BLOCK_SIZE; ++l) {
24
sum += A(ty, l) * B(l, tx);
27
barrier(CLK_LOCAL_MEM_FENCE);
30
res[i * size +j] = sum;