1
// Sampler passed to the read_imagef() calls in the multiply kernel:
// nearest-neighbour filtering, clamped addressing, and unnormalized
// (integer pixel) coordinates.
__constant sampler_t sampler =
    CLK_FILTER_NEAREST |
    CLK_ADDRESS_CLAMP |
    CLK_NORMALIZED_COORDS_FALSE;
3
// Element (row, col) of the first tile kept in `shmem`; the tile starts at
// offset 0 and its rows are BLOCK_SIZE elements apart.
#define A(row, col) shmem[(row) * BLOCK_SIZE + (col)]
4
// Element (row, col) of the second tile kept in `shmem`; the tile starts
// PPT * BLOCK_SIZE * BLOCK_SIZE elements in (right after the region indexed
// by A) and its rows are PPT * BLOCK_SIZE elements apart.
#define B(row, col) shmem[PPT * BLOCK_SIZE * BLOCK_SIZE + (row) * PPT * BLOCK_SIZE + (col)]
6
// Tiled matrix product res = a * b. Each work-item accumulates a PPT x PPT
// set of outputs in `sum` and stores them to `res`.
//   res   - output buffer, written at index row * size + column
//   a, b  - operand matrices, read as images through `sampler`; only the
//           first channel (.x) of each texel is used
//   size  - row stride of `res` and upper bound of the k loop; presumably the
//           side length of square matrices - TODO confirm against host code
//   shmem - __local scratch memory addressed through the A() and B() macros
// NOTE(review): tiles are indexed in steps of BLOCK_SIZE while bx/by are
// derived from get_local_size(), so this appears to assume a
// BLOCK_SIZE x BLOCK_SIZE work-group - confirm against the launch code.
// NOTE(review): k, m, l, x and y are used below without a declaration that
// is visible in this part of the file.
__kernel void multiply(__global float *res, __read_only image2d_t a, __read_only image2d_t b, unsigned long size, __local float *shmem) {
7
// Per-work-item accumulators, zero-initialized.
float sum[PPT][PPT] = {0};
9
// First output column covered by this work-group (used as the column base below).
int bx = get_group_id(0) * get_local_size(0) * PPT;
10
// First output row covered by this work-group (used as the row base below).
int by = get_group_id(1) * get_local_size(1) * PPT;
12
// Position of this work-item inside its work-group.
int tx = get_local_id(0);
13
int ty = get_local_id(1);
15
// Global ids; not referenced by any of the statements visible below.
int i = get_global_id(1);
16
int j = get_global_id(0);
22
// Walk the shared dimension in steps of BLOCK_SIZE.
for(k = 0; k < size; k += BLOCK_SIZE) {
24
// Each work-item stages PPT elements of `a` and PPT elements of `b` into local memory.
for (m = 0; m < PPT; ++m) {
25
// Image coordinates are (x, y) = (column, row); the trailing comment shows
// the equivalent index into a flat row-major buffer.
A(m * BLOCK_SIZE + ty, tx) = read_imagef(a, sampler, (int2)(k + tx, by + m * BLOCK_SIZE + ty)).x; //a[(by + m * BLOCK_SIZE + ty) * size + (k + tx)];
26
B(ty, m * BLOCK_SIZE + tx) = read_imagef(b, sampler, (int2)(bx + m * BLOCK_SIZE + tx, k + ty)).x; //b[(k + ty) * size + (bx + m * BLOCK_SIZE + tx)];
29
// Work-group barrier with a local-memory fence: the stores to shmem above
// must be visible to the whole group before the tiles are read.
barrier(CLK_LOCAL_MEM_FENCE);
31
//#pragma unroll PPT * BLOCK_SIZE
32
// Accumulate the partial products contributed by the staged tiles.
for (l = 0; l < BLOCK_SIZE; ++l) {
34
for (y = 0; y < PPT; ++y) {
36
for (x = 0; x < PPT; ++x) {
37
sum[y][x] += A(y * BLOCK_SIZE + ty, l) * B(l, x * BLOCK_SIZE + tx);
42
// Second work-group barrier; presumably keeps any work-item from overwriting
// the tiles while others are still reading them - the enclosing loop
// structure is not visible here, so confirm the placement.
barrier(CLK_LOCAL_MEM_FENCE);
46
// Store the accumulated values; outputs owned by one work-item are
// BLOCK_SIZE apart in both row and column.
for (y = 0; y < PPT; ++y) {
48
for (x = 0; x < PPT; ++x) {
49
res[(by + y * BLOCK_SIZE + ty) * size + bx + x * BLOCK_SIZE + tx] = sum[y][x];