/perf/kseta

To get this branch, use:
bzr branch http://darksoft.org/webbzr/perf/kseta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/*
 * Accessors for the two tiles that live back to back in the `shmem` buffer.
 * Both expand to an lvalue and rely on an identifier named `shmem` being
 * visible wherever they are used.
 *
 *   A(row, col) - tile starting at offset 0, row stride BLOCK_SIZE.
 *   B(row, col) - tile starting PPT * BLOCK_SIZE * BLOCK_SIZE elements in,
 *                 row stride PPT * BLOCK_SIZE.
 */
#define A(row, col) shmem[(row) * BLOCK_SIZE + (col)]
#define B(row, col) shmem[PPT * BLOCK_SIZE * BLOCK_SIZE + (row) * PPT * BLOCK_SIZE + (col)]

/*
 * Tiled matrix multiplication: res = a * b, with all three matrices stored
 * row-major using `size` as the row stride and as the length of the
 * reduction dimension.
 *
 * Every work-item accumulates PPT x PPT output elements, spaced BLOCK_SIZE
 * apart in both directions, so one work-group produces a
 * (PPT * BLOCK_SIZE) x (PPT * BLOCK_SIZE) tile of res.
 *
 * Parameters:
 *   res   - output matrix; each work-item writes its PPT x PPT elements once,
 *           after the reduction loop
 *   a     - left operand (read only by this kernel)
 *   b     - right operand (read only by this kernel)
 *   size  - row stride / reduction length, in elements
 *   shmem - local scratch accessed through the A() and B() macros; with
 *           local ids below BLOCK_SIZE the indices used here reach
 *           2 * PPT * BLOCK_SIZE * BLOCK_SIZE floats
 *
 * Nothing below is bounds-checked:
 *   NOTE(review): tile indexing uses BLOCK_SIZE while the group origin uses
 *   get_local_size(); they agree only when the work-group is
 *   BLOCK_SIZE x BLOCK_SIZE - confirm against the host launch code.
 *   NOTE(review): presumably `size` must be a multiple of PPT * BLOCK_SIZE,
 *   otherwise loads and stores run past the matrices - confirm the host pads.
 */
__kernel void multiply(__global float *res, __global float *a, __global float *b, unsigned long size, __local float *shmem) {
    /* One accumulator per output element owned by this work-item. */
    float sum[PPT][PPT] = {0};

    /* Origin (column bx, row by) of this work-group's output tile. */
    int bx = get_group_id(0) * get_local_size(0) * PPT;
    int by = get_group_id(1) * get_local_size(1) * PPT;

    /* Position of this work-item inside its work-group. */
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    int x, y;
    int k, l, m;

    /* Walk the reduction dimension one BLOCK_SIZE-wide slab at a time. */
    for (k = 0; k < size; k += BLOCK_SIZE) {

        /*
         * Stage the current slab in local memory. Each work-item copies PPT
         * elements of `a` (rows by.., columns k..) and PPT elements of `b`
         * (rows k.., columns bx..).
         */
#pragma unroll PPT
        for (m = 0; m < PPT; ++m) {
            A(m * BLOCK_SIZE + ty, tx) = a[(by + m * BLOCK_SIZE + ty) * size + (k + tx)];
            B(ty, m * BLOCK_SIZE + tx) = b[(k + ty) * size + (bx + m * BLOCK_SIZE + tx)];
        }

        /* Both tiles must be completely written before anyone reads them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /*
         * NOTE(review): the unroll factor requested here (PPT * BLOCK_SIZE)
         * is larger than this loop's trip count (BLOCK_SIZE); kept verbatim
         * so code generation is not disturbed.
         */
#pragma unroll PPT * BLOCK_SIZE
        for (l = 0; l < BLOCK_SIZE; ++l) {
#pragma unroll PPT
            for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
                for (x = 0; x < PPT; ++x) {
                    sum[y][x] += A(y * BLOCK_SIZE + ty, l) * B(l, x * BLOCK_SIZE + tx);
                }
            }
        }

        /* Keep the next iteration's loads from overwriting tiles still in use. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Write the accumulated PPT x PPT results back to the output matrix. */
#pragma unroll PPT
    for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
        for (x = 0; x < PPT; ++x) {
            res[(by + y * BLOCK_SIZE + ty) * size + bx + x * BLOCK_SIZE + tx] = sum[y][x];
        }
    }
}