/perf/kseta : revision 1

To get this branch, use:

bzr branch http://darksoft.org/webbzr/perf/kseta

« back to all changes in this revision

Viewing changes to sources/mm/opencl4.cl

Committer: Suren A. Chilingaryan
Date: 2013-09-30 06:47:09 UTC
Revision ID: csa@dside.dyndns.org-20130930064709-55cde0k5ci76t8z5

Simple matrix multiplication

files added:

.bzrignore

sources

sources/CMakeLists.txt

sources/cmake

sources/cmake/FindAPPML.cmake

sources/cmake/FindATI.cmake

sources/cmake/FindCUDA.cmake

sources/cmake/FindIntel.cmake

sources/cmake/FindMKL.cmake

sources/cmake/FindOpenCL.cmake

sources/cmake/make2cmake.cmake

sources/cmake/parse_cubin.cmake

sources/cmake/run_nvcc.cmake

sources/mm

sources/mm/CMakeLists.txt

sources/mm/amdblas.c

sources/mm/mkl.c

sources/mm/mm.c

sources/mm/opencl.c

sources/mm/opencl1

sources/mm/opencl1.cl

sources/mm/opencl2

sources/mm/opencl2.cl

sources/mm/opencl3.cl

sources/mm/opencl4

sources/mm/opencl4.cl

sources/mm/opencl5.cl

sources/mm/opencl51.c

sources/mm/opencl51.cl

sources/mm/opencl52

sources/mm/opencl52.cl

sources/mm/opencl6.c

sources/mm/opencl6.cl

sources/mm/opencl7.c

sources/mm/opencl7.cl

sources/mm/opencl71.c

sources/mm/opencl71.cl

sources/mm/opencl8.c

sources/mm/opencl8.cl

sources/mm/simple.c

sources/tools

sources/tools/CMakeLists.txt

sources/tools/cl_compiler.c

sources/tools/cl_info.c

sources/tools/fp_cmp.c

Show diffs side-by-side

added added

removed removed

sources/mm/opencl4.cl

/*
 * Accessors for the two square tiles kept in the `shmem` buffer.
 * Each tile is (PPT * BLOCK_SIZE) elements wide and is stored row by row.
 *
 * A(row, col): element of the first tile, which starts at shmem[0].
 */
#define A(row, col) shmem[(row) * PPT * BLOCK_SIZE + (col)]

/*
 * B(row, col): element of the second tile, which starts right after the
 * first one, i.e. PPT * PPT * BLOCK_SIZE * BLOCK_SIZE elements into shmem.
 */
#define B(row, col) shmem[(row) * PPT * BLOCK_SIZE + (col) + PPT * PPT * BLOCK_SIZE * BLOCK_SIZE]

/*
 * multiply - tiled matrix product: res = a * b.
 *
 * All three matrices are indexed as m[row * size + col], i.e. they are
 * treated as square size x size arrays stored row by row.
 *
 * Each work-group produces one (PPT * BLOCK_SIZE) x (PPT * BLOCK_SIZE) tile
 * of res.  Each work-item produces PPT x PPT elements of that tile, spaced
 * BLOCK_SIZE apart in both directions, and keeps them in the private array
 * `sum` until the final write-back.
 *
 * Parameters:
 *   res   - output matrix.
 *   a, b  - input matrices (left and right operand).
 *   size  - number of rows/columns of every matrix.
 *   shmem - local scratch buffer accessed through the A() and B() macros;
 *           it must hold 2 * (PPT * BLOCK_SIZE) * (PPT * BLOCK_SIZE) floats.
 *
 * Assumptions (not checked here, confirm against the host code):
 *   - the work-group is BLOCK_SIZE x BLOCK_SIZE work-items, since tile
 *     offsets use BLOCK_SIZE while the group origin uses get_local_size();
 *   - size is a multiple of PPT * BLOCK_SIZE, since no access is
 *     bounds-checked.
 */
__kernel void multiply(__global float *res, __global float *a, __global float *b, unsigned long size, __local float *shmem) {
    /* One accumulator per output element owned by this work-item. */
    float sum[PPT][PPT] = {0};

    /* Column (bx) and row (by) of the first output element of this group. */
    int bx = get_group_id(0) * get_local_size(0) * PPT;
    int by = get_group_id(1) * get_local_size(1) * PPT;

    /* Position of this work-item inside its group. */
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    int x, y;
    int l;
    /* Same type as `size` so the loop condition compares like with like. */
    unsigned long k;

    /* Walk along the shared dimension one tile width at a time. */
    for (k = 0; k < size; k += PPT * BLOCK_SIZE) {
        /* Stage the current tiles of a and b in local memory; every
         * work-item copies PPT x PPT elements of each. */
#pragma unroll PPT
        for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
            for (x = 0; x < PPT; ++x) {
                A(y * BLOCK_SIZE + ty, x * BLOCK_SIZE + tx) = a[(by + y * BLOCK_SIZE + ty) * size + (k + x * BLOCK_SIZE + tx)];
                B(y * BLOCK_SIZE + ty, x * BLOCK_SIZE + tx) = b[(k + y * BLOCK_SIZE + ty) * size + (bx + x * BLOCK_SIZE + tx)];
            }
        }

        /* Both tiles must be fully written before any work-item reads them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Accumulate the partial dot products contributed by this tile pair. */
#pragma unroll PPT * BLOCK_SIZE
        for (l = 0; l < PPT * BLOCK_SIZE; ++l) {
#pragma unroll PPT
            for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
                for (x = 0; x < PPT; ++x) {
                    sum[y][x] += A(y * BLOCK_SIZE + ty, l) * B(l, x * BLOCK_SIZE + tx);
                }
            }
        }

        /* Nobody may overwrite the tiles while others are still reading. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Write the finished PPT x PPT results of this work-item back to res. */
#pragma unroll PPT
    for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
        for (x = 0; x < PPT; ++x) {
            res[(by + y * BLOCK_SIZE + ty) * size + bx + x * BLOCK_SIZE + tx] = sum[y][x];
        }
    }
}

Older »