/perf/kseta : revision 1

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/perf/kseta

« back to all changes in this revision

Viewing changes to sources/mm/opencl3.cl

Committer: Suren A. Chilingaryan
Date: 2013-09-30 06:47:09 UTC
Revision ID: csa@dside.dyndns.org-20130930064709-55cde0k5ci76t8z5

Simple matrix multiplication

files added:

.bzrignore

sources

sources/CMakeLists.txt

sources/cmake

sources/cmake/FindAPPML.cmake

sources/cmake/FindATI.cmake

sources/cmake/FindCUDA.cmake

sources/cmake/FindIntel.cmake

sources/cmake/FindMKL.cmake

sources/cmake/FindOpenCL.cmake

sources/cmake/make2cmake.cmake

sources/cmake/parse_cubin.cmake

sources/cmake/run_nvcc.cmake

sources/mm

sources/mm/CMakeLists.txt

sources/mm/amdblas.c

sources/mm/mkl.c

sources/mm/mm.c

sources/mm/opencl.c

sources/mm/opencl1

sources/mm/opencl1.cl

sources/mm/opencl2

sources/mm/opencl2.cl

sources/mm/opencl3.cl

sources/mm/opencl4

sources/mm/opencl4.cl

sources/mm/opencl5.cl

sources/mm/opencl51.c

sources/mm/opencl51.cl

sources/mm/opencl52

sources/mm/opencl52.cl

sources/mm/opencl6.c

sources/mm/opencl6.cl

sources/mm/opencl7.c

sources/mm/opencl7.cl

sources/mm/opencl71.c

sources/mm/opencl71.cl

sources/mm/opencl8.c

sources/mm/opencl8.cl

sources/mm/simple.c

sources/tools

sources/tools/CMakeLists.txt

sources/tools/cl_compiler.c

sources/tools/cl_info.c

sources/tools/fp_cmp.c

Show diffs side-by-side

added added

removed removed

sources/mm/opencl3.cl

// Element (row, col) of the first tile kept in `shmem`: row-major storage with
// PPT * BLOCK_SIZE elements per row, starting at offset 0.
#define A(row, col) shmem[(col) + (row) * PPT * BLOCK_SIZE]

// Element (row, col) of the second tile kept in `shmem`: same row-major layout
// as A (PPT * BLOCK_SIZE elements per row), placed right after a full
// (PPT * BLOCK_SIZE) x (PPT * BLOCK_SIZE) tile.
#define B(row, col) shmem[(row) * PPT * BLOCK_SIZE + (col) + PPT * PPT * BLOCK_SIZE * BLOCK_SIZE]

/*
 * Tiled matrix multiplication: res = a * b, with all three matrices stored
 * row-major using `size` elements per row.
 *
 * Every work-item accumulates a PPT x PPT patch of the result. For each step
 * along the shared dimension the work-group first copies one tile of `a` and
 * one tile of `b` into local memory (macros A and B), then every work-item
 * multiplies its rows of the A tile by its columns of the B tile.
 *
 * Parameters:
 *   res   - output matrix (written only)
 *   a, b  - input matrices (read only)
 *   size  - row length used for indexing all three matrices
 *   shmem - local scratch buffer; the A and B macros address
 *           2 * (PPT * BLOCK_SIZE) * (PPT * BLOCK_SIZE) floats in it
 *
 * No bounds checks are performed: `size` has to be a multiple of
 * PPT * BLOCK_SIZE and the NDRange has to cover exactly size / PPT work-items
 * per dimension, otherwise the kernel reads and writes out of range.
 * NOTE(review): the indexing presumes a BLOCK_SIZE x BLOCK_SIZE work-group;
 * confirm against the host code.
 * PPT and BLOCK_SIZE are expected to be defined when the program is built.
 */
__kernel void multiply(__global float *res, __global float *a, __global float *b, unsigned long size, __local float *shmem) {
    float sum[PPT][PPT] = {0};

    int tx = get_local_id(0);
    int ty = get_local_id(1);

    int i = get_global_id(1);
    int j = get_global_id(0);

    int x, y;
    int k, l;

    for (k = 0; k < size; k += PPT * BLOCK_SIZE) {
        // Cooperative load: each work-item copies its PPT x PPT share of both tiles.
#pragma unroll PPT
        for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
            for (x = 0; x < PPT; ++x) {
                A(ty * PPT + y, tx * PPT + x) = a[(i * PPT + y) * size + (k + tx * PPT + x)];
                B(ty * PPT + y, tx * PPT + x) = b[(k + ty * PPT + y) * size + (j * PPT + x)];
            }
        }

        // Both tiles must be completely written before any work-item reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

#pragma unroll PPT * BLOCK_SIZE
        for (l = 0; l < PPT * BLOCK_SIZE; ++l) {
#pragma unroll PPT
            for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
                for (x = 0; x < PPT; ++x) {
                    sum[y][x] += A(ty * PPT + y, l) * B(l, tx * PPT + x);
                }
            }
        }

        // Nobody may overwrite the tiles in the next iteration while others still read them.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write the accumulated PPT x PPT patch to the result matrix.
#pragma unroll PPT
    for (y = 0; y < PPT; ++y) {
#pragma unroll PPT
        for (x = 0; x < PPT; ++x) {
            res[(i * PPT + y) * size + j * PPT + x] = sum[y][x];
        }
    }
}

Older »