/tomo/pyhst : revision 148

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to docs/optimizations/kepler/hst_linear_good.h

Committer: Suren A. Chilingaryan
Date: 2012-05-10 15:06:33 UTC
Revision ID: csa@dside.dyndns.org-20120510150633-56gdy6t3tflz2gab

OpenCL clean-up

files added:
docs/optimizations/gcn/hst_opencl_dma_8x8_6ppt.cl

docs/optimizations/gcn/subblocks.txt

docs/optimizations/kepler

docs/optimizations/kepler/approximation.txt

docs/optimizations/kepler/hst_linear_and_tex.h

docs/optimizations/kepler/hst_linear_art.h

docs/optimizations/kepler/hst_linear_good.h

docs/optimizations/kepler/hst_linear_multivar.h

docs/optimizations/kepler/hst_tex_uniform.h

tools

tools/gen.sh

files renamed:
docs/optimizations/sources/ => docs/optimizations/fermi/

hst_opencl/docs/ => docs/optimizations/gcn/

files modified:
hst_opencl/hst_opencl.c

hst_opencl/hst_opencl_kernel.h

hst_opencl/hst_opencl_kernels.cl

Show diffs side-by-side

added added

removed removed

docs/optimizations/kepler/hst_linear_good.h

// Tile edge, in pixels per thread: the kernel keeps two PPT x PPT accumulator
// arrays per thread and offsets each block by PPT * BLOCK_SIZE_{X,Y} pixels.
#define PPT 4

// Iterations per line
// (also the number of projection rows held in the shared `cache` at once and
// the trip count of the innermost per-projection loop in the kernel)

#define IPT (16/PPT)

//#define PROJ_LINE (2 * IPT * PPT * 3)

// Number of float2 entries reserved per cached projection row. The kernel
// fills 3 * (PPT/2) * BLOCK_SIZE_X positions per row, which equals this value.
// BLOCK_SIZE_X is defined outside this file.
#define PROJ_LINE (PPT * BLOCK_SIZE_X * 3 / 2)

/*
 * Warp-wide register broadcast used by the HST_OPTIMIZE_KEPLER path.
 * The mask-less __shfl() was removed for Volta and later; toolkits that
 * provide __shfl_sync() (CUDA 9+) use it with a full-warp mask, older
 * toolkits keep the legacy intrinsic. Every lane of the warp executes the
 * call sites below (all enclosing loop bounds are block-uniform).
 */
#ifdef HST_OPTIMIZE_KEPLER
# if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
#  define HST_SHFL32(value, lane) __shfl_sync(0xffffffffu, (value), (lane), 32)
# else
#  define HST_SHFL32(value, lane) __shfl((value), (lane), 32)
# endif
#endif

/*
 * Back-projection kernel using a shared-memory cache of projection rows and
 * manual linear interpolation.
 *
 * Each thread accumulates 2 * PPT * PPT output values (res1 / res2, two
 * adjacent output rows) and writes them to d_SLICE at the end.
 *
 *   num_proj    number of projections; only whole groups of 32 are processed
 *               (num_proj / 32), any remainder is ignored
 *   num_bins    not used by this kernel
 *   d_SLICE     output buffer; the row pitch used for indexing is
 *               BLOCK_SIZE_X * PPT * gridDim.x floats
 *   apos_off_x,
 *   apos_off_y  offsets added to the pixel indices before projecting
 *   batch       offset added to the block's first output row
 *
 * Per-projection constants come from c_all[] (.x/.y multiply the pixel
 * coordinates, .z/.w are additive terms) and samples from the tex_projes
 * texture; both are declared outside this file.
 *
 * NOTE(review): the indexing (ttidx = 16 * tidy + tidx into the 128-entry
 * f_minh, lane = (tidy & 1) * 16 + tidx) assumes BLOCK_SIZE_X == 16 and a
 * 16 x 8 thread block - confirm against the launch site.
 * NOTE(review): the load c_all[proj + ttidx] always touches 128 entries
 * starting at proj, even when fewer than 128 projections remain - confirm
 * c_all is sized/padded accordingly.
 */
__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    float res1[PPT][PPT] = {0.f};
    float res2[PPT][PPT] = {0.f};

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    // First pixel covered by this block.
    const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    // Per-projection floor() of the projected block origin, 128 projections at a time.
    __shared__ float f_minh[128];
    // IPT projection rows of (sample, difference to next sample) pairs, plus one pad entry.
    __shared__ float2 cache[IPT * PROJ_LINE + 1];

    // Role of this thread while filling the cache: which row (tbidy) and
    // which BLOCK_SIZE_X-wide segment parity (tbidx) it loads.
    const int tbidy = tidy / (PPT / 2);
    const int tbidx = tidy % (PPT / 2);

    // Lane index inside the warp, given two 16-thread rows per warp.
    const int wrapid = (tidy & 1) * 16 + tidx;

    // Position of this thread's first pixel inside the block tile:
    // an 8-wide sub-block origin (sbid*) plus an offset inside it (stid*).
    const int sbidx = 8 * ((tidy / 2) % 2);
    const int sbidy = 8 * ((tidy / 2) / 2);

    const int stidx = tidx % 8;
    const int stidy = 2 * (2 * (tidy & 1) + tidx / 8);

    const int sidx = (sbidx + stidx);
    const int sidy = (sbidy + stidy);

    const int idx = bidx + sidx;
    const int idy = bidy + sidy;

    const float x = idx + apos_off_x;
    const float y = idy + apos_off_y;

    // 2^23: adding it to a small non-negative float leaves the rounded
    // integer value in the low mantissa bits (0x4B000000 is the bit pattern of 2^23).
    const float exp23 = exp2(23.f);

    const int ttidx = 16 * tidy + tidx;

    float projf = tbidy + 0.5f;

    const int num_proj_blocks = num_proj / 32;

    for (int proj_block = 0; proj_block < num_proj_blocks; proj_block += 4) {
        const int proj = proj_block * 32;

        // One projection per thread: projected position of the block origin.
        float4 ref = c_all[proj + ttidx];
        f_minh[ttidx] = floor(ref.z + bx * ref.x - by * ref.y + ref.w);

        __syncthreads();

        // Lane l keeps f_minh[32 * i + l] so that the Kepler path can fetch
        // any of the 128 values with a warp shuffle.
        float fminh[4];
#pragma unroll 4
        for (int i = 0; i < 4; i++) {
            fminh[i] = f_minh[32 * i + wrapid];
        }

        int max_proj = min(num_proj_blocks - proj_block, 4);

#pragma unroll 1
        for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
#pragma unroll 1
            for (int subproj = 0; subproj < (32 / IPT); subproj++) {
                const int loop_proj = 32 * subproj32 + IPT * subproj;
                const int proje = proj + loop_proj;

#ifdef HST_OPTIMIZE_KEPLER
                const int row_minh = (int)HST_SHFL32(fminh[subproj32], IPT * subproj + tbidy);
#else // HST_OPTIMIZE_KEPLER
                // f_minh is the only minimum table filled above; it holds
                // floor() results, so the conversion to int is exact.
                const int row_minh = (int)f_minh[loop_proj + tbidy];
#endif // HST_OPTIMIZE_KEPLER

                // Load IPT projection rows, PROJ_LINE samples each.
#pragma unroll 3
                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, row_minh + pos, proje + projf);
                }

                // The differences below read the neighbouring sample, which
                // another thread (at row ends: another warp) has stored, so
                // the stores must be complete and visible first.
                __syncthreads();

                // .y = difference to the next sample, i.e. the slope for
                // linear interpolation.
                // NOTE(review): the last entry of a row takes its "next"
                // sample from the following row (or from the never-written
                // pad entry for the last row); presumably that slope is
                // never used - confirm.
#pragma unroll 3
                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;
                }

                __syncthreads();

#pragma unroll 4 // IPT
                for (int p = 0; p < IPT; p++) {
                    float4 all = c_all[proje + p];

#ifdef HST_OPTIMIZE_KEPLER
                    float minh = HST_SHFL32(fminh[subproj32], IPT * subproj + p);
#else // HST_OPTIMIZE_KEPLER
                    float minh = f_minh[loop_proj + p];
#endif // HST_OPTIMIZE_KEPLER

                    // Position of this thread's first pixel relative to the
                    // start of the cached row.
                    float h = all.z + x * all.x - y * all.y - minh;

#pragma unroll 4 // PPT
                    for (int i = 0; i < PPT; i++) {
                        float subh = h;
                        h -= 16.f * all.y;
#pragma unroll 4 // PPT
                        for (int j = 0; j < PPT; j++) {
                            // Two vertically adjacent pixels.
                            float subh1 = subh;
                            float subh2 = subh - all.y;

                            // Split into integer cache index and remainder
                            // via the 2^23 trick.
                            float fsubh1 = subh1 + exp23;
                            int idx1 = __float_as_int(fsubh1) - 0x4B000000;
                            fsubh1 = subh1 - (fsubh1 - exp23);

                            float fsubh2 = subh2 + exp23;
                            int idx2 = __float_as_int(fsubh2) - 0x4B000000;
                            fsubh2 = subh2 - (fsubh2 - exp23);

                            float2 c1 = cache[p * PROJ_LINE + idx1];
                            res1[i][j] += c1.x + fsubh1 * c1.y;

                            float2 c2 = cache[p * PROJ_LINE + idx2];
                            res2[i][j] += c2.x + fsubh2 * c2.y;

                            subh += 16.f * all.x;
                        }
                    }
                }

                // The cache is overwritten by the next iteration.
                __syncthreads();
            }
        }
    }

    // res1 goes to row idy + i * BLOCK_SIZE_Y, res2 to the row right below it.
#pragma unroll 4
    for (int i = 0; i < PPT; i++) {
#pragma unroll 4
        for (int j = 0; j < PPT; j++) {
            d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res1[i][j];
            d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y + 1) + idx + j * BLOCK_SIZE_X] = res2[i][j];
        }
    }
}

Older »