/*
 This is an attempt to use __shfl to cache parts of the shared-memory cache in registers.

 We go by half-warps. Each half-warp is responsible for a dual area of (4x4) * (4x4)
 pixels (vertically stacked), i.e. we have a 32x16 block of pixels per half-warp. The full
 warp encloses a 32x32 square. Now we are using the full warp to cache the required part of
 the cache for each of the half-warps. A bit of doubling, but it may be expected to pay off.
 There is no way around it: 32x32 would require 48 positions and we can't store that many.
 Likewise, processing only half-warps would require 24 positions, but we would have only 16.

 A possible benefit of the approach is a reduction in LD-unit usage and a possible synergy
 with the texture-fetch algorithm.

 This code works, somehow, but it is slow and probably has some artifacts. Maybe it is
 possible to optimize the shared-memory accesses somehow...
*/
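
/*
 For illustration, a minimal standalone sketch of the register-caching idea (a
 hypothetical demo kernel, not part of the reconstruction itself): each lane of a
 warp performs a single load into a register, after which any lane can read any
 other lane's register via __shfl instead of issuing another memory load.
*/
__global__ static void shfl_cache_demo(const float *src, float *dst) {
    const int lane = threadIdx.x & 31;   // lane id within the warp
    float cached = src[lane];            // one load per lane: 32 values now live in registers
    // read the value cached by another lane; no shared/global memory traffic involved
    float other = __shfl(cached, (lane + 5) & 31, 32);
    dst[threadIdx.x] = other;
}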
__global__ static void hst_cuda_linear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    float4 res = {0.f, 0.f, 0.f, 0.f};

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;

    const int idx = bidx + 2 * tidx;
    const int idy = bidy + 2 * tidy;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const float x = idx + apos_off_x;
    const float y = idy + apos_off_y;
    __shared__ float cache_subh[2 * BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
    __shared__ float cache_sin[BLOCK_SIZE_Y];
    __shared__ float2 cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X + 1];
    float projf = tidy + 0.5f;
    for (int proje = 0; proje < num_proj; proje += BLOCK_SIZE_Y) {
        float4 all = c_all[proje + tidy];
        int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);

        float subh = all.z + x * all.x - minh;
        cache_subh[2 * tidx][tidy] = subh;
        cache_subh[2 * tidx + 1][tidy] = subh + all.x;
        if (tidx == 0) cache_sin[tidy] = all.y;
        for (int i = 0; i < 3; i++) {
            int pos = i * BLOCK_SIZE_X + tidx;
            cache[tidy][pos].x = tex2D(tex_projes, minh + pos, proje + projf);
        }
        __syncthreads();

        // forward differences for linear interpolation: .y = next sample - this sample
        for (int i = 0; i < 3; i++) {
            int pos = i * BLOCK_SIZE_X + tidx;
            cache[tidy][pos].y = cache[tidy][pos + 1].x - cache[tidy][pos].x;
        }
        __syncthreads();
        for (int i = 0; i < BLOCK_SIZE_Y; i++) {
            float2 c;
            float4 v1, v2;
            // fractional bin positions of the four pixels handled by this thread
            // (this float4 minh shadows the integer minh above)
            float4 minh = make_float4(
                cache_subh[2 * tidx][i] - y * cache_sin[i],
                cache_subh[2 * tidx + 1][i] - y * cache_sin[i],
                cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],
                cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i]);

            c = cache[i][(int)minh.x];
            v1.x = c.x; v2.x = c.y;
            c = cache[i][(int)minh.y];
            v1.y = c.x; v2.y = c.y;
            c = cache[i][(int)minh.z];
            v1.z = c.x; v2.z = c.y;
            c = cache[i][(int)minh.w];
            v1.w = c.x; v2.w = c.y;

            res += v1 + (minh - floorf(minh)) * v2;
        }
        __syncthreads();
    }
    *(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy    ) + idx) = make_float2(res.x, res.y);
    *(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = make_float2(res.z, res.w);
}
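
/*
 The accumulation above is plain linear interpolation against the value /
 forward-difference pairs stored in the cache. A scalar sketch of that single
 step (a hypothetical helper, not used by the kernels):
*/
__device__ static float interp_linear(const float2 *line, float h) {
    int i = (int)h;                      // integer sample index (h is non-negative here)
    float frac = h - floorf(h);          // fractional offset within the bin
    return line[i].x + frac * line[i].y; // .y caches line[i+1].x - line[i].x
}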
__device__ static void hst_cuda_mplinear_kernel_2(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    __shared__ float f_minh[128];
    __shared__ float2 cache_offset[2][2][2][32];   // wrapy * wrapx * halfwrap * proj
    __shared__ uchar2 cache_ioffset[2][2][2][32];  // wrapy * wrapx * halfwrap * proj
    __shared__ float2 cache[IPT * PROJ_LINE];

    float res1[2 * PPT][PPT] = {0.f};
    // float res2[PPT][PPT] = {0.f};
    const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int halfwrap = tidy & 1;
    const int wrapid = halfwrap * 16 + tidx;
    const int wrapx = (tidy / 2) % 2;
    const int wrapy = (tidy / 2) / 2;

    // block-wide minh computation
    const int ttidx = 16 * tidy + tidx;

    const int tbidx = tidy % (PPT / 2);
    const int tbidy = tidy / (PPT / 2);

    // sidx computation and offset caching
    const int sbidx = 4 * (tidy % 4);
    const int sbidy = 4 * (tidy / 4);

    // sidx computation only
    const int stidx = tidx % 4;
    const int stidy = tidx / 4;

    // idx computation only
    const int sidx = sbidx + stidx;
    const int sidy = 2 * sbidy + stidy;

    // x computation and write-out
    const int idx = bidx + PPT * sidx;
    const int idy = bidy + PPT * sidy;

    const float x = idx + apos_off_x;
    const float y = idy + apos_off_y;
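
    /*
     Index roles, as far as they can be read off the arithmetic above (assuming
     the 16x8 thread block implied by ttidx = 16 * tidy + tidx and f_minh[128]):
     two consecutive tidy rows form one 32-thread warp; halfwrap selects the
     half-warp within it and wrapid is the lane id. wrapx/wrapy address one of
     the four warps of the block, matching the first two dimensions of
     cache_offset / cache_ioffset.
    */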
    // const int sbidy = 4 * ((tidy / 2) / 2);
    // const float sbx = bx + BLOCK_SIZE_X * sbidx;
    // const float sby = by + BLOCK_SIZE_Y * sbidy;
    // const int stidy = (2 * (tidy&1) + tidx / 8);
    // const int idx = bidx + PPT * sidx;
    // const int idy = bidy + PPT * sidy;
    const float exp23 = exp2(23.f);

    float projf = tbidy + 0.5f;
    const int num_proj_blocks = num_proj / 32;

    for (int proj_block = 0; proj_block < num_proj_blocks; proj_block += 4) {
        const int proj = proj_block * 32;

        float4 all = c_all[proj + ttidx];
        float minh = floor(all.z + bx * all.x - by * all.y + all.w);
        f_minh[ttidx] = minh;

        __syncthreads();

        // every warp mirrors all 128 minh values into its registers, 4 per lane
        float fminh[4];
        for (int i = 0; i < 4; i++) {
            fminh[i] = f_minh[32 * i + wrapid];
        }

        int max_proj = min(num_proj_blocks - proj_block, 4);
        for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
            // all.w/4 to compensate smaller size
            float4 all = c_all[proj + 32 * subproj32 + wrapid];

            float lminh = __shfl(fminh[subproj32], wrapid, 32);

            float2 offsets1 = make_float2(
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 0)) * all.x - (by + 4 * PPT * (2 * wrapy + 0)) * all.y + (0.25f * all.w) - lminh),
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 0)) * all.x - (by + 4 * PPT * (2 * wrapy + 1)) * all.y + (0.25f * all.w) - lminh));

            float2 offsets2 = make_float2(
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 1)) * all.x - (by + 4 * PPT * (2 * wrapy + 0)) * all.y + (0.25f * all.w) - lminh),
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 1)) * all.x - (by + 4 * PPT * (2 * wrapy + 1)) * all.y + (0.25f * all.w) - lminh));

            cache_offset[wrapy][wrapx][0][wrapid] = offsets1;
            cache_offset[wrapy][wrapx][1][wrapid] = offsets2;

            cache_ioffset[wrapy][wrapx][0][wrapid] = make_uchar2(
                (unsigned char)floor(offsets1.x),
                (unsigned char)floor(offsets1.y));

            cache_ioffset[wrapy][wrapx][1][wrapid] = make_uchar2(
                (unsigned char)floor(offsets2.x),
                (unsigned char)floor(offsets2.y));
            __syncthreads();

            for (int subproj = 0; subproj < (32 / IPT); subproj++) {
                const int loop_proj = 32 * subproj32 + IPT * subproj;
                const int proje = proj + loop_proj;

                float4 all = c_all[proje + tbidy];

                int minh = (int)__shfl(fminh[subproj32], IPT * subproj + tbidy, 32);

                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, minh + pos, proje + projf);
                }

                __syncthreads(); // we may use a fence instead

                // forward differences for linear interpolation: .y = next sample - this sample
                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;
                }

                __syncthreads();
                #pragma unroll 4 // IPT
                for (int p = 0; p < IPT; p++) {
                    float4 all = c_all[proje + p];

                    float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);

                    // try to make an intermediate caching step in variables
                    float2 offsets_my      = cache_offset[wrapy][wrapx][halfwrap][IPT * subproj];
                    // kept for the __shfl_xor cross-half-warp variant (commented out below)
                    float2 offsets_foreign = cache_offset[wrapy][wrapx][!halfwrap][IPT * subproj];
                    uchar2 ioffsets1 = cache_ioffset[wrapy][wrapx][0][IPT * subproj];
                    uchar2 ioffsets2 = cache_ioffset[wrapy][wrapx][1][IPT * subproj];

                    float2 lcache1 = make_float2(
                        cache[p * PROJ_LINE + ioffsets1.x + wrapid].x,
                        cache[p * PROJ_LINE + ioffsets1.y + wrapid].x);

                    float2 lcache2 = make_float2(
                        cache[p * PROJ_LINE + ioffsets2.x + wrapid].x,
                        cache[p * PROJ_LINE + ioffsets2.y + wrapid].x);
                    float h = all.z + x * all.x - y * all.y - minh;

                    #pragma unroll 4 // PPT
                    for (int i = 0; i < PPT; i++) {
                        #pragma unroll 4 // PPT
                        for (int j = 0; j < PPT; j++) {
                            float subh1 = h - (i          ) * all.y + j * all.x - offsets_my.x;
                            float subh2 = h - (i + 4 * PPT) * all.y + j * all.x - offsets_my.y;
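
                            /*
                             The exp23 additions below are the classic float rounding
                             trick: for 0 <= subh < 2^23, subh + 2^23 has a fixed
                             exponent, so the integer part of subh lands in the low
                             mantissa bits. Subtracting 0x4B000000 (the bit pattern of
                             2^23f) from the raw bits yields that integer, and
                             subh - ((subh + 2^23) - 2^23) recovers the fractional part.
                            */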
                            float fsubh1 = subh1 + exp23;
                            int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;
                            fsubh1 = subh1 - (fsubh1 - exp23);

                            float fsubh2 = subh2 + exp23;
                            int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;
                            fsubh2 = subh2 - (fsubh2 - exp23);

                            // int foreign1 = __shfl_xor(idx1, 0x10, 32);
                            // int foreign2 = __shfl_xor(idx2, 0x10, 32);
                            float2 c1, c2;
                            c1.x = __shfl(lcache1.x, idx1, 32);
                            c2.x = __shfl(lcache2.x, idx1, 32);

                            c1.y = __shfl(lcache1.y, idx2, 32);
                            c2.y = __shfl(lcache2.y, idx2, 32);

                            //c = halfwrap?c2:c1;

                            res1[i][j] += (halfwrap) ? c2.x : c1.x;
                            res1[4 + i][j] += (halfwrap) ? c2.y : c1.y;
                            /*
                             Direct shared-memory variant of the same accumulation; it
                             writes the same res1 cells as the __shfl path above, so
                             only one of the two should be enabled at a time:

                            float2 c1 = cache[p * PROJ_LINE + idx1];
                            res1[i][j] += c1.x + fsubh1 * c1.y;

                            float2 c2 = cache[p * PROJ_LINE + idx2];
                            res1[4 + i][j] += c2.x + fsubh2 * c2.y;
                            */
                        }
                    }
                }
                __syncthreads();
            }
        }
    }

    float4 *result = (float4*)(d_SLICE + PPT * BLOCK_SIZE_X * gridDim.x * idy + idx);
    for (int i = 0; i < PPT; i++) {
        result[i * gridDim.x * BLOCK_SIZE_X] = *(float4*)&res1[i][0];
        result[(i + 16) * gridDim.x * BLOCK_SIZE_X] = *(float4*)&res1[4 + i][0]; // 16 = 4 * PPT rows below
    }
}
__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    const int mode = 0; //((gridDim.x * blockIdx.y + blockIdx.x)/8) % 2;

    if (mode)
        hst_cuda_mplinear_kernel_1(num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
    else
        hst_cuda_mplinear_kernel_2(num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
}
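
/*
 A possible host-side launch, as a sketch only. The launcher name is hypothetical,
 and the block shape (16, 8) and the per-block pixel footprint of
 (PPT * BLOCK_SIZE_X) x (PPT * BLOCK_SIZE_Y) are assumptions inferred from
 ttidx = 16 * tidy + tidx, the 128-entry f_minh array, and the indexing above.
*/
static void hst_cuda_launch_mplinear(int num_proj, int num_bins, float *d_slice,
        float off_x, float off_y, int batch, int size_x, int size_y) {
    dim3 threads(16, 8);   // assumed block shape: 128 threads
    dim3 blocks(size_x / (PPT * BLOCK_SIZE_X), size_y / (PPT * BLOCK_SIZE_Y));
    hst_cuda_mplinear_kernel<<<blocks, threads>>>(num_proj, num_bins, d_slice, off_x, off_y, batch);
}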