/tomo/pyhst : revision 168

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to dfi_cuda/dfi_cuda_bp_kernels.h

Committer: Suren A. Chilingaryan
Date: 2013-06-14 15:30:33 UTC
Revision ID: csa@dside.dyndns.org-20130614153033-t9b56hr4jdkd3ul8

Placeholders for DFI implementation

files added:
dfi_cuda

dfi_cuda/CMakeLists.txt

dfi_cuda/dfi_cuda.cu

dfi_cuda/dfi_cuda.h

dfi_cuda/dfi_cuda_bp_kernels.h

dfi_cuda/dfi_cuda_defines.h

dfi_cuda/dfi_cuda_kernels.h

files modified:
CMakeLists.txt

PyHST.py

PyHST_c.c

hst.c

hst_cpu/hst_cpu.c

hst_setup.h

setup.py

Show diffs side-by-side

added added

removed removed

dfi_cuda/dfi_cuda_bp_kernels.h

* European Synchrotron Radiation Facility (ESRF) and

* Karlsruhe Institute of Technology (KIT).

* PyHST is free software: you can redistribute it and/or modify it

* under the terms of the GNU General Public License as published by the

* Free Software Foundation, either version 3 of the License, or

* (at your option) any later version.

* hst is distributed in the hope that it will be useful, but

* WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

* See the GNU General Public License for more details.

* You should have received a copy of the GNU General Public License along

* with this program. If not, see <http://www.gnu.org/licenses/>.

__constant__ float4 c_all[ MAXNPROJECTIONS ] ; //!< (cosinuses, sinuses, axis position, min bin

texture<float, 2, cudaReadModeElementType> tex_projes;

__global__ static void hst_cuda_unpack_kernel_fai360(cufftReal *out, int dpitch, cufftReal *data, int spitch, int half, float *params, int batch) {

float tmp1, tmp2;

int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

int dest_vector = (idx >= half)?1:0;

int dest_pos = 2 * (idx - dest_vector * half);

int src_pos = 2 * (idy * spitch + dest_pos) + dest_vector;

int src_vector = dest_vector;//(src_pos >= 0);

tmp1 = data[src_pos];

tmp2 = data[src_pos + 2];

float axis_position_corr = c_all[batch + 2*idy + src_vector].z;

float flat_zone = params[idy * 4 + 2*src_vector];

float param = params[idy * 4 + 2*src_vector + 1];

int pos = (2 * idy + dest_vector)*dpitch + dest_pos;

float apc_plus_flat = axis_position_corr + flat_zone;

float apc_param_flat = apc_plus_flat + param - dest_pos;

int flag1 = dest_pos < apc_plus_flat;

float multiplier1 = __fdividef((flag1?(apc_param_flat - 2 * flat_zone):apc_param_flat),param);

int flag2 =(dest_pos + 1) < apc_plus_flat;

float multiplier2 = __fdividef((flag2?(apc_param_flat - 2 * flat_zone):apc_param_flat) - 1,param);

if (param > 0) {

out[pos] = tmp1 * min(2., max(flag1?1.:0., multiplier1));

out[pos+1] = tmp2 * min(2., max(flag2?1.:0., multiplier2));

} else {

out[pos] = tmp1 * max(0., min(flag1?1.:2., multiplier1));

out[pos+1] = tmp2 * max(0., min(flag2?1.:2., multiplier2));

}

__global__ static void hst_cuda_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;

const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;

float h;

float res=0;

const float bx = idx + apos_off_x;

const float by = idy + apos_off_y;

#pragma unroll 8

for (int proje=0; proje<num_proj; proje++) {

const float4 all = c_all[proje];

h = all.z + bx * all.x - by * all.y;

res=res+tex2D(tex_projes, h, proje + 0.5f) ;

}

d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;

}

__global__ static void hst_kepler_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

float h;

float res[4][4] = {0};

#ifdef HST_OPTIMIZE_KEPLER

__shared__ float buf[16][18]; // 64b for kepler

__shared__ float fill[48];

__shared__ float fin[16][18];

#else // HST_OPTIMIZE_KEPLER

__shared__ float buf[16][17]; // 32b for Fermi & GT200

__shared__ float fill[56];

__shared__ float fin[16][17];

100

#endif // HST_OPTIMIZE_KEPLER

101

102

const int tidx = threadIdx.x;

103

const int tidy = threadIdx.y;

104

105

const int bidx = blockIdx.x * BLOCK_SIZE_X;

106

const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;

107

108

const int sidx = tidx % 4;

109

const int sidy = tidx / 4;

110

111

const float x = bidx + sidx + apos_off_x;

112

const float y = bidy + sidy + apos_off_y;

113

114

const float projf = tidy + 0.5f;

115

116

for (int proje=0; proje<num_proj; proje+=16) {

117

const float4 all = c_all[proje+tidy];

118

h = all.z + x * all.x - y * all.y;

119

120

#pragma unroll 4

121

for (int i = 0; i < 4; i++) {

122

#pragma unroll 4

123

for (int j = 0; j < 4; j++) {

124

float subh = h + 4 * j * all.x - 4 * i * all.y;

125

res[i][j] += tex2D(tex_projes, subh, proje + projf);

126

}

127

}

128

129

}

130

131

132

#pragma unroll 4

133

for (int i = 0; i < 4; i++) {

134

#pragma unroll 4

135

for (int j = 0; j < 4; j++) {

136

buf[tidx][tidy] = res[i][j];

137

138

__syncthreads();

139

140

141

#ifdef HST_OPTIMIZE_KEPLER

142

float val = buf[tidy][tidx];

143

for (int k=16; k>=1; k/=2)

144

val += __shfl_xor(val, k, 16);

145

#else // HST_OPTIMIZE_KEPLER

146

volatile float *ptr = &buf[tidy][0];

147

for (int k=8; k>=1; k/=2)

148

ptr[tidx] += ptr[tidx+k];

149

float val = ptr[0];

150

#endif // HST_OPTIMIZE_KEPLER

151

152

const int rx = 4 * j + tidy%4;

153

const int ry = 4 * i + tidy/4;

154

155

if (!tidx) {

156

fin[ry][rx] = val;

157

}

158

159

__syncthreads();

160

}

161

}

162

163

164

const int idx = bidx + tidx;

165

const int idy = bidy + tidy;

166

167

d_SLICE[BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];

168

}

169

170

171

172

__global__ static void hst_cuda_nearest_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

173

float2 res[2] = {0.f, 0.f, 0.f, 0.f};

174

175

const int tidx = threadIdx.x;

176

const int tidy = threadIdx.y;

177

178

const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;

179

const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;

180

181

const int idx = bidx + 2 * tidx;

182

const int idy = bidy + 2 * tidy;

183

184

const float bx = bidx + apos_off_x;

185

const float by = bidy + apos_off_y;

186

187

const float x = idx + apos_off_x;

188

const float y = idy + apos_off_y;

189

190

191

__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];

192

__shared__ float cache_sin[BLOCK_SIZE_Y];

193

__shared__ float cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X];

194

195

//#pragma unroll 8

196

float projf = tidy + 0.5;

197

for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {

198

float4 all = c_all[proje + tidy];

199

int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);

200

201

float subh = (all.z + x * all.x - minh);

202

cache_subh[2*tidx][tidy] = subh;

203

cache_subh[2*tidx+1][tidy] = subh + all.x;

204

if (tidx == 0) cache_sin[tidy] = all.y;

205

206

207

#pragma unroll 3

208

for (int i = 0; i < 3; i++) {

209

int pos = i * BLOCK_SIZE_X + tidx;

210

cache[tidy][pos] = tex2D(tex_projes, minh + pos, proje + projf);

211

}

212

213

__syncthreads();

214

215

#pragma unroll 16

216

for (int i = 0; i < BLOCK_SIZE_Y; i++) {

217

int4 minh = {

218

cache_subh[2 * tidx][i] - (y) * cache_sin[i],

219

cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],

220

cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],

221

cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],

222

};

223

224

res[0].x += cache[i][minh.x];

225

res[0].y += cache[i][minh.y];

226

res[1].x += cache[i][minh.z];

227

res[1].y += cache[i][minh.w];

228

229

}

230

231

232

__syncthreads();

233

234

}

235

236

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = res[0];

237

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = res[1];

238

}

239

240

241

__global__ static void hst_cuda_linear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

242

float4 res = {0.f, 0.f, 0.f, 0.f};

243

244

const int tidx = threadIdx.x;

245

const int tidy = threadIdx.y;

246

247

const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;

248

const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;

249

250

const int idx = bidx + 2 * tidx;

251

const int idy = bidy + 2 * tidy;

252

253

const float bx = bidx + apos_off_x;

254

const float by = bidy + apos_off_y;

255

256

const float x = idx + apos_off_x;

257

const float y = idy + apos_off_y;

258

259

260

__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];

261

__shared__ float cache_sin[BLOCK_SIZE_Y];

262

__shared__ float2 cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X + 1];

263

264

//#pragma unroll 8

265

float projf = tidy + 0.5;

266

for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {

267

float4 all = c_all[proje + tidy];

268

int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);

269

270

float subh = (all.z + x * all.x - minh);

271

cache_subh[2*tidx][tidy] = subh;

272

cache_subh[2*tidx+1][tidy] = subh + all.x;

273

if (tidx == 0) cache_sin[tidy] = all.y;

274

275

276

#pragma unroll 3

277

for (int i = 0; i < 3; i++) {

278

int pos = i * BLOCK_SIZE_X + tidx;

279

cache[tidy][pos].x = tex2D(tex_projes, minh + pos, proje + projf);

280

}

281

282

__syncthreads();

283

284

#pragma unroll 3

285

for (int i = 0; i < 3; i++) {

286

int pos = i * BLOCK_SIZE_X + tidx;

287

cache[tidy][pos].y = cache[tidy][pos + 1].x - cache[tidy][pos].x;

288

}

289

290

__syncthreads();

291

292

#pragma unroll 16

293

for (int i = 0; i < BLOCK_SIZE_Y; i++) {

294

float2 c;

295

float4 v1, v2;

296

float4 minh = {

297

cache_subh[2 * tidx][i] - (y) * cache_sin[i],

298

cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],

299

cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],

300

cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],

301

};

302

303

c = cache[i][(int)minh.x];

304

v1.x = c.x; v2.x = c.y;

305

c = cache[i][(int)minh.y];

306

v1.y = c.x; v2.y = c.y;

307

c = cache[i][(int)minh.z];

308

v1.z = c.x; v2.z = c.y;

309

c = cache[i][(int)minh.w];

310

v1.w = c.x; v2.w = c.y;

311

312

res += v1 + (minh - floorf(minh)) * v2;

313

}

314

315

__syncthreads();

316

317

}

318

319

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = make_float2(res.x, res.y);

320

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = make_float2(res.z, res.w);

321

}

322

323

324

325

__global__ static void hst_cuda_oversample4_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

326

float2 res[2] = {0.f, 0.f, 0.f, 0.f};

327

328

const int tidx = threadIdx.x;

329

const int tidy = threadIdx.y;

330

331

const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;

332

const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;

333

334

const int idx = bidx + 2 * tidx;

335

const int idy = bidy + 2 * tidy;

336

337

const float bx = bidx + apos_off_x;

338

const float by = bidy + apos_off_y;

339

340

const float x = idx + apos_off_x;

341

const float y = idy + apos_off_y;

342

343

344

__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];

345

__shared__ float cache_sin[BLOCK_SIZE_Y];

346

__shared__ float cache[BLOCK_SIZE_Y][12 * BLOCK_SIZE_X];

347

348

//#pragma unroll 8

349

float projf = tidy + 0.5;

350

for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {

351

float4 all = c_all[proje + tidy];

352

int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);

353

354

float subh = 4 * (all.z + x * all.x - minh);

355

cache_subh[2*tidx][tidy] = subh;

356

cache_subh[2*tidx+1][tidy] = subh + 4 * all.x;

357

if (tidx == 0) cache_sin[tidy] = 4 * all.y;

358

359

360

#pragma unroll 12

361

for (int i = 0; i < 12; i++) {

362

int pos = i * BLOCK_SIZE_X + tidx;

363

cache[tidy][pos] = tex2D(tex_projes, minh + ((float)pos)/4, proje + projf);

364

}

365

366

__syncthreads();

367

368

#pragma unroll 16

369

for (int i = 0; i < BLOCK_SIZE_Y; i++) {

370

int4 minh = {

371

cache_subh[2 * tidx][i] - (y) * cache_sin[i],

372

cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],

373

cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],

374

cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],

375

};

376

377

res[0].x += cache[i][minh.x];

378

res[0].y += cache[i][minh.y];

379

res[1].x += cache[i][minh.z];

380

res[1].y += cache[i][minh.w];

381

382

}

383

384

385

__syncthreads();

386

387

}

388

389

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = res[0];

390

*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = res[1];

391

}

392

393

394

// Both kernels have problems with pre sm2 builds. Can't tell.

395

396

#define PPT 4

397

// Iterations per line

398

#define IPT (16/PPT)

399

//#define PROJ_LINE (2 * IPT * PPT * 3)

400

#define PROJ_LINE (PPT * BLOCK_SIZE_X * 3 / 2)

401

402

403

__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

404

float res1[PPT][PPT] = {0.f};

405

float res2[PPT][PPT] = {0.f};

406

407

const int tidx = threadIdx.x;

408

const int tidy = threadIdx.y;

409

410

const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;

411

const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

412

413

414

const float bx = bidx + apos_off_x;

415

const float by = bidy + apos_off_y;

416

417

__shared__ float f_minh[128];

418

__shared__ float2 cache[IPT * PROJ_LINE + 1];

419

420

const int tbidy = tidy/(PPT/2);

421

const int tbidx = tidy%(PPT/2);

422

423

const int wrapid = (tidy&1) * 16 + tidx;

424

425

const int sbidx = 8 * ((tidy / 2) % 2);

426

const int sbidy = 8 * ((tidy / 2) / 2);

427

428

const int stidx = tidx % 8;

429

const int stidy = 2 * (2 * (tidy&1) + tidx / 8);

430

431

const int sidx = (sbidx + stidx);

432

const int sidy = (sbidy + stidy);

433

434

const int idx = bidx + sidx;

435

const int idy = bidy + sidy;

436

437

const float x = idx + apos_off_x;

438

const float y = idy + apos_off_y;

439

440

const float exp23 = exp2(23.f);

441

442

const int ttidx = 16 * tidy + tidx;

443

444

float projf = tbidy + 0.5f;

445

446

const int num_proj_blocks = num_proj / 32;

447

448

for (int proj_block=0; proj_block<num_proj_blocks; proj_block += 4) {

449

const int proj = proj_block * 32;

450

451

float4 all = c_all[proj + ttidx];

452

float minh = floor(all.z + bx * all.x - by * all.y + all.w);

453

f_minh[ttidx] = minh;

454

455

__syncthreads();

456

457

float fminh[4];

458

459

#pragma unroll 4

460

for (int i = 0; i < 4; i++) {

461

fminh[i] = f_minh[32 * i + wrapid];

462

}

463

464

int max_proj = min(num_proj_blocks - proj_block, 4);

465

466

#pragma unroll 1

467

for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {

468

#pragma unroll 1

469

for (int subproj = 0; subproj < (32/IPT); subproj++) {

470

const int loop_proj = 32 * subproj32 + IPT * subproj;

471

const int proje = proj + loop_proj;

472

473

float4 all = c_all[proje + tbidy];

474

475

#ifdef HST_OPTIMIZE_KEPLER

476

int minh = __shfl(fminh[subproj32], IPT * subproj + tbidy, 32);

477

#else // HST_OPTIMIZE_KEPLER

478

int minh = f_minh[loop_proj + tbidy];

479

#endif // HST_OPTIMIZE_KEPLER

480

481

#pragma unroll 3

482

for (int i = 0; i < 3; i++) {

483

int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;

484

485

cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, minh + pos, proje + projf);

486

}

487

488

// we may use fence instead

489

//__syncthreads();

490

491

#pragma unroll 3

492

for (int i = 0; i < 3; i++) {

493

int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;

494

cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;

495

}

496

497

__syncthreads();

498

499

#pragma unroll 4 // IPT

500

for (int p = 0; p < IPT; p++) {

501

float4 all = c_all[proje + p];

502

503

#ifdef HST_OPTIMIZE_KEPLER

504

float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);

505

#else // HST_OPTIMIZE_KEPLER

506

float minh = f_minh[loop_proj + p];

507

#endif // HST_OPTIMIZE_KEPLER

508

509

float h = all.z + x * all.x - y * all.y - minh;

510

511

512

char *cache_row = ((char*)(&cache)) + p * PROJ_LINE * sizeof(float2);

513

514

515

#pragma unroll 4 // PPT

516

for (int i = 0; i < PPT; i++) {

517

float subh = h;

518

h -= 16.f * all.y;

519

#pragma unroll 4 // PPT

520

for (int j = 0; j < PPT; j++) {

521

float subh1 = subh;

522

float subh2 = subh - all.y;

523

524

525

float fsubh1 = subh1 + exp23;

526

int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;

527

fsubh1 = subh1 - (fsubh1 - exp23);

528

529

float fsubh2 = subh2 + exp23;

530

int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;

531

fsubh2 = subh2 - (fsubh2 - exp23);

532

533

534

float2 c1 = cache[p * PROJ_LINE + idx1];

535

res1[i][j] += c1.x + fsubh1 * c1.y;

536

537

float2 c2 = cache[p * PROJ_LINE + idx2];

538

res2[i][j] += c2.x + fsubh2 * c2.y;

539

540

subh += 16.f * all.x;

541

}

542

}

543

544

}

545

}

546

__syncthreads();

547

}

548

}

549

550

#pragma unroll 4

551

for (int i = 0; i < PPT; i++) {

552

#pragma unroll 4

553

for (int j = 0; j < PPT; j++) {

554

d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res1[i][j];

555

d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y + 1) + idx + j * BLOCK_SIZE_X] = res2[i][j];

556

}

557

}

558

}

559

560

561

#undef PROJ_LINE

562

#define PROJ_LINE (2 * PPT * BLOCK_SIZE_X * 3)

563

564

__global__ static void hst_cuda_mpoversample4_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

565

float res[PPT][PPT];

566

567

const int tidx = threadIdx.x;

568

const int tidy = threadIdx.y;

569

570

const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;

571

const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

572

573

574

const float bx = bidx + apos_off_x;

575

const float by = bidy + apos_off_y;

576

577

__shared__ float f_minh[256];

578

__shared__ float cache[IPT * PROJ_LINE];

579

580

const int tbidy = tidy/(16 / IPT);

581

const int tbidx = tidy%(16 / IPT);

582

583

const int wrapid = (tidy&1) * 16 + tidx;

584

585

const int sbidx = tidy % 4;

586

const int sbidy = tidy / 4;

587

588

const int stidx = tidx % 4;

589

const int stidy = tidx / 4;

590

591

const int sidx = (4 * sbidx + stidx);

592

const int sidy = (4 * sbidy + stidy);

593

594

const int idx = bidx + sidx;

595

const int idy = bidy + sidy;

596

597

const float x = idx + apos_off_x;

598

const float y = idy + apos_off_y;

599

600

const int ttidx = 16 * tidy + tidx;

601

602

float projf = tbidy + 0.5f;

603

604

const int num_proj_blocks = num_proj / 32;

605

606

for (int proj_block=0; proj_block<num_proj_blocks; proj_block += 8) {

607

const int proj = proj_block * 32;

608

609

float4 all = c_all[proj + ttidx];

610

float minh = floor(all.z + bx * all.x - by * all.y + all.w);

611

f_minh[ttidx] = minh;

612

613

__syncthreads();

614

615

#ifdef HST_OPTIMIZE_KEPLER

616

float fminh[8];

617

618

#pragma unroll 8

619

for (int i = 0; i < 8; i++) {

620

fminh[i] = f_minh[32 * i + wrapid];

621

}

622

#endif // HST_OPTIMIZE_KEPLER

623

624

int max_proj = min(num_proj_blocks - proj_block, 8);

625

626

#pragma unroll 1

627

for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {

628

#pragma unroll 1

629

for (int subproj = 0; subproj < (32/IPT); subproj++) {

630

const int loop_proj = 32 * subproj32 + IPT * subproj;

631

const int proje = proj + loop_proj;

632

633

float4 all = c_all[proje + tbidy];

634

635

#ifdef HST_OPTIMIZE_KEPLER

636

int minh = __shfl(fminh[subproj32], IPT * subproj + tbidy, 32);

637

#else // HST_OPTIMIZE_KEPLER

638

int minh = f_minh[loop_proj + tbidy];

639

#endif // HST_OPTIMIZE_KEPLER

640

641

642

#pragma unroll 6

643

for (int i = 0; i < 6; i++) {

644

int pos = (i * PPT + tbidx) * BLOCK_SIZE_X + tidx;

645

cache[PROJ_LINE * tbidy + pos] = tex2D(tex_projes, minh + 0.25f * pos, proje + projf);

646

}

647

648

649

__syncthreads();

650

651

#pragma unroll 4 // IPT

652

for (int p = 0; p < IPT; p++) {

653

float4 all = 4 * c_all[proje + p];

654

all.y = -all.y;

655

656

#ifdef HST_OPTIMIZE_KEPLER

657

float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);

658

#else // HST_OPTIMIZE_KEPLER

659

float minh = f_minh[loop_proj + p];

660

#endif // HST_OPTIMIZE_KEPLER

661

662

float h = all.z + x * all.x + y * all.y - 4 * minh;

663

664

all.x *= 16.f;

665

all.y *= 16.f;

666

667

#pragma unroll 4 // PPT

668

for (int i = 0; i < PPT; i++) {

669

#pragma unroll 4 // PPT

670

for (int j = 0; j < PPT; j++) {

671

float subh = h + i * all.y + j * all.x;

672

res[i][j] += cache[p*PROJ_LINE + (int)subh];//cache[p * PROJ_LINE + (int)subh];

673

}

674

}

675

676

677

}

678

}

679

__syncthreads();

680

681

}

682

}

683

684

#pragma unroll 4

685

for (int i = 0; i < PPT; i++) {

686

#pragma unroll 4

687

for (int j = 0; j < PPT; j++) {

688

d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res[i][j];

689

}

690

}

691

}

Older »