/tomo/pyhst : revision 228

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to hst_cuda/hst_cuda_bp_kernels.h

Committer: Suren A. Chilingaryan
Date: 2017-09-28 10:34:47 UTC
Revision ID: csa@suren.me-20170928103447-dggjgnuxmymgew2a

Quality fixes (tex-based)

files modified:
CMakeLists.txt

PyHST.py

PyHST_c.c

hst.c

hst_cuda/CMakeLists.txt

hst_cuda/hst_cuda.cu

hst_cuda/hst_cuda_bp_kepler.h

hst_cuda/hst_cuda_bp_kernels.h

hst_cuda/hst_cuda_bp_mplinear.h

hst_cuda/hst_cuda_defines.h

hst_cuda/hst_cuda_kernels.h

setup.py

Show diffs side-by-side

added added

removed removed

hst_cuda/hst_cuda_bp_kernels.h

#define mkf4(xy, zw) make_float4(xy.x, xy.y, zw.x, zw.y)

#define HST_MACRO_JOIN(a, b) a ## b

#include <math_constants.h>

#define M_PI_F CUDART_PI_F

}

#include "hst_cuda_bp_kepler.h"

142

145

# undef HYBRID_KEPLER

143

146

# else

144

147

145

__device__ static void hst_cuda_linear_companion(cudaTextureObject_t texptr, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

148

__device__ static void hst_cuda_linear_companion(cudaTextureObject_t texptr, const float * __restrict__ g_all, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

146

149

vfloat res[PPT][PPT] = {0};

147

const int idx = PPT * blockIdx.x * BLOCK_SIZE_X + threadIdx.x;

148

const int idy = PPT * blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;

150

151

const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);

152

const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);

153

154

// const int sidx = threadIdx.x;

155

// const int sidy = threadIdx.y;

156

157

const int idx = PPT * blockIdx.x * BLOCK_SIZE_X + sidx;

158

const int idy = PPT * blockIdx.y * BLOCK_SIZE_Y + sidy + batch;

149

159

150

160

float h;

151

161

const float bx = idx + apos_off_x;

152

162

const float by = idy + apos_off_y;

153

163

154

164

// DS: float loops?

155

165

for (int proje=0; proje<num_proj; proje++) {

156

166

const float4 all = c_all[proje];

157

167

h = all.z + bx * all.x - by * all.y;

192

202

# if defined(HST_SET_BOUNDS)

193

203

__launch_bounds__(256, HST_NEWTEX_MIN_BLOCKS)

194

204

# endif /* HST_SET_BOUNDS */

195

__global__ static void hst_cuda_linear_hybrid(cudaTextureObject_t texptr, float *g_all, const vfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

205

__global__ static void hst_cuda_linear_hybrid(cudaTextureObject_t texptr, const float * __restrict__ g_all, const vfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

196

206

// int block = (blockIdx.y * gridDim.x + blockIdx.x)/16;

197

207

198

208

203

213

}

204

214

__syncthreads();

205

215

206

//was &7<4

207

216

if ((block&0x7)<4) {

208

217

hst_cuda_alu_linear(texptr, g_all, sino, num_proj, num_bins, bin_pitch, d_SLICE, apos_off_x, apos_off_y, batch);

209

218

} else {

213

222

const int bidx = (PPT * blockIdx.x + j) * HST_LINEAR_BLOCK;

214

223

const int bidy = batch + (PPT * blockIdx.y + i) * HST_LINEAR_BLOCK;

215

224

216

hst_cuda_linear_companion(texptr, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, bidx, bidy);

225

hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, bidx, bidy);

217

226

}

218

227

}

219

228

#else

220

hst_cuda_linear_companion(texptr, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);

229

hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);

221

230

#endif

222

231

}

223

232

Older »