/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 */
/*
 * NOTE(review): this chunk is a corrupted extraction — the bare numeric
 * lines ("23", "24", ...) are leftover original source line numbers, and
 * several lines are missing from this copy (the declarations of tmp1/tmp2,
 * the read of data[pos], the stores into `out`, and the closing brace).
 * Restore the pristine file; do not compile or hand-repair this fragment.
 *
 * Apparent purpose (hedged — confirm against the complete source): packs
 * two consecutive rows of `data` (row pitch `spitch`) into an interleaved
 * even/odd layout in `out` (row pitch `dpitch`), one element per thread of
 * a 2D BLOCK_SIZE x BLOCK_SIZE launch.
 */
__global__ static void hst_cuda_pack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch) {
23
/* Global 2D thread coordinates. */
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
24
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
26
/* Source index: element idx of row 2*idy of the spitch-pitched input. */
int pos = 2 * idy * spitch + idx;
29
/* tmp2 is undeclared here — its declaration line was lost in extraction. */
tmp2 = data[pos + spitch];
31
/* Destination index: pairs interleaved in the dpitch-pitched output. */
pos = 2 * (idy * dpitch + idx);
37
/*
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the declarations of pos/tmp1/tmp2, the guard that
 * presumably uses `num_bins` (zero-padding variant), the stores into
 * `out`, and the closing brace are missing from this copy.
 *
 * Apparent purpose (hedged): same packing as hst_cuda_pack_kernel, but
 * zero-padding elements beyond `num_bins` — confirm against the pristine
 * source.
 */
__global__ static void hst_cuda_pack_kernel_zpad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins) {
40
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
41
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
46
/* pos is undeclared here — its declaration line was lost in extraction. */
pos = 2 * idy * spitch + idx;
49
tmp2 = data[pos + spitch];
55
pos = 2 * (idy * dpitch + idx);
61
/*
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the opening `if (idx < num_bins)` (or similar), the
 * declarations of pos/tmp1/tmp2, the final `else` opener, the stores into
 * `out`, and the closing braces are missing from this copy.
 *
 * Apparent purpose (hedged): edge-padded packing variant — in-range bins
 * are copied; for idx in [num_bins, mid) the last bin (num_bins - 1) is
 * replicated; for idx >= mid the first bin (offset 0) is replicated.
 * Confirm against the pristine source.
 */
__global__ static void hst_cuda_pack_kernel_epad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins, int mid) {
64
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
65
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
69
/* In-range case: read the actual bin. */
pos = 2 * idy * spitch + idx;
72
tmp2 = data[pos + spitch];
73
} else if (idx < mid) {
74
/* Right padding: replicate the last valid bin. */
pos = 2 * idy * spitch + num_bins - 1;
77
tmp2 = data[pos + spitch];
79
/* Remaining case (its `else` opener is missing): replicate bin 0. */
pos = 2 * idy * spitch;
82
tmp2 = data[pos + spitch];
85
pos = 2 * (idy * dpitch + idx);
92
/*
 * Multiply each row of `data` (rows of length/pitch `vector_size`,
 * one row per y-block/thread) elementwise by `filter[idx]` — presumably
 * the frequency-domain filtering step; confirm against the caller.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy.
 */
__global__ static void hst_cuda_filter_kernel(int vector_size, float *data, const float *filter) {
93
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
94
/* Flat index: element idx of row (blockIdx.y*BLOCK_SIZE + threadIdx.y). */
int id = idx + (blockIdx.y * BLOCK_SIZE + threadIdx.y)*vector_size;
96
data[id] *= filter[idx];
101
flat_zone_start = (axis_position_corr - flat_zone)
102
dx = flat_zone_start - i
103
multiplier = (2 * prof_shift + prof_fact) + dx * (prof_fact / pente_zone)
105
(2 * prof_shift + prof_fact) is always 1
108
1 + prof_fact * min(1, (flat_zone_start - i) / pente_zone)
110
param2 is prof_fact / pente_zone
111
param1 is flat_zone_start + 1/param2
112
multiplier = max(0, (param1 - i)*param2)
114
param2 is prof_fact / pente_zone
115
param1 is flat_zone_start + 1/param2
116
multiplier = min(2, (param1 - i)*param2)
119
flat_zone_start = (axis_position_corr + flat_zone)
120
dx = flat_zone_start - i
122
multiplier = (2 * prof_shift + prof_fact) + dx * (prof_fact / pente_zone)
124
(2 * prof_shift + prof_fact) is always 1
127
1 + prof_fact * max(-1, (flat_zone_start - i) / pente_zone)
129
param2 is prof_fact / pente_zone
130
param1 is flat_zone_start + 1/param2
131
multiplier = min(2, (param1 - i)*param2)
133
param2 is prof_fact / pente_zone
134
param1 is flat_zone_start + 1/param2
135
multiplier = max(0, (param1 - i)*param2)
137
/*
 * Scale data[i] by the clamped ramp max(0, (param1 - i) * param2); see the
 * derivation comment above for what param1/param2 encode.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy. Also note the double literal
 * `0.` promotes the expression to double math; the pristine source should
 * be checked for whether 0.f was intended.
 */
__global__ static void limit_kernel_max(float *data, float param1, float param2) {
138
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
140
data[i] *= max(0., (param1 - i) * param2);
143
/*
 * Scale data[i] by the clamped ramp min(2, (param1 - i) * param2); mirror
 * of limit_kernel_max — see the derivation comment above.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy. The `2.` double literal
 * promotes to double math; verify against the pristine source.
 */
__global__ static void limit_kernel_min(float *data, float param1, float param2) {
144
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
146
data[i] *= min(2., (param1 - i) * param2);
/* The next functions are joint (fused left/right variants). */
/*
 * Joint (fused) limiter: points left of `first_right` use param1a with a
 * lower clamp of 1, points at/after it use param1b with a lower clamp of
 * 0; everything is capped at 2. The branchless ?: selects keep the warp
 * convergent.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy; `1.`/`0.`/`2.` are double
 * literals (double-precision min/max).
 */
__global__ static void limit_kernel_positive(float *data, int first_right, float param1a, float param1b, float param2) {
152
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
153
/* flag == 1 on the left side of the split point. */
int flag = i < first_right;
155
data[i] *= min(2., max(flag?1.:0., ((flag?param1a:param1b) - i) * param2));
158
/*
 * Mirror of limit_kernel_positive for the opposite ramp sign: left-side
 * points cap at 1, right-side points cap at 2, with a floor of 0.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy; double literals as above.
 */
__global__ static void limit_kernel_negative(float *data, int first_right, float param1a, float param1b, float param2) {
159
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
160
int flag = i < first_right;
162
data[i] *= max(0., min(flag?1.:2., ((flag?param1a:param1b) - i) * param2));
/* And now completely blocked and joint: */
/*
 * Per-row limiting over a 2D pitched buffer: each row idy gets its axis
 * position from constant memory (c_axiss) and two packed floats from
 * `params` (flat_zone at [idy*2], the ramp parameter at [idy*2+1]);
 * `ystart` offsets the y range for chunked launches.
 *
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the two unconditional `data[i] *=` statements below were
 * almost certainly alternative branches (an if/else whose condition line
 * is missing — as written both would apply). Also missing: any bounds
 * guard and the closing brace. Restore from the pristine source.
 */
__global__ static void hst_cuda_limit_kernel(float *data, int pitch, int ystart, float *params) {
166
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
167
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y + ystart;
168
int i = idy * pitch + idx;
170
/* Per-row rotation-axis position from constant memory. */
float axis_position_corr = c_axiss[idy];
171
/* Two params per row: width of the flat zone and the ramp parameter. */
float flat_zone = params[idy * 2];
172
float param = params[idy * 2 + 1];
174
/* flag == 1 left of the flat zone's right edge. */
int flag = idx < (axis_position_corr + flat_zone);
176
float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
180
/* NOTE(review): presumably one of these two applied per ramp sign. */
data[i] *= min(2., max(flag?1.:0., multiplier));
182
data[i] *= max(0., min(flag?1.:2., multiplier));
188
/*
 * Inverse of the pack kernels (apparent purpose — hedged): de-interleave
 * adjacent pairs of `data` (pitch `spitch`) back into two consecutive rows
 * of `out` (pitch `dpitch`).
 *
 * NOTE(review): corrupted extraction — the declarations of tmp1/tmp2, the
 * read `tmp1 = data[pos];`, the store `out[pos] = tmp1;`, any bounds
 * guard, and the closing brace are missing from this copy.
 */
__global__ static void hst_cuda_unpack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch) {
191
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
192
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
194
/* Source index of the interleaved pair. */
int pos = 2 * (idy * spitch + idx);
197
tmp2 = data[pos + 1];
199
/* Destination: element idx of row 2*idy; pair partner one row below. */
pos = 2 * idy * dpitch + idx ;
202
out[pos + dpitch] = tmp2;
/*
 * This is the backup variant. Theoretically, the first function should work
 * slightly faster due to the reduced number of memory accesses; in
 * practice, however, the difference could hardly be observed.
 */
/*
 * Unpack combined with half-turn (360-degree acquisition, "fai360")
 * weighting: each of the two de-interleaved rows gets its own axis
 * position from constant memory and its own flat_zone/param pair, then the
 * same clamped-ramp multiplier as hst_cuda_limit_kernel is applied.
 *
 * NOTE(review): corrupted extraction — declarations of tmp1/tmp2, the read
 * of data[pos], the `else` structure around the paired multiply lines, any
 * use of `half`, and the closing brace are missing from this copy. Two
 * apparent inconsistencies to verify against the pristine source (missing
 * lines may explain them):
 *   - first row indexes params with stride 4 (params[idy*4]) but the
 *     second row uses stride 2 after ++idy (params[idy*2]);
 *   - first row reads c_axiss[batch + 2*idy] but the second reads
 *     c_axiss[batch + (++idy)], i.e. +1 rather than +2.
 */
__global__ static void hst_cuda_unpack_kernel_fai360(cufftReal *out, int dpitch, cufftReal *data, int spitch, int half, float *params, int batch) {
213
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
214
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
216
/* Source index of the interleaved pair. */
int pos = 2 * (idy * spitch + idx);
219
tmp2 = data[pos + 1];
221
pos = 2 * idy * dpitch + idx ;
224
/* First row of the pair: axis position and packed parameters. */
float axis_position_corr = c_axiss[batch + 2*idy];
225
float flat_zone = params[idy * 4];
226
float param = params[idy * 4 + 1];
228
int flag = idx < (axis_position_corr + flat_zone);
229
float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
233
/* NOTE(review): presumably alternative branches per ramp sign. */
out[pos] = tmp1 * min(2., max(flag?1.:0., multiplier));
235
out[pos] = tmp1 * max(0., min(flag?1.:2., multiplier));
238
/* Second row of the pair (idy advanced in place). */
axis_position_corr = c_axiss[batch + (++idy)];
239
flat_zone = params[idy * 2];
240
param = params[idy * 2 + 1];
242
flag = idx < (axis_position_corr + flat_zone);
243
multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
247
out[pos] = tmp2 * min(2., max(flag?1.:0., multiplier));
249
out[pos] = tmp2 * max(0., min(flag?1.:2., multiplier));