bzr branch
http://darksoft.org/webbzr/ani/mrses
1
by Suren A. Chilingaryan
Initial import |
1 |
#include <stdio.h> |
2 |
#include <stdlib.h> |
|
3 |
#include <malloc.h> |
|
4 |
#include <string.h> |
|
5 |
#include <sys/time.h> |
|
6 |
||
7 |
#include <blas_s.h> |
|
8 |
#include <cblas.h> |
|
9 |
#include <lapack.h> |
|
10 |
||
11 |
#include <spu_intrinsics.h> |
|
12 |
#include <spu_mfcio.h> |
|
13 |
||
14 |
// we should also consider HW_ALIGN (when change this values)
|
|
15 |
#define SPE_BLOCK 16 // Thats for BLAS/Lapack in items |
|
16 |
||
17 |
#define HW_HIDE_DETAILS
|
|
18 |
#include "mrses_spe.h" |
|
19 |
||
20 |
#define spe_scal(N,a,X,incX) sscal_spu(X, a, N)
|
|
21 |
#define spe_axpy(N, a, X, incX, Y, incY) saxpy_spu(X, Y, a, N)
|
|
22 |
#define spe_syrk(Order,Uplo,Trans, N, K, a, A, lda, b, C, ldc) \
|
|
23 |
ssyrk_spu(A, C, a, N, K, lda, ldc)
|
|
24 |
||
25 |
||
26 |
#undef rnd
|
|
27 |
#define rnd(i) ((int)((i) * (rand() / (RAND_MAX + 1.0))))
|
|
28 |
||
29 |
#ifdef HW_USE_BLOCKED_MULTIPLY
|
|
30 |
# define calc_ralloc(width, walloc) walloc
|
|
31 |
#else
|
|
32 |
//# define calc_ralloc(width, walloc) ((width<13)?width:walloc)
|
|
33 |
# define calc_ralloc(width, walloc) width
|
|
34 |
#endif /* HW_USE_BLOCKED_MULTIPLY */ |
|
35 |
||
36 |
||
37 |
//#include "atlas_potrf.h"
|
|
38 |
#include "vec_potrf.h" |
|
39 |
||
40 |
static inline int mrses_spe_real_multiply( |
|
41 |
MRSESDataType *D, |
|
42 |
short int pack, short int at_once, |
|
43 |
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB, |
|
44 |
MRSESDataType *A, MRSESDataType *B, |
|
45 |
MRSESDataType *Ca, MRSESDataType *Cb |
|
46 |
) { |
|
47 |
short int i, k, l; |
|
48 |
||
49 |
// PRINT_MATRIX("% 6.4f ", A, nA, 16, 16)
|
|
50 |
||
51 |
/*
|
|
52 |
int rww = at_once * walloc * width;
|
|
53 |
memset(Ca, 0, rww * sizeof(MRSESDataType));
|
|
54 |
memset(Cb, 0, rww * sizeof(MRSESDataType));
|
|
55 |
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nA, 1, A, nA, 0, Ca, walloc);
|
|
56 |
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nB, 1, B, nB, 0, Cb, walloc);
|
|
57 |
*/
|
|
58 |
||
59 |
||
60 |
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc); |
|
61 |
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc); |
|
62 |
||
63 |
short int c_step = width * (walloc + 1); |
|
64 |
for (i = 0; i < at_once; ++i) { |
|
65 |
short int c_offset = i * c_step; |
|
66 |
MRSESDataType *Ca_cur = Ca + c_offset; |
|
67 |
MRSESDataType *Cb_cur = Cb + c_offset; |
|
68 |
||
69 |
MRSESDataType *Da = D + (3 * i + 1) * d_step; |
|
70 |
MRSESDataType *Db = Da + d_step; |
|
71 |
||
72 |
for (k = 0; k < width; ++k) { |
|
73 |
for (l = 0; l < width; ++l) { |
|
74 |
Da[pack*(k * width + l)] = Ca_cur[k * walloc + l]; |
|
75 |
Db[pack*(k * width + l)] = Cb_cur[k * walloc + l]; |
|
76 |
}
|
|
77 |
}
|
|
78 |
}
|
|
79 |
||
80 |
||
81 |
return 0; |
|
82 |
}
|
|
83 |
||
84 |
static vector unsigned int mask1 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF}; |
|
85 |
static vector unsigned int mask2 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF}; |
|
86 |
static vector unsigned int mask3 = {0xFFFFFFFF, 0, 0, 0xFFFFFFFF}; |
|
87 |
||
88 |
#define VECTOR_PRINT(var, val) \
|
|
89 |
printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
|
|
90 |
||
91 |
#define iVECTOR_PRINT(var, val) \
|
|
92 |
printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
|
|
93 |
||
94 |
||
95 |
#define REPACK(Da, Ca_cur) \
|
|
96 |
r1 = *(vector float*)(Ca_cur + k * walloc + l); \
|
|
97 |
r2 = spu_rlqwbyte(*(vector float*)(Ca_cur + c_big + k * walloc + l), -4); \
|
|
98 |
r3 = spu_rlqwbyte(*(vector float*)(Ca_cur + 2 * c_big + k * walloc + l), -8); \
|
|
99 |
r4 = spu_rlqwbyte(*(vector float*)(Ca_cur + 3 * c_big + k * walloc + l), -12); \
|
|
100 |
\
|
|
101 |
r5 = spu_sel(r1, r2, mask1); \
|
|
102 |
r6 = spu_sel(r3, r4, mask1); \
|
|
103 |
\
|
|
104 |
Da[k*width + l ] = spu_sel(r5, r6, mask2); \
|
|
105 |
Da[k*width + l + 2] = spu_rlqwbyte(spu_sel(r6, r5, mask2), 8); \
|
|
106 |
\
|
|
107 |
r5 = spu_sel(r2, r1, mask1); \
|
|
108 |
r6 = spu_sel(r4, r3, mask1); \
|
|
109 |
\
|
|
110 |
Da[k*width + l + 1] = spu_rlqwbyte(spu_sel(r5, r6, mask3), 4); \
|
|
111 |
Da[k*width + l + 3] = spu_rlqwbyte(spu_sel(r6, r5, mask3), 12);
|
|
112 |
||
113 |
||
114 |
||
115 |
static inline int mrses_spe_real_multiply_recover( |
|
116 |
MRSESDataType *D, |
|
117 |
short int pack, short int at_once, |
|
118 |
short int d_step, short int width, short int ralloc, short int walloc, |
|
119 |
MRSESDataType *Ca, MRSESDataType *Cb, |
|
120 |
MRSESIntType *drp_gen, MRSESIntType *rst_gen, |
|
121 |
MRSESDataType *Ea, MRSESDataType *Eb |
|
122 |
) { |
|
123 |
short int i, j, k, l, m; |
|
124 |
short int c_step = width * (walloc + 1); |
|
125 |
short int c_big = ralloc * walloc; |
|
126 |
short int e_big = at_once * walloc; |
|
127 |
||
128 |
for (i = 0; i < at_once; ++i) { |
|
129 |
short int e_offset = i * walloc; |
|
130 |
short int c_offset = i * c_step; |
|
131 |
||
132 |
MRSESDataType *Da = D + (3 * i + 1) * d_step; |
|
133 |
MRSESDataType *Db = Da + d_step; |
|
134 |
||
135 |
for (m = 0; m < pack; ++m, c_offset += c_big, e_offset += e_big) { |
|
136 |
short int gen = drp_gen[m * at_once + i]; |
|
137 |
||
138 |
if (rst_gen[m * at_once + i] > width) { |
|
139 |
for (j = 0; j < width; j++) { |
|
140 |
if (j < gen) { |
|
141 |
l = j; k = gen; |
|
142 |
} else { |
|
143 |
l = gen; k = j; |
|
144 |
}
|
|
145 |
||
146 |
Ca[c_offset + k * walloc + l] = Ea[e_offset + j]; |
|
147 |
Cb[c_offset + k * walloc + l] = Eb[e_offset + j]; |
|
148 |
}
|
|
149 |
}
|
|
150 |
}
|
|
151 |
||
152 |
// Recovering to current stage
|
|
153 |
for (k = 0; k < width; ++k) { |
|
154 |
for (l = 0; l < width; l += 4) { |
|
155 |
register vector float r1, r2, r3, r4, r5, r6; |
|
156 |
||
157 |
REPACK(((vector float*)Da), ((float*)Ca)); |
|
158 |
REPACK(((vector float*)Db), ((float*)Cb)); |
|
159 |
||
160 |
}
|
|
161 |
}
|
|
162 |
}
|
|
163 |
||
164 |
return 0; |
|
165 |
}
|
|
166 |
||
167 |
||
168 |
||
169 |
static inline int mrses_spe_real_multiply_update( |
|
170 |
MRSESDataType *D, |
|
171 |
short int pack, short int at_once, |
|
172 |
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB, |
|
173 |
MRSESDataType *A, MRSESDataType *B, |
|
174 |
MRSESIntType *drp_gen, |
|
175 |
MRSESDataType *Ea, MRSESDataType *Eb |
|
176 |
) { |
|
177 |
short int i, j, k, l; |
|
178 |
/*
|
|
179 |
if (*rst_gen > width) {
|
|
180 |
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
|
|
181 |
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
|
|
182 |
}
|
|
183 |
*/
|
|
184 |
||
185 |
for (i = 0; i < at_once; ++i) { |
|
186 |
short int e_offset = i * walloc; |
|
187 |
short int gen = drp_gen[i]; |
|
188 |
||
189 |
MRSESDataType *Ea_cur = Ea + e_offset; |
|
190 |
MRSESDataType *Eb_cur = Eb + e_offset; |
|
191 |
||
192 |
MRSESDataType *Da = D + (3 * i + 1) * d_step; |
|
193 |
MRSESDataType *Db = Da + d_step; |
|
194 |
||
195 |
vec_update_row(gen, ralloc, nA, A, nA, Ea_cur); |
|
196 |
vec_update_row(gen, ralloc, nB, B, nB, Eb_cur); |
|
197 |
||
198 |
for (j = 0; j < width; j++) { |
|
199 |
if (j < gen) { |
|
200 |
l = j; k = gen; |
|
201 |
} else { |
|
202 |
l = gen; k = j; |
|
203 |
}
|
|
204 |
||
205 |
// if ((((int)D)%16)==0) {
|
|
206 |
// if (fabs(Da[pack * (k*width + l)] - Ea_cur[j])>0.001) {
|
|
207 |
// printf("Updating (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Da[pack * (k*width + l)], Ea_cur[j]);
|
|
208 |
// }
|
|
209 |
// }
|
|
210 |
Da[pack*(k * width + l)] = Ea_cur[j]; |
|
211 |
Db[pack*(k * width + l)] = Eb_cur[j]; |
|
212 |
}
|
|
213 |
}
|
|
214 |
||
215 |
return 0; |
|
216 |
}
|
|
217 |
||
218 |
static inline int mrses_spe_real_decompose( |
|
219 |
MRSESDistance dist, MRSESDataType *res, |
|
220 |
short int pack, short int at_once, |
|
221 |
short int step, short int width, |
|
222 |
MRSESDataType *D, MRSESDataType * mean |
|
223 |
) { |
|
224 |
short int i, j; |
|
225 |
short int mstep = pack * width; |
|
226 |
||
227 |
vector unsigned int err; |
|
228 |
vector unsigned int error; |
|
229 |
vector float rcorr; |
|
230 |
vector float rmahal; |
|
231 |
vector float result; |
|
232 |
vector float zero = {0, 0, 0, 0}; |
|
233 |
||
234 |
for (i = 0; i < at_once; ++i) { |
|
235 |
MRSESDataType *C = D + 3 * i * step; |
|
236 |
MRSESDataType *Ca = C + step; |
|
237 |
MRSESDataType *Cb = Ca + step; |
|
238 |
MRSESDataType *m = mean + i * mstep; |
|
239 |
||
240 |
memcpy(C, Ca, step * sizeof(MRSESDataType)); |
|
241 |
spe_axpy(step, 1, Cb, 1, C, 1); |
|
242 |
spe_scal(step, 0.5, C, 1); |
|
243 |
||
244 |
error = vec_spotrf_u(width, C, width); |
|
245 |
||
246 |
err = vec_spotrf_u(width, Ca, width); |
|
247 |
error = spu_and(err, error); |
|
248 |
||
249 |
err = vec_spotrf_u(width, Cb, width); |
|
250 |
error = spu_and(err, error); |
|
251 |
||
252 |
rcorr = vec_rcorr(width, C, Ca, Cb); |
|
253 |
rmahal = vec_rmahal(width, C, m); |
|
254 |
||
255 |
switch (dist) { |
|
256 |
case BHATTACHARYYA: |
|
257 |
result = vec_bhata(rcorr, rmahal); |
|
258 |
break; |
|
259 |
case MAHALANOBIS: |
|
260 |
result = rmahal; |
|
261 |
break; |
|
262 |
case CORCOR: |
|
263 |
result = rcorr; |
|
264 |
break; |
|
265 |
default: |
|
266 |
result = zero; |
|
267 |
}
|
|
268 |
||
269 |
for (j = 0; j < pack; j++) { |
|
270 |
if (*(((unsigned int*)&error)+j)) { |
|
271 |
res[j * at_once + i] = *(((float*)&result)+j); |
|
272 |
// printf("SPU result: %e (mahal: %e, corcor: %e)\n", *(((float*)&result)+j), *(((float*)&rmahal)+j), *(((float*)&rcorr)+j));
|
|
273 |
} else { |
|
274 |
res [j * at_once + i] = 0; |
|
275 |
// printf("SPU result: singular matrix, assuming 0 distance\n");
|
|
276 |
}
|
|
277 |
}
|
|
278 |
}
|
|
279 |
||
280 |
return 0; |
|
281 |
}
|
|
282 |
||
283 |
struct MRSESTemporaryDataT { |
|
284 |
MRSESDataType *A; |
|
285 |
MRSESDataType *B; |
|
286 |
MRSESDataType *Ca; |
|
287 |
MRSESDataType *Cb; |
|
288 |
MRSESDataType *Ea; |
|
289 |
MRSESDataType *Eb; |
|
290 |
MRSESDataType *D; |
|
291 |
MRSESDataType *mean; |
|
292 |
MRSESDataType *mean_copy; |
|
293 |
MRSESIntType *index; |
|
294 |
MRSESIntType *index_storage; |
|
295 |
MRSESDataType *Mfull; |
|
296 |
};
|
|
297 |
typedef struct MRSESTemporaryDataT MRSESTemporaryDataS; |
|
298 |
typedef struct MRSESTemporaryDataT *MRSESTemporaryData; |
|
299 |
||
300 |
static unsigned char *local_data = NULL; |
|
301 |
||
302 |
static inline MRSESTemporaryData mrses_spe_malloc(MRSESContext mrses, unsigned int rdch) { |
|
303 |
unsigned char *allocation; |
|
304 |
short int aA, aB; |
|
305 |
MRSESIntType properties; |
|
306 |
short int at_once, used_ralloc; |
|
307 |
short int pack; |
|
308 |
short int width; |
|
309 |
short int walloc, walloc2, dalloc, ralloc; |
|
310 |
short int index_segment_size; |
|
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
311 |
int i, pos = 0, size; |
1
by Suren A. Chilingaryan
Initial import |
312 |
int extra_rows = 0; |
313 |
||
314 |
MRSESTemporaryData data; |
|
315 |
||
316 |
properties = mrses->properties; |
|
317 |
||
318 |
// pos = calc_alloc(properties * sizeof(uint32_t), HW_ALIGN);
|
|
319 |
if (local_data) return (MRSESTemporaryData)(local_data + pos); |
|
320 |
||
321 |
width = mrses->width; |
|
322 |
aA = calc_alloc(mrses->nA, SPE_BLOCK); |
|
323 |
aB = calc_alloc(mrses->nB, SPE_BLOCK); |
|
324 |
walloc = calc_alloc(width, SPE_BLOCK); |
|
325 |
ralloc = calc_ralloc(width, walloc); |
|
326 |
walloc2 = walloc * walloc; |
|
327 |
at_once = max(1, ralloc / width); |
|
328 |
pack = SIMD_BLOCK / sizeof(MRSESDataType); |
|
329 |
dalloc = calc_alloc(pack * width * width, 128);//64); |
|
330 |
index_segment_size = (HW_ALIGN / sizeof(MRSESIntType)); |
|
331 |
||
332 |
used_ralloc = at_once * width; |
|
333 |
||
334 |
if ((ralloc < walloc)&&(ralloc > 12)) { |
|
335 |
extra_rows = walloc - ralloc; |
|
336 |
}
|
|
337 |
||
338 |
size = ( |
|
339 |
//calc_alloc(properties * sizeof(uint32_t), HW_ALIGN) + // result histogram
|
|
340 |
calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN) + // structure |
|
341 |
(pack * ralloc + extra_rows) * (aA + aB) * sizeof(MRSESDataType) + // A, B - source matrices |
|
342 |
2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) + // Ca, Cb - variances (temp) |
|
343 |
2 * pack * at_once * walloc * sizeof(MRSESDataType) + // Ea, Eb - variance updates |
|
344 |
3 * at_once * dalloc * sizeof(MRSESDataType) + // D - variances (aligned) |
|
345 |
2 * pack * ralloc * sizeof(MRSESDataType) + // mean, mean_copy |
|
346 |
// calc_alloc(mrses->iterate_size * properties * sizeof(MRSESIntType), HW_ALIGN) +
|
|
347 |
(walloc + index_segment_size) * at_once * pack * sizeof(MRSESIntType) + // index + exchange area |
|
348 |
calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) // Mfull, complete mean |
|
349 |
);
|
|
350 |
/*
|
|
351 |
printf("Allocating data: %i KB (A+B: %li, C: %li, D: %li, mean: %li, Mfull: %li)\n", size / 1024,
|
|
352 |
pack * ralloc * (aA + aB) * sizeof(MRSESDataType) / 1024,
|
|
353 |
2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) / 1024,
|
|
354 |
3 * at_once * dalloc * sizeof(MRSESDataType) / 1024,
|
|
355 |
2 * (pack * ralloc + extra_rows) * sizeof(MRSESDataType) / 1024,
|
|
356 |
calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / 1024
|
|
357 |
);
|
|
358 |
*/
|
|
359 |
allocation = (unsigned char*)memalign(HW_ALIGN, size); |
|
360 |
if (!allocation) return NULL; |
|
361 |
||
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
362 |
// memset(allocation, 0, properties * sizeof(uint32_t));
|
1
by Suren A. Chilingaryan
Initial import |
363 |
|
364 |
data = (MRSESTemporaryData)(allocation + pos); |
|
365 |
pos += calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN); |
|
366 |
||
367 |
data->A = (MRSESDataType*)(allocation + pos); |
|
368 |
pos += (pack * ralloc + extra_rows) * aA * sizeof(MRSESDataType); |
|
369 |
||
370 |
data->B = (MRSESDataType*)(allocation + pos); |
|
371 |
pos += (pack * ralloc + extra_rows) * aB * sizeof(MRSESDataType); |
|
372 |
||
373 |
data->Ca = (MRSESDataType*)(allocation + pos); |
|
374 |
pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType); |
|
375 |
||
376 |
data->Cb = (MRSESDataType*)(allocation + pos); |
|
377 |
pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType); |
|
378 |
||
379 |
data->Ea = (MRSESDataType*)(allocation + pos); |
|
380 |
pos += pack * at_once * walloc * sizeof(MRSESDataType); |
|
381 |
||
382 |
data->Eb = (MRSESDataType*)(allocation + pos); |
|
383 |
pos += pack * at_once * walloc * sizeof(MRSESDataType); |
|
384 |
||
385 |
data->D = (MRSESDataType*)(allocation + pos); |
|
386 |
pos += 3 * at_once * dalloc * sizeof(MRSESDataType); |
|
387 |
||
388 |
data->mean = (MRSESDataType*)(allocation + pos); |
|
389 |
pos += pack * ralloc * sizeof(MRSESDataType); |
|
390 |
||
391 |
data->mean_copy = (MRSESDataType*)(allocation + pos); |
|
392 |
pos += pack * ralloc * sizeof(MRSESDataType); |
|
393 |
||
394 |
data->index = (MRSESIntType*)(allocation + pos); |
|
395 |
pos += walloc * at_once * pack * sizeof(MRSESIntType); |
|
396 |
||
397 |
data->index_storage = (MRSESIntType*)(allocation + pos); |
|
398 |
pos += index_segment_size * at_once * pack * sizeof(MRSESIntType); |
|
399 |
||
400 |
data->Mfull = (MRSESDataType*)(allocation + pos); |
|
401 |
||
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
402 |
size = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN); |
403 |
for (i = 0; i < size; i += SPE_MAX_TRANSFER_SIZE) { |
|
404 |
spu_mfcdma32((void*)(((char *)(data->Mfull)) + i), (unsigned int)(((char*)mrses->mean) + i), min(size - i, SPE_MAX_TRANSFER_SIZE), rdch, MFC_GET_CMD); |
|
405 |
}
|
|
1
by Suren A. Chilingaryan
Initial import |
406 |
|
407 |
local_data = (void*)allocation; |
|
408 |
||
409 |
if (used_ralloc < ralloc) { |
|
410 |
// we need to do it, to clean the end parts for non-even case
|
|
411 |
memset(data->A, 0, pack * (ralloc) * aA * sizeof(MRSESDataType)); |
|
412 |
memset(data->B, 0, pack * (ralloc) * aB * sizeof(MRSESDataType)); |
|
413 |
||
414 |
memset(data->mean, 0, pack * ralloc * sizeof(MRSESDataType)); |
|
415 |
memset(data->mean_copy, 0, pack * ralloc * sizeof(MRSESDataType)); |
|
416 |
||
417 |
}
|
|
418 |
||
419 |
return data; |
|
420 |
}
|
|
421 |
||
422 |
||
423 |
int mrses_spe_iterate(int block_group, MRSESContext mrses, unsigned int rdch) { |
|
424 |
int idx; |
|
425 |
short int j, k, l, m; |
|
426 |
||
427 |
short int width = mrses->width; |
|
428 |
short int walloc = calc_alloc(width, SPE_BLOCK); |
|
429 |
short int alloc = mrses->alloc; |
|
430 |
short int ralloc = calc_ralloc(width, walloc); |
|
431 |
short int nA = mrses->nA; |
|
432 |
short int cA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN); |
|
433 |
short int aA = calc_alloc(nA, SPE_BLOCK); // aligned size |
|
434 |
short int nB = mrses->nB; |
|
435 |
short int cB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN); |
|
436 |
short int aB = calc_alloc(nB, SPE_BLOCK); |
|
437 |
MRSESDistance dist = mrses->dist; |
|
438 |
||
439 |
MRSESIntType properties = mrses->properties; |
|
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
440 |
MRSESIntType palloc = mrses->palloc; |
1
by Suren A. Chilingaryan
Initial import |
441 |
int i, iterations = mrses->iterations; |
442 |
||
443 |
short int iterate_size = mrses->iterate_size; |
|
444 |
short int at_once = max(1, ralloc / width); |
|
445 |
short int pack = SIMD_BLOCK / sizeof(MRSESDataType); |
|
446 |
short int dalloc = calc_alloc(pack * width * width, 128); |
|
447 |
short int index_segment_size = (HW_ALIGN / sizeof(MRSESIntType)); |
|
448 |
||
449 |
int spe_real_block = pack * at_once; |
|
450 |
MRSESIntType rpl_gen_k, rpl_gen_segment; |
|
451 |
MRSESIntType drp_gen[spe_real_block], rpl_gen[spe_real_block], rst_gen[spe_real_block]; |
|
452 |
MRSESDataType result[spe_real_block], cur_result[spe_real_block]; |
|
453 |
// MRSESDataType *result = mrses->result + block;
|
|
454 |
||
455 |
volatile MRSESDataType *Afull = mrses->A; |
|
456 |
volatile MRSESDataType *Bfull = mrses->B; |
|
457 |
MRSESIntType *mrses_index = mrses->index; |
|
458 |
||
459 |
MRSESTemporaryData data; |
|
460 |
||
461 |
MRSESDataType *A; |
|
462 |
MRSESDataType *B; |
|
463 |
MRSESDataType *Ca; |
|
464 |
MRSESDataType *Cb; |
|
465 |
MRSESDataType *Ea; |
|
466 |
MRSESDataType *Eb; |
|
467 |
MRSESDataType *D; |
|
468 |
MRSESDataType *mean; |
|
469 |
MRSESDataType *mean_copy; |
|
470 |
||
471 |
volatile MRSESIntType *index_storage, *index_storage_k; |
|
472 |
volatile MRSESIntType *index, *index_k; |
|
473 |
volatile MRSESDataType *Mfull; |
|
474 |
||
475 |
#ifdef USE_FAST_RANDOM
|
|
476 |
unsigned int g_seed; |
|
477 |
struct timeval tvseed; |
|
478 |
#endif /* USE_FAST_RANDOM */ |
|
479 |
||
480 |
#ifdef USE_FAST_RANDOM
|
|
481 |
gettimeofday(&tvseed, NULL); |
|
482 |
g_seed = tvseed.tv_usec; |
|
483 |
#endif /* USE_FAST_RANDOM */ |
|
484 |
||
485 |
data = mrses_spe_malloc(mrses, rdch); |
|
486 |
if (!data) { |
|
487 |
printf("Memory allocation failed\n"); |
|
488 |
exit(1); |
|
489 |
}
|
|
490 |
||
491 |
A = data->A; |
|
492 |
B = data->B; |
|
493 |
Ca = data->Ca; |
|
494 |
Cb = data->Cb; |
|
495 |
Ea = data->Ea; |
|
496 |
Eb = data->Eb; |
|
497 |
D = data->D; |
|
498 |
mean = data->mean; |
|
499 |
mean_copy = data->mean_copy; |
|
500 |
index = data->index; |
|
501 |
index_storage = data->index_storage; |
|
502 |
Mfull = data->Mfull; |
|
503 |
||
504 |
// printf("Iterate %i, pack %i, at_once %i ralloc %i walloc %i\n", iterate_size, pack, at_once, ralloc, walloc);
|
|
505 |
for (l = 0; l < iterate_size; l += pack * at_once) { |
|
506 |
||
507 |
for (j = 0; j < spe_real_block; j++) { |
|
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
508 |
spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * palloc), walloc * sizeof(MRSESIntType), rdch, MFC_GET_CMD); |
1
by Suren A. Chilingaryan
Initial import |
509 |
}
|
510 |
||
511 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
512 |
||
513 |
for (m = 0; m < pack; m++) { |
|
514 |
for (j = 0, k = 0; j < at_once; ++j) { |
|
515 |
//printf("SPE: %i = %i %i %i\n", l + m * at_once + j, l, m, j);
|
|
516 |
index_k = index + (m * at_once + j) * walloc; |
|
517 |
for (i = 0; i < width; ++i, ++k) { |
|
518 |
idx = index_k[i]; |
|
519 |
//printf(" * %i\n", idx);
|
|
520 |
spu_mfcdma32((void *)(A + ((int)(m * ralloc + k)) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD); |
|
521 |
spu_mfcdma32((void *)(B + ((int)(m * ralloc + k)) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD); |
|
522 |
mean[k * pack + m] = Mfull[idx]; |
|
523 |
}
|
|
524 |
}
|
|
525 |
||
526 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
527 |
||
528 |
mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca + m * ralloc * walloc, Cb + m * ralloc * walloc); |
|
529 |
}
|
|
530 |
||
531 |
memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType)); |
|
532 |
||
533 |
for (k = 0; k < spe_real_block; k++) { |
|
534 |
rst_gen[k] = width; |
|
535 |
}
|
|
536 |
||
537 |
mrses_spe_real_decompose(dist, cur_result, pack, at_once, dalloc, width, D, mean_copy); |
|
538 |
||
539 |
||
540 |
// }
|
|
541 |
// return 0;
|
|
542 |
||
543 |
// {
|
|
544 |
||
545 |
||
546 |
for (i = 0; i < iterations; ++i) |
|
547 |
{
|
|
548 |
memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType)); |
|
549 |
||
550 |
#ifndef HW_USE_BLOCKED_MULTIPLY
|
|
551 |
mrses_spe_real_multiply_recover(D, pack, at_once, dalloc, width, ralloc, walloc, Ca, Cb, drp_gen, rst_gen, Ea, Eb); |
|
552 |
#endif /* HW_USE_BLOCKED_MULTIPLY */ |
|
553 |
||
554 |
for (m = 0, k = 0; m < pack; ++m) { |
|
555 |
for (j = 0; j < at_once; ++j, ++k) { |
|
556 |
index_k = index + k * walloc; |
|
557 |
||
558 |
drp_gen[k] = rnd(width); |
|
559 |
rpl_gen[k] = rnd(properties - width) + width; |
|
560 |
||
561 |
if ((rst_gen[k] < width)&&(rst_gen[k] != drp_gen[k])) { |
|
562 |
idx = index_k[rst_gen[k]]; |
|
563 |
//printf("%i, Restoring: %i to index %i\n", k, rst_gen[k], idx);
|
|
564 |
spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + rst_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD); |
|
565 |
spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + rst_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD); |
|
566 |
}
|
|
567 |
||
568 |
||
569 |
rpl_gen_k = rpl_gen[k]; |
|
570 |
if (rpl_gen_k < walloc) { |
|
571 |
idx = index_k[rpl_gen[k]]; |
|
572 |
} else { |
|
573 |
index_storage_k = index_storage + k * index_segment_size; |
|
574 |
rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size; |
|
575 |
||
576 |
// printf("%i, getting part of index %i, index segment %i\n", k + l, rpl_gen_k, rpl_gen_segment);
|
|
577 |
||
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
578 |
spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * palloc + rpl_gen_segment), HW_ALIGN, rdch, MFC_GET_CMD); |
1
by Suren A. Chilingaryan
Initial import |
579 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
580 |
||
581 |
idx = index_storage_k[rpl_gen_k - rpl_gen_segment]; |
|
582 |
//printf("done, index: %i\n", idx);
|
|
583 |
}
|
|
584 |
||
585 |
// printf("%i, receiving data for %i, index %i (pack %i)\n", k + l, drp_gen[k], idx, m);
|
|
586 |
spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + drp_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD); |
|
587 |
spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + drp_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD); |
|
588 |
//puts("done");
|
|
589 |
mean_copy[(j * width + drp_gen[k]) * pack + m] = Mfull[idx]; |
|
590 |
}
|
|
591 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
592 |
||
593 |
||
594 |
#ifdef HW_USE_BLOCKED_MULTIPLY
|
|
595 |
mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca, Cb); |
|
596 |
#else /* HW_USE_BLOCKED_MULTIPLY */ |
|
597 |
mrses_spe_real_multiply_update( |
|
598 |
D + m, |
|
599 |
pack, at_once, dalloc, width, ralloc, walloc, aA, aB, |
|
600 |
A + m * ralloc * aA, B + m * ralloc * aB, |
|
601 |
drp_gen + m * at_once, |
|
602 |
Ea + m * at_once * walloc, Eb + m * at_once * walloc |
|
603 |
);
|
|
604 |
#endif /* HW_USE_BLOCKED_MULTIPLY */ |
|
605 |
}
|
|
606 |
||
607 |
mrses_spe_real_decompose(dist, result, pack, at_once, dalloc, width, D, mean_copy); |
|
608 |
||
609 |
for (m = 0, k = 0; m < pack; ++m) { |
|
610 |
for (j = 0; j < at_once; ++j, ++k) { |
|
611 |
if (result[k] < cur_result[k]) { |
|
612 |
rst_gen[k] = drp_gen[k]; |
|
613 |
} else { |
|
614 |
rst_gen[k] = width + 1; |
|
615 |
cur_result[k] = result[k]; |
|
616 |
||
617 |
index_k = index + k * walloc; |
|
618 |
||
619 |
rpl_gen_k = rpl_gen[k]; |
|
620 |
if (rpl_gen_k < walloc) { |
|
621 |
idx = index_k[rpl_gen_k]; |
|
622 |
SWAPij(index_k, drp_gen[k], rpl_gen_k); |
|
623 |
} else { |
|
624 |
index_storage_k = index_storage + k * index_segment_size; |
|
625 |
rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size; |
|
626 |
idx = index_storage_k[rpl_gen_k - rpl_gen_segment]; |
|
627 |
||
628 |
index_storage_k[rpl_gen_k - rpl_gen_segment] = index_k[drp_gen[k]]; |
|
629 |
||
630 |
//printf("%i, write back\n", k);
|
|
631 |
// write back segment
|
|
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
632 |
spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * palloc + rpl_gen_segment), HW_ALIGN, rdch, MFC_PUT_CMD); |
1
by Suren A. Chilingaryan
Initial import |
633 |
//(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
|
634 |
||
635 |
//puts("done");
|
|
636 |
index_k[drp_gen[k]] = idx; |
|
637 |
}
|
|
638 |
mean[(j * width + drp_gen[k]) * pack + m] = Mfull[idx]; |
|
639 |
}
|
|
640 |
}
|
|
641 |
}
|
|
642 |
||
643 |
}
|
|
644 |
||
645 |
// We can write back initial part of index, but we don't really need to
|
|
646 |
for (j = 0; j < spe_real_block; j++) { |
|
2
by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties |
647 |
spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * palloc), walloc * sizeof(MRSESIntType), rdch, MFC_PUT_CMD); |
1
by Suren A. Chilingaryan
Initial import |
648 |
}
|
649 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
650 |
/*
|
|
651 |
for (k = 0; k < spe_real_block; k++) {
|
|
652 |
printf("SPU result, block %i: %e\n", l + k, cur_result[k]);
|
|
653 |
}
|
|
654 |
*/
|
|
655 |
}
|
|
656 |
||
657 |
return 0; |
|
658 |
}
|
|
659 |
||
660 |
#include <simdmath.h> |
|
661 |
int main(unsigned long long id __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) { |
|
662 |
short int size; |
|
663 |
volatile SPUParameters param; |
|
664 |
volatile MRSESContext mrses; |
|
665 |
unsigned int rdch; //, wrch; |
|
666 |
||
667 |
#ifndef FIX_RANDOM
|
|
668 |
struct timeval tv; |
|
669 |
#endif /* FIX_RANDOM */ |
|
670 |
||
671 |
#ifdef TRACE_TIMINGS
|
|
672 |
struct timeval tv1, tv2; |
|
673 |
#endif /* TRACE_TIMINGS */ |
|
674 |
||
675 |
#ifdef TRACE_TIMINGS
|
|
676 |
gettimeofday(&tv1, NULL); |
|
677 |
#endif /* TRACE_TIMINGS */ |
|
678 |
||
679 |
rdch = mfc_tag_reserve(); |
|
680 |
if (rdch == MFC_TAG_INVALID) { |
|
681 |
printf("Unable to reserve DMA tag, returning"); |
|
682 |
exit(1); |
|
683 |
}
|
|
684 |
||
685 |
spu_writech(MFC_WrTagMask, 1 << rdch); |
|
686 |
||
687 |
spu_mfcdma32(¶m, argp, sizeof(SPUParameters), rdch, MFC_GET_CMD); |
|
688 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
689 |
||
690 |
||
691 |
size = calc_alloc(sizeof(MRSESContextS), HW_ALIGN); |
|
692 |
mrses = (MRSESContext)malloc(size); |
|
693 |
spu_mfcdma32(mrses, (unsigned int)(param.mrses), size, rdch, MFC_GET_CMD); |
|
694 |
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL); |
|
695 |
||
696 |
#ifdef FIX_RANDOM
|
|
697 |
srand(FIX_RANDOM); |
|
698 |
#else /* FIX_RANDOM */ |
|
699 |
gettimeofday(&tv, NULL); |
|
700 |
srand(tv.tv_usec); |
|
701 |
//srand(block);
|
|
702 |
#endif /* FIX_RANDOM */ |
|
703 |
||
704 |
||
705 |
mrses_spe_iterate(param.block_group, mrses, rdch); |
|
706 |
||
707 |
// Optimize?
|
|
708 |
// free(local_data);
|
|
709 |
// local_data = NULL;
|
|
710 |
||
711 |
#ifdef TRACE_TIMINGS
|
|
712 |
gettimeofday(&tv2, NULL); |
|
713 |
printf("SPU 0x%llx: %li ms\n", id, (tv2.tv_sec - tv1.tv_sec)*1000+(tv2.tv_usec - tv1.tv_usec)/1000); |
|
714 |
#endif /* TRACE_TIMINGS */ |
|
715 |
||
716 |
return 0; |
|
717 |
}
|