11
#include <spu_intrinsics.h>
12
#include <spu_mfcio.h>
14
// we should also consider HW_ALIGN (when change this values)
15
#define SPE_BLOCK 16 // Thats for BLAS/Lapack in items
17
#define HW_HIDE_DETAILS
18
#include "mrses_spe.h"
20
#define spe_scal(N,a,X,incX) sscal_spu(X, a, N)
21
#define spe_axpy(N, a, X, incX, Y, incY) saxpy_spu(X, Y, a, N)
22
#define spe_syrk(Order,Uplo,Trans, N, K, a, A, lda, b, C, ldc) \
23
ssyrk_spu(A, C, a, N, K, lda, ldc)
27
#define rnd(i) ((int)((i) * (rand() / (RAND_MAX + 1.0))))
29
#ifdef HW_USE_BLOCKED_MULTIPLY
30
# define calc_ralloc(width, walloc) walloc
32
//# define calc_ralloc(width, walloc) ((width<13)?width:walloc)
33
# define calc_ralloc(width, walloc) width
34
#endif /* HW_USE_BLOCKED_MULTIPLY */
37
//#include "atlas_potrf.h"
38
#include "vec_potrf.h"
40
static inline int mrses_spe_real_multiply(
42
short int pack, short int at_once,
43
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
44
MRSESDataType *A, MRSESDataType *B,
45
MRSESDataType *Ca, MRSESDataType *Cb
49
// PRINT_MATRIX("% 6.4f ", A, nA, 16, 16)
52
int rww = at_once * walloc * width;
53
memset(Ca, 0, rww * sizeof(MRSESDataType));
54
memset(Cb, 0, rww * sizeof(MRSESDataType));
55
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nA, 1, A, nA, 0, Ca, walloc);
56
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nB, 1, B, nB, 0, Cb, walloc);
60
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
61
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
63
short int c_step = width * (walloc + 1);
64
for (i = 0; i < at_once; ++i) {
65
short int c_offset = i * c_step;
66
MRSESDataType *Ca_cur = Ca + c_offset;
67
MRSESDataType *Cb_cur = Cb + c_offset;
69
MRSESDataType *Da = D + (3 * i + 1) * d_step;
70
MRSESDataType *Db = Da + d_step;
72
for (k = 0; k < width; ++k) {
73
for (l = 0; l < width; ++l) {
74
Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
75
Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
84
static vector unsigned int mask1 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
85
static vector unsigned int mask2 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
86
static vector unsigned int mask3 = {0xFFFFFFFF, 0, 0, 0xFFFFFFFF};
88
#define VECTOR_PRINT(var, val) \
89
printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
91
#define iVECTOR_PRINT(var, val) \
92
printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
95
#define REPACK(Da, Ca_cur) \
96
r1 = *(vector float*)(Ca_cur + k * walloc + l); \
97
r2 = spu_rlqwbyte(*(vector float*)(Ca_cur + c_big + k * walloc + l), -4); \
98
r3 = spu_rlqwbyte(*(vector float*)(Ca_cur + 2 * c_big + k * walloc + l), -8); \
99
r4 = spu_rlqwbyte(*(vector float*)(Ca_cur + 3 * c_big + k * walloc + l), -12); \
101
r5 = spu_sel(r1, r2, mask1); \
102
r6 = spu_sel(r3, r4, mask1); \
104
Da[k*width + l ] = spu_sel(r5, r6, mask2); \
105
Da[k*width + l + 2] = spu_rlqwbyte(spu_sel(r6, r5, mask2), 8); \
107
r5 = spu_sel(r2, r1, mask1); \
108
r6 = spu_sel(r4, r3, mask1); \
110
Da[k*width + l + 1] = spu_rlqwbyte(spu_sel(r5, r6, mask3), 4); \
111
Da[k*width + l + 3] = spu_rlqwbyte(spu_sel(r6, r5, mask3), 12);
115
static inline int mrses_spe_real_multiply_recover(
117
short int pack, short int at_once,
118
short int d_step, short int width, short int ralloc, short int walloc,
119
MRSESDataType *Ca, MRSESDataType *Cb,
120
MRSESIntType *drp_gen, MRSESIntType *rst_gen,
121
MRSESDataType *Ea, MRSESDataType *Eb
123
short int i, j, k, l, m;
124
short int c_step = width * (walloc + 1);
125
short int c_big = ralloc * walloc;
126
short int e_big = at_once * walloc;
128
for (i = 0; i < at_once; ++i) {
129
short int e_offset = i * walloc;
130
short int c_offset = i * c_step;
132
MRSESDataType *Da = D + (3 * i + 1) * d_step;
133
MRSESDataType *Db = Da + d_step;
135
for (m = 0; m < pack; ++m, c_offset += c_big, e_offset += e_big) {
136
short int gen = drp_gen[m * at_once + i];
138
if (rst_gen[m * at_once + i] > width) {
139
for (j = 0; j < width; j++) {
146
Ca[c_offset + k * walloc + l] = Ea[e_offset + j];
147
Cb[c_offset + k * walloc + l] = Eb[e_offset + j];
152
// Recovering to current stage
153
for (k = 0; k < width; ++k) {
154
for (l = 0; l < width; l += 4) {
155
register vector float r1, r2, r3, r4, r5, r6;
157
REPACK(((vector float*)Da), ((float*)Ca));
158
REPACK(((vector float*)Db), ((float*)Cb));
169
static inline int mrses_spe_real_multiply_update(
171
short int pack, short int at_once,
172
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
173
MRSESDataType *A, MRSESDataType *B,
174
MRSESIntType *drp_gen,
175
MRSESDataType *Ea, MRSESDataType *Eb
177
short int i, j, k, l;
179
if (*rst_gen > width) {
180
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
181
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
185
for (i = 0; i < at_once; ++i) {
186
short int e_offset = i * walloc;
187
short int gen = drp_gen[i];
189
MRSESDataType *Ea_cur = Ea + e_offset;
190
MRSESDataType *Eb_cur = Eb + e_offset;
192
MRSESDataType *Da = D + (3 * i + 1) * d_step;
193
MRSESDataType *Db = Da + d_step;
195
vec_update_row(gen, ralloc, nA, A, nA, Ea_cur);
196
vec_update_row(gen, ralloc, nB, B, nB, Eb_cur);
198
for (j = 0; j < width; j++) {
205
// if ((((int)D)%16)==0) {
206
// if (fabs(Da[pack * (k*width + l)] - Ea_cur[j])>0.001) {
207
// printf("Updating (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Da[pack * (k*width + l)], Ea_cur[j]);
210
Da[pack*(k * width + l)] = Ea_cur[j];
211
Db[pack*(k * width + l)] = Eb_cur[j];
218
static inline int mrses_spe_real_decompose(
219
MRSESDistance dist, MRSESDataType *res,
220
short int pack, short int at_once,
221
short int step, short int width,
222
MRSESDataType *D, MRSESDataType * mean
225
short int mstep = pack * width;
227
vector unsigned int err;
228
vector unsigned int error;
232
vector float zero = {0, 0, 0, 0};
234
for (i = 0; i < at_once; ++i) {
235
MRSESDataType *C = D + 3 * i * step;
236
MRSESDataType *Ca = C + step;
237
MRSESDataType *Cb = Ca + step;
238
MRSESDataType *m = mean + i * mstep;
240
memcpy(C, Ca, step * sizeof(MRSESDataType));
241
spe_axpy(step, 1, Cb, 1, C, 1);
242
spe_scal(step, 0.5, C, 1);
244
error = vec_spotrf_u(width, C, width);
246
err = vec_spotrf_u(width, Ca, width);
247
error = spu_and(err, error);
249
err = vec_spotrf_u(width, Cb, width);
250
error = spu_and(err, error);
252
rcorr = vec_rcorr(width, C, Ca, Cb);
253
rmahal = vec_rmahal(width, C, m);
257
result = vec_bhata(rcorr, rmahal);
269
for (j = 0; j < pack; j++) {
270
if (*(((unsigned int*)&error)+j)) {
271
res[j * at_once + i] = *(((float*)&result)+j);
272
// printf("SPU result: %e (mahal: %e, corcor: %e)\n", *(((float*)&result)+j), *(((float*)&rmahal)+j), *(((float*)&rcorr)+j));
274
res [j * at_once + i] = 0;
275
// printf("SPU result: singular matrix, assuming 0 distance\n");
283
struct MRSESTemporaryDataT {
292
MRSESDataType *mean_copy;
294
MRSESIntType *index_storage;
295
MRSESDataType *Mfull;
297
typedef struct MRSESTemporaryDataT MRSESTemporaryDataS;
298
typedef struct MRSESTemporaryDataT *MRSESTemporaryData;
300
static unsigned char *local_data = NULL;
302
static inline MRSESTemporaryData mrses_spe_malloc(MRSESContext mrses, unsigned int rdch) {
303
unsigned char *allocation;
305
MRSESIntType properties;
306
short int at_once, used_ralloc;
309
short int walloc, walloc2, dalloc, ralloc;
310
short int index_segment_size;
314
MRSESTemporaryData data;
316
properties = mrses->properties;
318
// pos = calc_alloc(properties * sizeof(uint32_t), HW_ALIGN);
319
if (local_data) return (MRSESTemporaryData)(local_data + pos);
321
width = mrses->width;
322
aA = calc_alloc(mrses->nA, SPE_BLOCK);
323
aB = calc_alloc(mrses->nB, SPE_BLOCK);
324
walloc = calc_alloc(width, SPE_BLOCK);
325
ralloc = calc_ralloc(width, walloc);
326
walloc2 = walloc * walloc;
327
at_once = max(1, ralloc / width);
328
pack = SIMD_BLOCK / sizeof(MRSESDataType);
329
dalloc = calc_alloc(pack * width * width, 128);//64);
330
index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
332
used_ralloc = at_once * width;
334
if ((ralloc < walloc)&&(ralloc > 12)) {
335
extra_rows = walloc - ralloc;
339
//calc_alloc(properties * sizeof(uint32_t), HW_ALIGN) + // result histogram
340
calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN) + // structure
341
(pack * ralloc + extra_rows) * (aA + aB) * sizeof(MRSESDataType) + // A, B - source matrices
342
2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) + // Ca, Cb - variances (temp)
343
2 * pack * at_once * walloc * sizeof(MRSESDataType) + // Ea, Eb - variance updates
344
3 * at_once * dalloc * sizeof(MRSESDataType) + // D - variances (aligned)
345
2 * pack * ralloc * sizeof(MRSESDataType) + // mean, mean_copy
346
// calc_alloc(mrses->iterate_size * properties * sizeof(MRSESIntType), HW_ALIGN) +
347
(walloc + index_segment_size) * at_once * pack * sizeof(MRSESIntType) + // index + exchange area
348
calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) // Mfull, complete mean
351
printf("Allocating data: %i KB (A+B: %li, C: %li, D: %li, mean: %li, Mfull: %li)\n", size / 1024,
352
pack * ralloc * (aA + aB) * sizeof(MRSESDataType) / 1024,
353
2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) / 1024,
354
3 * at_once * dalloc * sizeof(MRSESDataType) / 1024,
355
2 * (pack * ralloc + extra_rows) * sizeof(MRSESDataType) / 1024,
356
calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / 1024
359
allocation = (unsigned char*)memalign(HW_ALIGN, size);
360
if (!allocation) return NULL;
362
memset(allocation, 0, properties * sizeof(uint32_t));
364
data = (MRSESTemporaryData)(allocation + pos);
365
pos += calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN);
367
data->A = (MRSESDataType*)(allocation + pos);
368
pos += (pack * ralloc + extra_rows) * aA * sizeof(MRSESDataType);
370
data->B = (MRSESDataType*)(allocation + pos);
371
pos += (pack * ralloc + extra_rows) * aB * sizeof(MRSESDataType);
373
data->Ca = (MRSESDataType*)(allocation + pos);
374
pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
376
data->Cb = (MRSESDataType*)(allocation + pos);
377
pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
379
data->Ea = (MRSESDataType*)(allocation + pos);
380
pos += pack * at_once * walloc * sizeof(MRSESDataType);
382
data->Eb = (MRSESDataType*)(allocation + pos);
383
pos += pack * at_once * walloc * sizeof(MRSESDataType);
385
data->D = (MRSESDataType*)(allocation + pos);
386
pos += 3 * at_once * dalloc * sizeof(MRSESDataType);
388
data->mean = (MRSESDataType*)(allocation + pos);
389
pos += pack * ralloc * sizeof(MRSESDataType);
391
data->mean_copy = (MRSESDataType*)(allocation + pos);
392
pos += pack * ralloc * sizeof(MRSESDataType);
394
data->index = (MRSESIntType*)(allocation + pos);
395
pos += walloc * at_once * pack * sizeof(MRSESIntType);
397
data->index_storage = (MRSESIntType*)(allocation + pos);
398
pos += index_segment_size * at_once * pack * sizeof(MRSESIntType);
400
data->Mfull = (MRSESDataType*)(allocation + pos);
402
spu_mfcdma32((void *)(data->Mfull), (unsigned int)(mrses->mean), calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN), rdch, MFC_GET_CMD);
404
local_data = (void*)allocation;
406
if (used_ralloc < ralloc) {
407
// we need to do it, to clean the end parts for non-even case
408
memset(data->A, 0, pack * (ralloc) * aA * sizeof(MRSESDataType));
409
memset(data->B, 0, pack * (ralloc) * aB * sizeof(MRSESDataType));
411
memset(data->mean, 0, pack * ralloc * sizeof(MRSESDataType));
412
memset(data->mean_copy, 0, pack * ralloc * sizeof(MRSESDataType));
420
int mrses_spe_iterate(int block_group, MRSESContext mrses, unsigned int rdch) {
422
short int j, k, l, m;
424
short int width = mrses->width;
425
short int walloc = calc_alloc(width, SPE_BLOCK);
426
short int alloc = mrses->alloc;
427
short int ralloc = calc_ralloc(width, walloc);
428
short int nA = mrses->nA;
429
short int cA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN);
430
short int aA = calc_alloc(nA, SPE_BLOCK); // aligned size
431
short int nB = mrses->nB;
432
short int cB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN);
433
short int aB = calc_alloc(nB, SPE_BLOCK);
434
MRSESDistance dist = mrses->dist;
436
MRSESIntType properties = mrses->properties;
437
int i, iterations = mrses->iterations;
439
short int iterate_size = mrses->iterate_size;
440
short int at_once = max(1, ralloc / width);
441
short int pack = SIMD_BLOCK / sizeof(MRSESDataType);
442
short int dalloc = calc_alloc(pack * width * width, 128);
443
short int index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
445
int spe_real_block = pack * at_once;
446
MRSESIntType rpl_gen_k, rpl_gen_segment;
447
MRSESIntType drp_gen[spe_real_block], rpl_gen[spe_real_block], rst_gen[spe_real_block];
448
MRSESDataType result[spe_real_block], cur_result[spe_real_block];
449
// MRSESDataType *result = mrses->result + block;
451
volatile MRSESDataType *Afull = mrses->A;
452
volatile MRSESDataType *Bfull = mrses->B;
453
MRSESIntType *mrses_index = mrses->index;
455
MRSESTemporaryData data;
465
MRSESDataType *mean_copy;
467
volatile MRSESIntType *index_storage, *index_storage_k;
468
volatile MRSESIntType *index, *index_k;
469
volatile MRSESDataType *Mfull;
471
#ifdef USE_FAST_RANDOM
473
struct timeval tvseed;
474
#endif /* USE_FAST_RANDOM */
476
#ifdef USE_FAST_RANDOM
477
gettimeofday(&tvseed, NULL);
478
g_seed = tvseed.tv_usec;
479
#endif /* USE_FAST_RANDOM */
481
data = mrses_spe_malloc(mrses, rdch);
483
printf("Memory allocation failed\n");
495
mean_copy = data->mean_copy;
497
index_storage = data->index_storage;
500
// printf("Iterate %i, pack %i, at_once %i ralloc %i walloc %i\n", iterate_size, pack, at_once, ralloc, walloc);
501
for (l = 0; l < iterate_size; l += pack * at_once) {
503
for (j = 0; j < spe_real_block; j++) {
504
spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * properties), walloc * sizeof(MRSESIntType), rdch, MFC_GET_CMD);
507
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
509
for (m = 0; m < pack; m++) {
510
for (j = 0, k = 0; j < at_once; ++j) {
511
//printf("SPE: %i = %i %i %i\n", l + m * at_once + j, l, m, j);
512
index_k = index + (m * at_once + j) * walloc;
513
for (i = 0; i < width; ++i, ++k) {
515
//printf(" * %i\n", idx);
516
spu_mfcdma32((void *)(A + ((int)(m * ralloc + k)) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
517
spu_mfcdma32((void *)(B + ((int)(m * ralloc + k)) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
518
mean[k * pack + m] = Mfull[idx];
522
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
524
mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca + m * ralloc * walloc, Cb + m * ralloc * walloc);
527
memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
529
for (k = 0; k < spe_real_block; k++) {
533
mrses_spe_real_decompose(dist, cur_result, pack, at_once, dalloc, width, D, mean_copy);
542
for (i = 0; i < iterations; ++i)
544
memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
546
#ifndef HW_USE_BLOCKED_MULTIPLY
547
mrses_spe_real_multiply_recover(D, pack, at_once, dalloc, width, ralloc, walloc, Ca, Cb, drp_gen, rst_gen, Ea, Eb);
548
#endif /* HW_USE_BLOCKED_MULTIPLY */
550
for (m = 0, k = 0; m < pack; ++m) {
551
for (j = 0; j < at_once; ++j, ++k) {
552
index_k = index + k * walloc;
554
drp_gen[k] = rnd(width);
555
rpl_gen[k] = rnd(properties - width) + width;
557
if ((rst_gen[k] < width)&&(rst_gen[k] != drp_gen[k])) {
558
idx = index_k[rst_gen[k]];
559
//printf("%i, Restoring: %i to index %i\n", k, rst_gen[k], idx);
560
spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + rst_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
561
spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + rst_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
565
rpl_gen_k = rpl_gen[k];
566
if (rpl_gen_k < walloc) {
567
idx = index_k[rpl_gen[k]];
569
index_storage_k = index_storage + k * index_segment_size;
570
rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
572
// printf("%i, getting part of index %i, index segment %i\n", k + l, rpl_gen_k, rpl_gen_segment);
574
spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * properties + rpl_gen_segment), HW_ALIGN, rdch, MFC_GET_CMD);
575
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
577
idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
578
//printf("done, index: %i\n", idx);
581
// printf("%i, receiving data for %i, index %i (pack %i)\n", k + l, drp_gen[k], idx, m);
582
spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + drp_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
583
spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + drp_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
585
mean_copy[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
587
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
590
#ifdef HW_USE_BLOCKED_MULTIPLY
591
mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca, Cb);
592
#else /* HW_USE_BLOCKED_MULTIPLY */
593
mrses_spe_real_multiply_update(
595
pack, at_once, dalloc, width, ralloc, walloc, aA, aB,
596
A + m * ralloc * aA, B + m * ralloc * aB,
597
drp_gen + m * at_once,
598
Ea + m * at_once * walloc, Eb + m * at_once * walloc
600
#endif /* HW_USE_BLOCKED_MULTIPLY */
603
mrses_spe_real_decompose(dist, result, pack, at_once, dalloc, width, D, mean_copy);
605
for (m = 0, k = 0; m < pack; ++m) {
606
for (j = 0; j < at_once; ++j, ++k) {
607
if (result[k] < cur_result[k]) {
608
rst_gen[k] = drp_gen[k];
610
rst_gen[k] = width + 1;
611
cur_result[k] = result[k];
613
index_k = index + k * walloc;
615
rpl_gen_k = rpl_gen[k];
616
if (rpl_gen_k < walloc) {
617
idx = index_k[rpl_gen_k];
618
SWAPij(index_k, drp_gen[k], rpl_gen_k);
620
index_storage_k = index_storage + k * index_segment_size;
621
rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
622
idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
624
index_storage_k[rpl_gen_k - rpl_gen_segment] = index_k[drp_gen[k]];
626
//printf("%i, write back\n", k);
627
// write back segment
628
spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * properties + rpl_gen_segment), HW_ALIGN, rdch, MFC_PUT_CMD);
629
//(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
632
index_k[drp_gen[k]] = idx;
634
mean[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
641
// We can write back initial part of index, but we don't really need to
642
for (j = 0; j < spe_real_block; j++) {
643
spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * properties), walloc * sizeof(MRSESIntType), rdch, MFC_PUT_CMD);
645
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
647
for (k = 0; k < spe_real_block; k++) {
648
printf("SPU result, block %i: %e\n", l + k, cur_result[k]);
656
#include <simdmath.h>
657
int main(unsigned long long id __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) {
659
volatile SPUParameters param;
660
volatile MRSESContext mrses;
661
unsigned int rdch; //, wrch;
665
#endif /* FIX_RANDOM */
668
struct timeval tv1, tv2;
669
#endif /* TRACE_TIMINGS */
672
gettimeofday(&tv1, NULL);
673
#endif /* TRACE_TIMINGS */
675
rdch = mfc_tag_reserve();
676
if (rdch == MFC_TAG_INVALID) {
677
printf("Unable to reserve DMA tag, returning");
681
spu_writech(MFC_WrTagMask, 1 << rdch);
683
spu_mfcdma32(¶m, argp, sizeof(SPUParameters), rdch, MFC_GET_CMD);
684
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
687
size = calc_alloc(sizeof(MRSESContextS), HW_ALIGN);
688
mrses = (MRSESContext)malloc(size);
689
spu_mfcdma32(mrses, (unsigned int)(param.mrses), size, rdch, MFC_GET_CMD);
690
(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
694
#else /* FIX_RANDOM */
695
gettimeofday(&tv, NULL);
698
#endif /* FIX_RANDOM */
701
mrses_spe_iterate(param.block_group, mrses, rdch);
705
// local_data = NULL;
708
gettimeofday(&tv2, NULL);
709
printf("SPU 0x%llx: %li ms\n", id, (tv2.tv_sec - tv1.tv_sec)*1000+(tv2.tv_usec - tv1.tv_usec)/1000);
710
#endif /* TRACE_TIMINGS */