/ani/mrses

To get this branch, use:
bzr branch http://darksoft.org/webbzr/ani/mrses
1 by Suren A. Chilingaryan
Initial import
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <malloc.h>
4
#include <string.h>
5
#include <sys/time.h>
6
7
#include <blas_s.h>
8
#include <cblas.h>
9
#include <lapack.h>
10
11
#include <spu_intrinsics.h>
12
#include <spu_mfcio.h>
13
14
    // we should also consider HW_ALIGN (when change this values)
15
#define SPE_BLOCK 16		// Thats for BLAS/Lapack in items
16
17
#define HW_HIDE_DETAILS
18
#include "mrses_spe.h"
19
20
#define spe_scal(N,a,X,incX) sscal_spu(X, a, N)
21
#define spe_axpy(N, a, X, incX, Y, incY) saxpy_spu(X, Y, a, N)
22
#define spe_syrk(Order,Uplo,Trans, N, K, a, A, lda, b, C, ldc) \
23
    ssyrk_spu(A, C, a, N, K, lda, ldc)
24
25
26
#undef rnd
27
#define rnd(i) ((int)((i) * (rand() / (RAND_MAX + 1.0))))
28
29
#ifdef HW_USE_BLOCKED_MULTIPLY
30
# define calc_ralloc(width, walloc) walloc
31
#else
32
//# define calc_ralloc(width, walloc) ((width<13)?width:walloc)
33
# define calc_ralloc(width, walloc) width
34
#endif /* HW_USE_BLOCKED_MULTIPLY */
35
36
37
//#include "atlas_potrf.h"
38
#include "vec_potrf.h"
39
40
static inline int mrses_spe_real_multiply(
41
    MRSESDataType *D,
42
    short int pack, short int at_once,
43
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
44
    MRSESDataType *A, MRSESDataType *B,
45
    MRSESDataType *Ca, MRSESDataType *Cb
46
) {
47
    short int i, k, l;
48
49
//    PRINT_MATRIX("% 6.4f ", A, nA, 16, 16)
50
51
/*
52
    int rww = at_once * walloc * width;
53
    memset(Ca, 0, rww * sizeof(MRSESDataType));
54
    memset(Cb, 0, rww * sizeof(MRSESDataType));
55
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nA, 1, A, nA, 0, Ca, walloc);
56
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nB, 1, B, nB, 0, Cb, walloc);
57
*/
58
    
59
60
    vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
61
    vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
62
63
    short int c_step = width * (walloc + 1);
64
    for (i = 0; i < at_once; ++i) {
65
	short int c_offset = i * c_step;
66
	MRSESDataType *Ca_cur = Ca + c_offset;
67
	MRSESDataType *Cb_cur = Cb + c_offset;
68
69
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
70
	MRSESDataType *Db = Da + d_step;
71
	
72
	for (k = 0; k < width; ++k) {
73
	    for (l = 0; l < width; ++l) {
74
		Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
75
		Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
76
	    }
77
	}
78
    }
79
80
81
    return 0;
82
}
83
84
static vector unsigned int mask1 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
85
static vector unsigned int mask2 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
86
static vector unsigned int mask3 = {0xFFFFFFFF, 0, 0, 0xFFFFFFFF};
87
88
#define VECTOR_PRINT(var, val) \
89
    printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
90
91
#define iVECTOR_PRINT(var, val) \
92
    printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
93
94
95
#define REPACK(Da, Ca_cur) \
96
		r1 =              *(vector float*)(Ca_cur +             k * walloc + l); \
97
		r2 = spu_rlqwbyte(*(vector float*)(Ca_cur +     c_big + k * walloc + l), -4); \
98
		r3 = spu_rlqwbyte(*(vector float*)(Ca_cur + 2 * c_big + k * walloc + l), -8); \
99
		r4 = spu_rlqwbyte(*(vector float*)(Ca_cur + 3 * c_big + k * walloc + l), -12); \
100
		\
101
		r5 = spu_sel(r1, r2, mask1); \
102
		r6 = spu_sel(r3, r4, mask1); \
103
		\
104
		Da[k*width + l    ] = spu_sel(r5, r6, mask2); \
105
	        Da[k*width + l + 2] = spu_rlqwbyte(spu_sel(r6, r5, mask2), 8); \
106
		\
107
		r5 = spu_sel(r2, r1, mask1); \
108
		r6 = spu_sel(r4, r3, mask1); \
109
		\
110
		Da[k*width + l + 1] = spu_rlqwbyte(spu_sel(r5, r6, mask3), 4); \
111
		Da[k*width + l + 3] = spu_rlqwbyte(spu_sel(r6, r5, mask3), 12);
112
113
114
115
static inline int mrses_spe_real_multiply_recover(
116
    MRSESDataType *D,
117
    short int pack, short int at_once,
118
    short int d_step, short int width, short int ralloc, short int walloc,
119
    MRSESDataType *Ca, MRSESDataType *Cb,
120
    MRSESIntType *drp_gen, MRSESIntType *rst_gen,
121
    MRSESDataType *Ea, MRSESDataType *Eb
122
) {
123
    short int i, j, k, l, m;
124
    short int c_step = width * (walloc + 1);
125
    short int c_big = ralloc * walloc;
126
    short int e_big = at_once * walloc;
127
  
128
    for (i = 0; i < at_once; ++i) {
129
	short int e_offset = i * walloc;
130
	short int c_offset = i * c_step;
131
132
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
133
	MRSESDataType *Db = Da + d_step;
134
135
	for (m = 0; m < pack; ++m, c_offset += c_big, e_offset += e_big) {
136
	    short int gen = drp_gen[m * at_once + i];
137
138
	    if (rst_gen[m * at_once + i] > width) {
139
		for (j = 0; j < width; j++) {
140
		    if (j < gen) {
141
			l = j; k = gen;
142
		    } else {
143
			l = gen; k = j;
144
		    }
145
		
146
		    Ca[c_offset + k * walloc + l] = Ea[e_offset + j];
147
		    Cb[c_offset + k * walloc + l] = Eb[e_offset + j];
148
		}
149
	    }
150
	}
151
152
	    // Recovering to current stage
153
    	for (k = 0; k < width; ++k) {
154
	    for (l = 0; l < width; l += 4) {
155
		register vector float r1, r2, r3, r4, r5, r6;
156
		
157
		REPACK(((vector float*)Da), ((float*)Ca));
158
		REPACK(((vector float*)Db), ((float*)Cb));
159
		
160
	    }
161
	}
162
    }
163
    
164
    return 0;
165
}
166
167
168
169
static inline int mrses_spe_real_multiply_update(
170
    MRSESDataType *D,
171
    short int pack, short int at_once,
172
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
173
    MRSESDataType *A, MRSESDataType *B,
174
    MRSESIntType *drp_gen,
175
    MRSESDataType *Ea, MRSESDataType *Eb
176
) {
177
    short int i, j, k, l;
178
/*
179
    if (*rst_gen > width) {
180
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
181
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
182
    }
183
*/
184
185
    for (i = 0; i < at_once; ++i) {
186
	short int e_offset = i * walloc;
187
	short int gen = drp_gen[i];
188
189
	MRSESDataType *Ea_cur = Ea + e_offset;
190
	MRSESDataType *Eb_cur = Eb + e_offset;
191
192
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
193
	MRSESDataType *Db = Da + d_step;
194
	
195
        vec_update_row(gen, ralloc, nA, A, nA, Ea_cur);
196
        vec_update_row(gen, ralloc, nB, B, nB, Eb_cur);
197
	
198
	for (j = 0; j < width; j++) {
199
	    if (j < gen) {
200
		l = j; k = gen;
201
	    } else {
202
		l = gen; k = j;
203
	    }
204
205
//    if ((((int)D)%16)==0) {
206
//	    if (fabs(Da[pack * (k*width + l)] - Ea_cur[j])>0.001) {
207
//		printf("Updating (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Da[pack * (k*width + l)], Ea_cur[j]);
208
//	    }
209
//    }
210
	    Da[pack*(k * width + l)] = Ea_cur[j];
211
	    Db[pack*(k * width + l)] = Eb_cur[j];
212
	}
213
    }
214
215
    return 0;
216
}
217
218
static inline int mrses_spe_real_decompose(
219
    MRSESDistance dist, MRSESDataType *res,
220
    short int pack, short int at_once,
221
    short int step, short int width,
222
    MRSESDataType *D, MRSESDataType * mean
223
) {
224
    short int i, j;
225
    short int mstep = pack * width;
226
227
    vector unsigned int err;
228
    vector unsigned int error;
229
    vector float rcorr;
230
    vector float rmahal;
231
    vector float result;
232
    vector float zero = {0, 0, 0, 0};
233
    
234
    for (i = 0; i < at_once; ++i) {
235
    	MRSESDataType *C = D + 3 * i * step;
236
	MRSESDataType *Ca = C + step;
237
    	MRSESDataType *Cb = Ca + step;
238
	MRSESDataType *m = mean + i * mstep;
239
240
	memcpy(C, Ca, step * sizeof(MRSESDataType));
241
	spe_axpy(step, 1, Cb, 1, C, 1);
242
	spe_scal(step, 0.5, C, 1);
243
244
	error = vec_spotrf_u(width, C, width);
245
246
	err = vec_spotrf_u(width, Ca, width);
247
	error = spu_and(err, error);
248
249
	err = vec_spotrf_u(width, Cb, width);
250
	error = spu_and(err, error);
251
	
252
	rcorr = vec_rcorr(width, C, Ca, Cb);
253
	rmahal = vec_rmahal(width, C, m);
254
255
	switch (dist) {
256
	    case BHATTACHARYYA:
257
		result = vec_bhata(rcorr, rmahal);
258
	    break;
259
	    case MAHALANOBIS:
260
		result = rmahal;
261
	    break;
262
	    case CORCOR:
263
		result = rcorr;
264
	    break;
265
	    default:
266
		result = zero;
267
	}
268
269
	for (j = 0; j < pack; j++) {
270
	    if (*(((unsigned int*)&error)+j)) {
271
		res[j * at_once + i] = *(((float*)&result)+j);
272
//		printf("SPU result: %e (mahal: %e, corcor: %e)\n", *(((float*)&result)+j), *(((float*)&rmahal)+j), *(((float*)&rcorr)+j));
273
	    } else {
274
		res [j * at_once + i] = 0;
275
//		printf("SPU result: singular matrix, assuming 0 distance\n");
276
	    }
277
	}
278
    }
279
    
280
    return 0;
281
}
282
283
struct MRSESTemporaryDataT {
284
    MRSESDataType *A;
285
    MRSESDataType *B;
286
    MRSESDataType *Ca;
287
    MRSESDataType *Cb;
288
    MRSESDataType *Ea;
289
    MRSESDataType *Eb;
290
    MRSESDataType *D;
291
    MRSESDataType *mean;
292
    MRSESDataType *mean_copy;
293
    MRSESIntType *index;
294
    MRSESIntType *index_storage;
295
    MRSESDataType *Mfull;
296
};
297
typedef struct MRSESTemporaryDataT MRSESTemporaryDataS;
298
typedef struct MRSESTemporaryDataT *MRSESTemporaryData;
299
300
static unsigned char *local_data = NULL;
301
302
static inline MRSESTemporaryData mrses_spe_malloc(MRSESContext mrses, unsigned int rdch) {
303
    unsigned char *allocation;
304
    short int aA, aB;
305
    MRSESIntType properties;
306
    short int at_once, used_ralloc;
307
    short int pack;
308
    short int width;
309
    short int walloc, walloc2, dalloc, ralloc;
310
    short int index_segment_size; 
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
311
    int i, pos = 0, size;
1 by Suren A. Chilingaryan
Initial import
312
    int extra_rows = 0;
313
314
    MRSESTemporaryData data;
315
    
316
    properties = mrses->properties;
317
318
//    pos = calc_alloc(properties * sizeof(uint32_t), HW_ALIGN);
319
    if (local_data) return (MRSESTemporaryData)(local_data + pos);
320
321
    width = mrses->width;
322
    aA = calc_alloc(mrses->nA, SPE_BLOCK);
323
    aB = calc_alloc(mrses->nB, SPE_BLOCK);
324
    walloc = calc_alloc(width, SPE_BLOCK);
325
    ralloc = calc_ralloc(width, walloc);
326
    walloc2 = walloc * walloc;
327
    at_once = max(1, ralloc / width);
328
    pack = SIMD_BLOCK / sizeof(MRSESDataType);
329
    dalloc = calc_alloc(pack * width * width, 128);//64);
330
    index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
331
332
    used_ralloc = at_once * width;
333
    
334
    if ((ralloc < walloc)&&(ralloc > 12)) {
335
	 extra_rows = walloc - ralloc;
336
    }
337
338
    size = (
339
	//calc_alloc(properties * sizeof(uint32_t), HW_ALIGN) +							// result histogram
340
	calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN) +							// structure
341
	(pack * ralloc + extra_rows) * (aA + aB) * sizeof(MRSESDataType) +					// A, B		- source matrices
342
	2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) +					// Ca, Cb	- variances (temp)
343
	2 * pack * at_once * walloc * sizeof(MRSESDataType) +							// Ea, Eb	- variance updates
344
	    3 * at_once * dalloc * sizeof(MRSESDataType) +							// D		- variances (aligned)
345
	2 * pack * ralloc * sizeof(MRSESDataType) +								// mean, mean_copy
346
//	calc_alloc(mrses->iterate_size * properties * sizeof(MRSESIntType), HW_ALIGN) +
347
	(walloc + index_segment_size) * at_once * pack * sizeof(MRSESIntType) +					// index + exchange area
348
	calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN)						// Mfull, complete mean
349
    );
350
/*    
351
    printf("Allocating data: %i KB (A+B: %li, C: %li, D: %li, mean: %li, Mfull: %li)\n", size / 1024,
352
	pack * ralloc * (aA + aB) * sizeof(MRSESDataType) / 1024,
353
	2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) / 1024,
354
	3 * at_once * dalloc * sizeof(MRSESDataType) / 1024,
355
	2 * (pack * ralloc + extra_rows) * sizeof(MRSESDataType) / 1024,
356
	calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / 1024
357
    );
358
*/
359
    allocation = (unsigned char*)memalign(HW_ALIGN, size);
360
    if (!allocation) return NULL;
361
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
362
//    memset(allocation, 0, properties * sizeof(uint32_t));
1 by Suren A. Chilingaryan
Initial import
363
    
364
    data = (MRSESTemporaryData)(allocation + pos);
365
    pos += calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN);
366
367
    data->A = (MRSESDataType*)(allocation + pos);
368
    pos += (pack * ralloc + extra_rows) * aA * sizeof(MRSESDataType);
369
370
    data->B = (MRSESDataType*)(allocation + pos);
371
    pos += (pack * ralloc + extra_rows) * aB * sizeof(MRSESDataType);
372
373
    data->Ca = (MRSESDataType*)(allocation + pos);
374
    pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
375
376
    data->Cb = (MRSESDataType*)(allocation + pos);
377
    pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
378
379
    data->Ea = (MRSESDataType*)(allocation + pos);
380
    pos += pack * at_once * walloc * sizeof(MRSESDataType);
381
382
    data->Eb = (MRSESDataType*)(allocation + pos);
383
    pos += pack * at_once * walloc * sizeof(MRSESDataType);
384
    
385
    data->D = (MRSESDataType*)(allocation + pos);
386
    pos += 3 * at_once * dalloc * sizeof(MRSESDataType);
387
388
    data->mean = (MRSESDataType*)(allocation + pos);
389
    pos += pack * ralloc * sizeof(MRSESDataType);
390
391
    data->mean_copy = (MRSESDataType*)(allocation + pos);
392
    pos += pack * ralloc * sizeof(MRSESDataType);
393
394
    data->index = (MRSESIntType*)(allocation + pos);
395
    pos += walloc * at_once * pack * sizeof(MRSESIntType);
396
397
    data->index_storage = (MRSESIntType*)(allocation + pos);
398
    pos += index_segment_size * at_once * pack * sizeof(MRSESIntType);
399
400
    data->Mfull = (MRSESDataType*)(allocation + pos);
401
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
402
    size = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN);
403
    for (i = 0; i < size; i += SPE_MAX_TRANSFER_SIZE) {
404
        spu_mfcdma32((void*)(((char *)(data->Mfull)) + i), (unsigned int)(((char*)mrses->mean) + i), min(size - i, SPE_MAX_TRANSFER_SIZE), rdch, MFC_GET_CMD);
405
    }
1 by Suren A. Chilingaryan
Initial import
406
407
    local_data = (void*)allocation;
408
409
    if (used_ralloc < ralloc) {
410
	    // we need to do it, to clean the end parts for non-even case
411
	memset(data->A, 0, pack * (ralloc) * aA * sizeof(MRSESDataType));
412
	memset(data->B, 0, pack * (ralloc) * aB * sizeof(MRSESDataType));
413
414
	memset(data->mean, 0, pack * ralloc * sizeof(MRSESDataType));
415
	memset(data->mean_copy, 0, pack * ralloc * sizeof(MRSESDataType));
416
417
    }
418
419
    return data;
420
}
421
422
423
int mrses_spe_iterate(int block_group, MRSESContext mrses, unsigned int rdch) {
424
    int idx;
425
    short int j, k, l, m;
426
427
    short int width = mrses->width;
428
    short int walloc = calc_alloc(width, SPE_BLOCK);
429
    short int alloc = mrses->alloc;
430
    short int ralloc = calc_ralloc(width, walloc);
431
    short int nA = mrses->nA;
432
    short int cA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN);
433
    short int aA = calc_alloc(nA, SPE_BLOCK);	// aligned size
434
    short int nB = mrses->nB;
435
    short int cB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN);
436
    short int aB = calc_alloc(nB, SPE_BLOCK);
437
    MRSESDistance dist = mrses->dist;
438
439
    MRSESIntType properties = mrses->properties;
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
440
    MRSESIntType palloc = mrses->palloc;
1 by Suren A. Chilingaryan
Initial import
441
    int i, iterations = mrses->iterations;
442
443
    short int iterate_size = mrses->iterate_size;
444
    short int at_once = max(1, ralloc / width);
445
    short int pack = SIMD_BLOCK / sizeof(MRSESDataType);
446
    short int dalloc = calc_alloc(pack * width * width, 128);
447
    short int index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
448
449
    int spe_real_block = pack * at_once;
450
    MRSESIntType rpl_gen_k, rpl_gen_segment;
451
    MRSESIntType drp_gen[spe_real_block], rpl_gen[spe_real_block], rst_gen[spe_real_block];
452
    MRSESDataType result[spe_real_block], cur_result[spe_real_block];
453
//    MRSESDataType *result = mrses->result + block;
454
    
455
    volatile MRSESDataType *Afull = mrses->A;
456
    volatile MRSESDataType *Bfull = mrses->B;
457
    MRSESIntType *mrses_index = mrses->index;
458
459
    MRSESTemporaryData data;
460
461
    MRSESDataType *A;
462
    MRSESDataType *B;
463
    MRSESDataType *Ca;
464
    MRSESDataType *Cb;
465
    MRSESDataType *Ea;
466
    MRSESDataType *Eb;
467
    MRSESDataType *D;
468
    MRSESDataType *mean;
469
    MRSESDataType *mean_copy;
470
471
    volatile MRSESIntType *index_storage, *index_storage_k;
472
    volatile MRSESIntType *index, *index_k;
473
    volatile MRSESDataType *Mfull;
474
475
#ifdef USE_FAST_RANDOM
476
    unsigned int g_seed;
477
    struct timeval tvseed;
478
#endif /* USE_FAST_RANDOM */
479
480
#ifdef USE_FAST_RANDOM
481
    gettimeofday(&tvseed, NULL);
482
    g_seed = tvseed.tv_usec;
483
#endif /* USE_FAST_RANDOM */
484
485
    data = mrses_spe_malloc(mrses, rdch);
486
    if (!data) {
487
	printf("Memory allocation failed\n");
488
	exit(1);
489
    }
490
491
    A = data->A;
492
    B = data->B;
493
    Ca = data->Ca;
494
    Cb = data->Cb;
495
    Ea = data->Ea;
496
    Eb = data->Eb;
497
    D = data->D;
498
    mean = data->mean;
499
    mean_copy = data->mean_copy;
500
    index = data->index;
501
    index_storage = data->index_storage;
502
    Mfull = data->Mfull;
503
504
//    printf("Iterate %i, pack %i, at_once %i ralloc %i walloc %i\n", iterate_size, pack, at_once, ralloc, walloc);
505
    for (l = 0; l < iterate_size; l += pack * at_once) {
506
507
	for (j = 0; j < spe_real_block; j++) {
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
508
	    spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * palloc), walloc * sizeof(MRSESIntType), rdch, MFC_GET_CMD);
1 by Suren A. Chilingaryan
Initial import
509
	}
510
511
	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
512
513
	for (m = 0; m < pack; m++) {
514
	    for (j = 0, k = 0; j < at_once; ++j) {
515
		//printf("SPE: %i = %i %i %i\n", l + m * at_once + j, l, m, j);
516
		index_k = index + (m * at_once + j) * walloc;
517
		for (i = 0; i < width; ++i, ++k) {
518
		    idx = index_k[i];
519
		    //printf(" * %i\n", idx);
520
		    spu_mfcdma32((void *)(A + ((int)(m * ralloc + k)) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
521
		    spu_mfcdma32((void *)(B + ((int)(m * ralloc + k)) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
522
		    mean[k * pack + m] = Mfull[idx];
523
		}
524
	    }
525
526
	    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
527
528
	    mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca + m * ralloc * walloc, Cb + m * ralloc * walloc);
529
	}
530
531
	memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
532
533
	for (k = 0; k < spe_real_block; k++) {
534
	    rst_gen[k] = width;
535
	}
536
537
	mrses_spe_real_decompose(dist, cur_result, pack, at_once, dalloc, width, D, mean_copy);
538
539
540
//    }
541
//    return 0;
542
	    
543
//    {
544
545
546
	for (i = 0; i < iterations; ++i) 
547
	{
548
	    memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
549
550
#ifndef HW_USE_BLOCKED_MULTIPLY
551
	    mrses_spe_real_multiply_recover(D, pack, at_once, dalloc, width, ralloc, walloc, Ca, Cb, drp_gen, rst_gen, Ea, Eb);
552
#endif /* HW_USE_BLOCKED_MULTIPLY */
553
554
	    for (m = 0, k = 0; m < pack; ++m) {
555
		for (j = 0; j < at_once; ++j, ++k) {
556
		    index_k = index + k * walloc;
557
558
	    	    drp_gen[k] = rnd(width);
559
		    rpl_gen[k] = rnd(properties - width) + width;
560
	
561
		    if ((rst_gen[k] < width)&&(rst_gen[k] != drp_gen[k])) {
562
			idx = index_k[rst_gen[k]];
563
			//printf("%i, Restoring: %i to index %i\n", k, rst_gen[k], idx);
564
	    		spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + rst_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
565
			spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + rst_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
566
		    }
567
568
		    
569
		    rpl_gen_k = rpl_gen[k];
570
		    if (rpl_gen_k < walloc) {
571
			idx = index_k[rpl_gen[k]];
572
		    } else {
573
			index_storage_k = index_storage + k * index_segment_size;
574
			rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
575
576
//			printf("%i, getting part of index %i, index segment %i\n", k + l, rpl_gen_k, rpl_gen_segment);
577
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
578
			spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * palloc + rpl_gen_segment), HW_ALIGN, rdch, MFC_GET_CMD);
1 by Suren A. Chilingaryan
Initial import
579
			(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
580
			
581
			idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
582
			//printf("done, index: %i\n", idx);
583
		    }
584
			
585
//		    printf("%i, receiving data for %i, index %i (pack %i)\n", k + l, drp_gen[k], idx, m);
586
		    spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + drp_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
587
		    spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + drp_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
588
		    //puts("done");
589
		    mean_copy[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
590
		}
591
	        (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
592
593
594
#ifdef HW_USE_BLOCKED_MULTIPLY
595
		mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca, Cb);
596
#else  /* HW_USE_BLOCKED_MULTIPLY */
597
		mrses_spe_real_multiply_update(
598
		    D + m, 
599
		    pack, at_once, dalloc, width, ralloc, walloc, aA, aB, 
600
		    A + m * ralloc * aA, B + m * ralloc * aB, 
601
		    drp_gen + m * at_once,
602
		    Ea + m * at_once * walloc, Eb + m * at_once * walloc
603
		);
604
#endif /* HW_USE_BLOCKED_MULTIPLY */
605
	    }
606
	
607
	    mrses_spe_real_decompose(dist, result, pack, at_once, dalloc, width, D, mean_copy);
608
	    
609
	    for (m = 0, k = 0; m < pack; ++m) {
610
		for (j = 0; j < at_once; ++j, ++k) {
611
		    if (result[k] < cur_result[k]) {
612
			rst_gen[k] = drp_gen[k];
613
		    } else {
614
			rst_gen[k] = width + 1;
615
		    	cur_result[k] = result[k];
616
617
			index_k = index + k * walloc;
618
			
619
			rpl_gen_k = rpl_gen[k];
620
			if (rpl_gen_k < walloc) {
621
			    idx = index_k[rpl_gen_k];
622
			    SWAPij(index_k, drp_gen[k], rpl_gen_k);
623
			} else {
624
			    index_storage_k = index_storage + k * index_segment_size;
625
			    rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
626
			    idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
627
			    
628
			    index_storage_k[rpl_gen_k - rpl_gen_segment] = index_k[drp_gen[k]];
629
			    
630
			    //printf("%i, write back\n", k);
631
				// write back segment
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
632
			    spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * palloc + rpl_gen_segment), HW_ALIGN, rdch, MFC_PUT_CMD);
1 by Suren A. Chilingaryan
Initial import
633
		    	    //(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
634
			    
635
			    //puts("done");
636
			    index_k[drp_gen[k]] = idx;
637
			}
638
			mean[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
639
		    }
640
		}
641
	    }
642
643
	}
644
645
	    // We can write back initial part of index, but we don't really need to
646
	for (j = 0; j < spe_real_block; j++) {
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
647
	    spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * palloc), walloc * sizeof(MRSESIntType), rdch, MFC_PUT_CMD);
1 by Suren A. Chilingaryan
Initial import
648
	}
649
	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
650
/*
651
	for (k = 0; k < spe_real_block; k++) {
652
	    printf("SPU result, block %i: %e\n", l + k, cur_result[k]);
653
	}
654
*/
655
    }
656
657
    return 0;
658
}
659
660
#include <simdmath.h>
661
int main(unsigned long long id __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) {
662
    short int size;
663
    volatile SPUParameters param;
664
    volatile MRSESContext mrses;
665
    unsigned int rdch; //, wrch;
666
667
#ifndef FIX_RANDOM
668
    struct timeval tv;
669
#endif /* FIX_RANDOM */
670
671
#ifdef TRACE_TIMINGS
672
    struct timeval tv1, tv2;
673
#endif /* TRACE_TIMINGS */
674
675
#ifdef TRACE_TIMINGS
676
    gettimeofday(&tv1, NULL);
677
#endif /* TRACE_TIMINGS */
678
    
679
    rdch = mfc_tag_reserve();
680
    if (rdch == MFC_TAG_INVALID) {
681
	printf("Unable to reserve DMA tag, returning");
682
	exit(1);
683
    }
684
685
    spu_writech(MFC_WrTagMask, 1 << rdch);
686
687
    spu_mfcdma32(&param, argp, sizeof(SPUParameters), rdch, MFC_GET_CMD);
688
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
689
690
691
    size = calc_alloc(sizeof(MRSESContextS), HW_ALIGN);
692
    mrses = (MRSESContext)malloc(size);
693
    spu_mfcdma32(mrses, (unsigned int)(param.mrses), size, rdch, MFC_GET_CMD);
694
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
695
696
#ifdef FIX_RANDOM
697
    srand(FIX_RANDOM);
698
#else /* FIX_RANDOM */
699
    gettimeofday(&tv, NULL);
700
    srand(tv.tv_usec);
701
    //srand(block);
702
#endif /* FIX_RANDOM */
703
704
705
    mrses_spe_iterate(param.block_group, mrses, rdch);
706
    
707
	// Optimize?
708
//    free(local_data);
709
//    local_data = NULL;
710
711
#ifdef TRACE_TIMINGS
712
    gettimeofday(&tv2, NULL);
713
    printf("SPU 0x%llx: %li ms\n", id, (tv2.tv_sec - tv1.tv_sec)*1000+(tv2.tv_usec - tv1.tv_usec)/1000);
714
#endif /* TRACE_TIMINGS */
715
716
    return 0;
717
}