/ani/mrses

To get this branch, use:
bzr branch http://darksoft.org/webbzr/ani/mrses
1 by Suren A. Chilingaryan
Initial import
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <errno.h>
4
#include <string.h>
5
#include <sys/time.h>
6
7
#include <cblas.h>
8
9
#include <assert.h>
10
11
#include "msg.h"
12
#include "hw_sched.h"
13
#include "mrses_impl.h"
14
15
enum MRSESEntriesT {
16
    MRSES_RUN_ENTRY = 0,
17
    MRSES_ITERATE_ENTRY = 1
18
};
19
typedef enum MRSESEntriesT MRSESEntries;
20
21
MRSESContext mrses_create_context() {
22
    MRSESContext ctx;
23
    posix_memalign((void*)&ctx, HW_ALIGN, calc_alloc(sizeof(MRSESContextS), HW_ALIGN));
24
    if (ctx) {
25
        memset(ctx, 0, sizeof(MRSESContextS));
26
    }
27
    return ctx;
28
}
29
30
int mrses_init_context(MRSESContext ctx, int properties, int nA, const MRSESDataType *A, int nB, const MRSESDataType *B) {
31
    int err = 0;
32
    int i;
33
    int nAB;
34
35
    int alloc;
36
    int palloc;
37
38
#ifdef HW_HAVE_SPU
39
    int aA, aB;
40
#endif /* HW_HAVE_SPU */
41
42
    int matrix_size;
43
    MRSESDataType *ones, *meanA, *meanB;
44
45
    assert(ctx);
46
    assert(A);
47
    assert(B);
48
    assert(nA > 0);
49
    assert(nB > 0);
50
    assert(properties > 0);
51
52
    mrses_free_context(ctx);
53
54
    nAB = max(nA, nB);    
55
    alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
56
    palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
57
    matrix_size = properties * alloc;
58
    
59
    ctx->nA = nA;
60
    ctx->nB = nB;
61
    ctx->alloc = alloc;
62
    ctx->properties = properties;    
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
63
    ctx->palloc = calc_alloc(properties * sizeof(MRSESIntType), HW_ALIGN) / sizeof(MRSESIntType);
1 by Suren A. Chilingaryan
Initial import
64
65
    posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
66
    posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
67
    posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
68
    posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
69
    posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
70
71
    meanA = ctx->mean;
72
73
    if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
74
	if (ones) free(ones);
75
	if (meanB) free(meanB);
76
	return 1;
77
    }
78
	
79
    for (i = 0; i < nAB; i++) 
80
	ones[i] = 1.;
81
    
82
    memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
83
//    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
84
//    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
85
86
    for (i = 0; i < properties; i++) {
87
	memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
88
    }
89
90
#ifdef HW_HAVE_SPU
91
    aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
92
    if (aA) {
93
	for (i = 0; i < properties; i++) {
94
	    memset(ctx->A + i * alloc + nA, 0, aA);
95
	}
96
    }
97
#endif /* HW_HAVE_SPU */
98
99
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
100
    for (i = 0; i < nA; i++) {
101
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
102
    }
103
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
104
105
106
    for (i = 0; i < properties; i++) {
107
	memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
108
    }
109
110
#ifdef HW_HAVE_SPU
111
    aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
112
    if (aB) {
113
	for (i = 0; i < properties; i++) {
114
	    memset(ctx->B + i * alloc + nB, 0, aB);
115
	}
116
    }
117
#endif /* HW_HAVE_SPU */
118
119
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
120
    for (i = 0; i < nB; i++) {
121
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
122
    }
123
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
124
125
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
126
127
128
/*
129
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
130
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
131
132
    matrix_size = properties * nA;
133
    memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
134
    for (i = 0; i < nA; i++) {
135
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
136
    }
137
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
138
139
    matrix_size = properties * nB;
140
    memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
141
    for (i = 0; i < nB; i++) {
142
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
143
    }
144
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
145
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
146
*/    
147
148
/*
149
    //printf("%f %f.\n", meanA[4], meanA[5]);
150
    //printf("%f %f.\n", meanB[4], meanB[5]);
151
*/
152
153
154
155
    ctx->sched = hw_sched_create();
156
    hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
157
158
    free(ones);
159
    free(meanB);
160
    
161
    return err;
162
}
163
164
int mrses_init_context_from_double(MRSESContext ctx, int properties, int nA, const double *A, int nB, const double *B) {
165
    int err = 0;
166
    int i, j;
167
    int nAB;
168
169
    int alloc;
170
    int palloc;
171
172
#ifdef HW_HAVE_SPU
173
    int aA, aB;
174
#endif /* HW_HAVE_SPU */
175
176
    int matrix_size;
177
    MRSESDataType *ones, *meanA, *meanB;
178
179
    assert(ctx);
180
    assert(A);
181
    assert(B);
182
    assert(nA > 0);
183
    assert(nB > 0);
184
    assert(properties > 0);
185
186
    mrses_free_context(ctx);
187
188
    nAB = max(nA, nB);    
189
    alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
190
    palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
191
    matrix_size = properties * alloc;
192
    
193
    ctx->nA = nA;
194
    ctx->nB = nB;
195
    ctx->alloc = alloc;
196
    ctx->properties = properties;    
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
197
    ctx->palloc = calc_alloc(properties * sizeof(MRSESIntType), HW_ALIGN) / sizeof(MRSESIntType);
1 by Suren A. Chilingaryan
Initial import
198
199
    posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
200
    posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
201
    posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
202
    posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
203
    posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
204
205
    meanA = ctx->mean;
206
207
    if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
208
	if (ones) free(ones);
209
	if (meanB) free(meanB);
210
	return 1;
211
    }
212
	
213
    for (i = 0; i < nAB; i++) 
214
	ones[i] = 1.;
215
    
216
    memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
217
218
219
    for (i = 0; i < properties; i++) {
220
	/* DS: Change1, 
221
	memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
222
	*/
223
	for (j = 0; j < nA; j++) {
224
	    ctx->A[i * alloc + j] = A[i * nA + j];
225
	}
226
    }
227
228
#ifdef HW_HAVE_SPU
229
    aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
230
    if (aA) {
231
	for (i = 0; i < properties; i++) {
232
	    memset(ctx->A + i * alloc + nA, 0, aA);
233
	}
234
    }
235
#endif /* HW_HAVE_SPU */
236
237
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
238
    for (i = 0; i < nA; i++) {
239
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
240
    }
241
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
242
243
244
    for (i = 0; i < properties; i++) {
245
	/* DS: Change2 
246
	    memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
247
	*/
248
	for (j = 0; j < nA; j++) {
249
	    ctx->B[i * alloc + j] = B[i * nA + j];
250
	}
251
    }
252
253
#ifdef HW_HAVE_SPU
254
    aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
255
    if (aB) {
256
	for (i = 0; i < properties; i++) {
257
	    memset(ctx->B + i * alloc + nB, 0, aB);
258
	}
259
    }
260
#endif /* HW_HAVE_SPU */
261
262
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
263
    for (i = 0; i < nB; i++) {
264
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
265
    }
266
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
267
268
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
269
270
271
/*
272
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
273
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
274
275
    matrix_size = properties * nA;
276
    memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
277
    for (i = 0; i < nA; i++) {
278
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
279
    }
280
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
281
282
    matrix_size = properties * nB;
283
    memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
284
    for (i = 0; i < nB; i++) {
285
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
286
    }
287
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
288
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
289
*/    
290
291
/*
292
    //printf("%f %f.\n", meanA[4], meanA[5]);
293
    //printf("%f %f.\n", meanB[4], meanB[5]);
294
*/
295
296
297
298
    ctx->sched = hw_sched_create();
299
    hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
300
301
    free(ones);
302
    free(meanB);
303
    
304
    return err;
305
}
306
307
308
309
int mrses_set_distance_mode(MRSESContext ctx, MRSESDistance dist) {
310
    assert(ctx); 
311
    assert(dist < MRSES_DISTANCE_LAST);
312
313
    ctx->dist = dist;
314
    return 0;
315
}
316
317
int mrses_prepare(MRSESContext ctx, int width, int block_size) {
318
#ifdef HW_HAVE_SPU
319
    int at_once, pack;
320
    int index_block;
321
#endif /* HW_HAVE_SPU */
322
323
    assert(ctx);
324
325
    ctx->width = width;
326
327
#ifdef HW_HAVE_SPU
328
# ifdef HW_USE_BLOCKED_MULTIPLY
329
    at_once = SPE_BLOCK / width;
330
    if (at_once < 1) at_once = 1;
331
# else /* HW_USE_BLOCKED_MULTIPLY */
332
    at_once = 1;
333
#endif /* HW_USE_BLOCKED_MULTIPLY */
334
335
    pack = SIMD_BLOCK / sizeof(MRSESDataType);
336
    at_once *= pack;
337
        
338
    index_block = HW_ALIGN / min(sizeof(MRSESDataType), sizeof(MRSESIntType));
339
340
    ctx->iterate_size = lcm3(at_once, index_block, HW_ITERATE_BLOCKS);
341
#else /* HW_HAVE_SPU */
342
    ctx->iterate_size = HW_ITERATE_BLOCKS;
343
#endif /* HW_HAVE_SPU */
344
345
    reportMessage("block size: %i", ctx->iterate_size);
346
347
    ctx->max_block_size = calc_alloc(block_size, ctx->iterate_size);
348
    
349
    return 0;
350
}
351
352
void mrses_free_context(MRSESContext ctx) {
353
    assert(ctx);
354
355
    if (ctx->sched) hw_sched_destroy(ctx->sched);
356
    
357
    if (ctx->index) free(ctx->index);
358
    if (ctx->mean) free(ctx->mean);
359
    if (ctx->B) free(ctx->B);
360
    if (ctx->A) free(ctx->A);
361
    
362
    memset(ctx, 0, sizeof(MRSESContextS));
363
}
364
365
void mrses_destroy_context(MRSESContext ctx) {
366
    mrses_free_context(ctx);
367
    free(ctx);
368
}
369
370
371
int mrses_compute(MRSESContext ctx, int block_size, MRSESIntType *index, MRSESDataType *res) {
372
    assert(ctx);
373
    assert(block_size > 0);
374
    assert(block_size <= ctx->max_block_size);
375
    assert(index);
376
    assert(res);
377
    assert(!ctx->index);
378
379
    ctx->block_size = block_size;
380
    ctx->cur_chunk = 0;
381
    ctx->index = index;
382
    ctx->result = res;
383
    
384
    hw_sched_schedule_task(ctx->sched, ctx, MRSES_RUN_ENTRY);
385
    hw_sched_wait_task(ctx->sched);
386
387
    ctx->index = NULL;
388
    
389
    return 0;
390
}
391
392
int mrses_iterate(MRSESContext ctx, int iterations, int block_size, MRSESIntType *ires) {
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
393
    int i, width, properties, palloc;
1 by Suren A. Chilingaryan
Initial import
394
#ifndef FIX_RANDOM
395
    struct timeval tv;
396
#endif /* FIX_RANDOM */
397
398
    MRSESIntType *index, *iptr, *iend; 
399
    
400
    assert(ctx);
401
    assert(iterations > 0);
402
    assert(block_size > 0);
403
    assert(block_size <= ctx->max_block_size);
404
405
    block_size = calc_alloc(block_size, ctx->iterate_size);
406
407
    width = ctx->width;
408
    properties = ctx->properties;
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
409
    palloc = ctx->palloc;
1 by Suren A. Chilingaryan
Initial import
410
411
#ifdef FIX_RANDOM
412
    srandom(FIX_RANDOM);
413
#else /* FIX_RANDOM */
414
    gettimeofday(&tv, NULL);
415
    srandom(tv.tv_usec);
416
#endif /* FIX_RANDOM */
417
418
    if (ctx->index) {
419
        if (!ires) hw_sched_wait_task(ctx->sched);
420
	index = ctx->index;
421
    } else {
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
422
        posix_memalign((void*)&index, HW_ALIGN, (block_size * palloc)*sizeof(MRSESIntType));
1 by Suren A. Chilingaryan
Initial import
423
	if (!index) {
424
	    return 1;
425
	}
426
    }
427
428
    for (i = 0; i < properties; i++) {
429
	index[i] = i;
430
    }
431
2 by Suren A. Chilingaryan
Fix SPE crashes on big (above 4096) and non-power-of-2 number of properties
432
    iend = index + block_size * palloc;    
433
    for (iptr = index + palloc; iptr < iend; iptr += palloc) {
1 by Suren A. Chilingaryan
Initial import
434
	memcpy(iptr, index, properties * sizeof(MRSESIntType));
435
	
436
	for (i = 0; i < width; i++) {
437
	    SWAPi(iptr, properties, i);
438
	}
439
    }
440
    for (i = 0; i < width; i++) {
441
	SWAPi(index, properties, i);
442
    }
443
444
/*
445
    for (i = 0; i < 5; i++) {
446
	int j
447
	for (j = 0; j < 5; j++) {
448
	    printf("%5i ", index[i * properties + j]);
449
	}
450
	printf("\n");
451
    }
452
    printf("\n");
453
*/
454
    
455
    ctx->block_size = block_size / ctx->iterate_size;
456
    ctx->cur_chunk = 0;
457
458
    ctx->iterations = iterations;
459
    ctx->index = index;
460
    ctx->ires = ires;
461
    ctx->result = NULL;
462
463
    hw_sched_schedule_task(ctx->sched, ctx, MRSES_ITERATE_ENTRY);
464
465
    if (ires) {
466
	hw_sched_wait_task(ctx->sched);
467
	free(ctx->index); ctx->index = NULL;
468
    }
469
470
    return 0;
471
}
472
473
int mrses_get_results(MRSESContext ctx, uint32_t *hist) {
474
    int i, j;
475
    
476
    HWSched sched = ctx->sched;
477
    HWThread *thr = sched->thread;
478
    int n_threads = sched->n_threads;
479
    int properties = ctx->properties;
480
    
481
    uint32_t *thr_hist = NULL;
482
483
    if (ctx->index) {
484
        hw_sched_wait_task(ctx->sched);
485
	free(ctx->index); ctx->index = NULL;
486
    }
487
488
    for (i = 0; i < n_threads; i++) {
489
	thr_hist = (uint32_t*)(thr[i]->data);
490
	if (thr_hist) break;
491
    }
492
    
493
    if (thr_hist) {
494
	memcpy(hist, thr_hist, properties * sizeof(uint32_t));
495
	memset(thr_hist, 0, properties * sizeof(uint32_t));
496
    }
497
    
498
    for (++i; i < n_threads; i++) {
499
	thr_hist = (uint32_t*)(thr[i]->data);
500
	if (!thr_hist) continue;
501
	
502
	for (j = 0; j < properties; j++) {
503
	    hist[j] += thr_hist[j];
504
	}
505
	memset(thr_hist, 0, properties * sizeof(uint32_t));
506
    }
507
    
508
    return 0;
509
}