/ani/mrses

To get this branch, use:
bzr branch http://darksoft.org/webbzr/ani/mrses
1 by Suren A. Chilingaryan
Initial import
1
This file contains some ideas which was slower or not comed to the end due
2
their nature or my inabilities or for time saving purposes.
3
4
1. The general idea is to computer full 16x16 matrix until the end. We got
5
several quadratic matrices along the diagonal with which are the actual 
6
C matrixes and we padding the rest of diagonal with 1. Everything else is 0.
7
Then we can compute Cholesky and solve equations on all matrices together
8
using SPE-optimized blas. However, this approach is twice slower compared 
9
to computing matrix by matrix using non-optimized blas.
10
11
    for (j = 1; j < at_once; j++) {
12
	int offset = j * width * real_width;
13
	MRSESDataType *C_cur = C + offset;
14
	MRSESDataType *Ca_cur = Ca + offset;
15
	MRSESDataType *Cb_cur = Cb + offset;
16
17
	for (i = 0; i < real_width; i++) {
18
	    memset(C_cur + i * width, 0, j * real_width * sizeof(MRSESDataType));
19
	}
20
    }
21
    for (j = at_once * real_width; j < width; j++) {
22
	memset(C + j * width, 0, width * sizeof(MRSESDataType));
23
	C[j * width + j] = 1;
24
    }
25
26
27
Solving equation can be done with following commands, but I have not managed to
28
get correct results. Anyway the trsv uses neglectable amount of time. Most 
29
things are done in matrix multiplication and cholesky decomposition.
30
31
    transpose_matrix(width, width, C, width, D, width);
32
    MRSESDataType *D_cur = D + offset;
33
    solve_upper_1(real_width, D_cur, width, mean_cur);
34
	
35
        // multiple of 16
36
    strsv_spu_lower(width, C, width, mean);	
37
38
39
2. Another idea was to compute [Ca]*[Cb] as [Ca*Cb]. For that we could 
40
multiply Ca*Cb and then multiply by transpose of it (Ca*Cb)' to get 
41
symmetric matrix. Finally we should get the square of determinat.
42
But, the resulting symmetric matrix is not positive-definite (or for
43
precision reasons), the cholesky decomposition have failed for it.
44
45
    for (i = 0; i < width; i++) {
46
	for (j = 0; j <= i; j++) {
47
	    Ca[j * width + i] = Ca[i * width + j];
48
	    Cb[j * width + i] = Cb[i * width + j];
49
	}
50
    }
51
52
    memset(Cab, 0, width*width*sizeof(MRSESDataType));
53
    //sgemm_spu(width, width, width, Ca, Cb, Cab);
54
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, width, width, width, 1, Ca, width, Cb, width, 0, Cab, width);
55
56
    memset(Cab2, 0, width*width*sizeof(MRSESDataType));
57
    blas_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
58
    //spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
59
    for ...
60
	lapack_potrf(&hmode, &real_width, Cab_cur, &width, err);
61
62
        detC = C_cur[0]; detAB = Cab_cur[0];
63
	for (i = width + 1; i < c_step; i+= (width+1)) {
64
	    detAB *= Cab_cur[i];
65
	    detC *= C_cur[i];
66
	}
67
	rcorr = logf(detC * detC * detC * detC / detAB);
68
    }
69
70
71
3. Following code is matrix multiplocation using default (non-SPE) blas. Even
72
if used with appropriate small matrices it is significantly slower than IBM
73
code on full matrix.
74
75
/*    
76
    char hmode = 'L';
77
78
	//original (do not store C)
79
    //blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
80
    //blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);
81
82
    blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 50, 1, A, nA, 0, Ca, width);
83
    blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 40, 1, B, nB, 0, Cb, width);
84
    
85
    memcpy(C, Ca, width2 * sizeof(MRSESDataType));
86
    blas_axpy(width2, 1, Cb, 1, C, 1);
87
    blas_scal(width2, 0.5, C, 1);
88
*/
89
90
4. Other ways to solve equation system. Both works but a bit slower
91
/*
92
	    // Faster 82:
93
	blas_trsm(
94
	    CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit,
95
	    1, real_width, 1, C_cur, width, mean_cur, width //1?
96
	);
97
98
	    // Slower 83-84
99
	blas_trsm(
100
	    CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
101
	    real_width, 1, 1, C_cur, width, mean_cur, 1 //width //1?
102
	);
103
*/
104
105
5. Other ways to perform cholesky decomposition. The lapack version is surely
106
working. Curretnly implemented seems to work, but who nows about special cases.
107
108
	atlas_spotrf2(real_width, C_cur, width);
109
	atlas_spotrf2(real_width, Ca_cur, width);
110
	atlas_spotrf2(real_width, Cb_cur, width);
111
112
	lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
113
	if (err) return 1;
114
	lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
115
	if (err) return 1;
116
	lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
117
	if (err) return 1;
118
119
120
6. This is fully working, but not vectorized version of code
121
static inline int mrses_spe_real_run(
122
    MRSESContext mrses, MRSESDataType *result,
123
    int width, int width2, int nA, int nB,
124
    MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean,
125
    MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb
126
) {
127
    int i, j, err;
128
    
129
    MRSESDataType detAB, detC;
130
    MRSESDataType rmahal, rcorr;
131
    
132
    short int walloc;
133
134
    char hmode = 'U';
135
136
    int real_width = 5;
137
//    int iterate_size = mrses->iterate_size;
138
    int at_once = max(1, SPE_BLOCK / real_width);
139
//    int times = iterate_size / at_once;
140
141
    int rww = at_once * width * real_width;
142
    int rww_alloc = calc_alloc(rww, 64);
143
144
    memset(Ca, 0, rww * sizeof(MRSESDataType));
145
    memset(Cb, 0, rww * sizeof(MRSESDataType));
146
147
    //PRINT_MATRIX("% 6.4f ", A, 16, 16, 16)
148
149
	// that works on multiples of 16
150
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
151
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);
152
153
	// thats works on multiples of 64 (16x16 - fine)
154
    memcpy(C, Ca, rww * sizeof(MRSESDataType));
155
    spe_axpy(rww_alloc, 1, Cb, 1, C, 1);
156
    spe_scal(rww_alloc, 0.5, C, 1);
157
158
    int c_step = real_width * (width + 1);
159
160
    for (j = 0; j < at_once; j++) {
161
	int offset = j * c_step;
162
	MRSESDataType *C_cur = C + offset;
163
	MRSESDataType *Ca_cur = Ca + offset;
164
	MRSESDataType *Cb_cur = Cb + offset;
165
	MRSESDataType *mean_cur = mean + j * real_width;
166
	MRSESDataType result;
167
168
169
	err = atlas_spotrf_u(real_width, C_cur, width);
170
	if (err) return 1;
171
	err = atlas_spotrf_u(real_width, Ca_cur, width);
172
	if (err) return 1;
173
	err = atlas_spotrf_u(real_width, Cb_cur, width);
174
	if (err) return 1;
175
176
/*
177
178
	atlas_spotrf2(real_width, C_cur, width);
179
	atlas_spotrf2(real_width, Ca_cur, width);
180
	atlas_spotrf2(real_width, Cb_cur, width);
181
182
	lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
183
	if (err) return 1;
184
	lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
185
	if (err) return 1;
186
	lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
187
	if (err) return 1;
188
*/        
189
190
        detC = C_cur[0]; detAB = Ca_cur[0] * Cb_cur[0];
191
	for (i = width + 1; i < c_step; i+= (width+1)) {
192
	    detAB *= (Ca_cur[i] * Cb_cur[i]);
193
	    detC *= C_cur[i];
194
	}
195
196
	rcorr = 2 * logf(detC * detC / detAB);
197
198
199
	blas_trsv(
200
	    CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,
201
	    real_width, C_cur, width, mean_cur, 1
202
	);
203
204
	rmahal = blas_dot(real_width, mean_cur, 1, mean_cur, 1);
205
206
	switch (mrses->dist) {
207
	    case BHATTACHARYYA:
208
		result = rmahal/8 + rcorr/4;
209
	    break;
210
	    case MAHALANOBIS:
211
		result = rmahal;
212
	    break;
213
	    case CORCOR:
214
		result = rcorr;
215
	    break;
216
	    default:
217
		result = 0;
218
	}
219
220
//	printf("SPU result: %e (mahal: %e, corcor: %e)\n", result, rmahal, rcorr);
221
    }
222
223
    return 0;
224
}
225
226
    data->A = (MRSESDataType*)(allocation + pos);
227
    pos += walloc * alloc * sizeof(MRSESDataType);
228
229
    data->B = (MRSESDataType*)(allocation + pos);
230
    pos += walloc * alloc * sizeof(MRSESDataType);
231
232
    data->C = (MRSESDataType*)(allocation + pos);
233
    pos += walloc2 * sizeof(MRSESDataType);
234
235
    data->Ca = (MRSESDataType*)(allocation + pos);
236
    pos += walloc2 * sizeof(MRSESDataType);
237
238
    data->Cb = (MRSESDataType*)(allocation + pos);
239
    pos += walloc2 * sizeof(MRSESDataType);
240
241
//	memset(data->A + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
242
//	memset(data->B + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
243
//	memset(data->C + used_walloc * walloc, 0, (walloc - used_walloc) * walloc * sizeof(MRSESDataType));
244
245
//	memset(data->mean + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));
246
//	memset(data->mean_copy + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));
247
248
249
/*
250
	err = mrses_spe_real_run(
251
	    mrses, cur_result,
252
	    walign, walign2, aA, aB, //width, width2, nA, nB, alloc, 
253
	    A, B, mean_copy, C, Ca, Cb
254
	);
255
*/
256
257
/*	
258
	    err = mrses_spe_real_run(
259
		mrses, result,
260
	        walign, walign2, aA, aB, //width, width2, nA, nB, alloc,
261
		A, B, mean_copy, C, Ca, Cb
262
	    );
263
*/
264
265
266
7. Vectorization tests with various invalid operations
267
/*
268
    vector float one = {1, -1, 1, 1};
269
    vector float some_zero = {0, 1, 1, 0};
270
    vector float zero = {0, 0, 0, 0};
271
    
272
    vector float result;
273
    result = divf4(one, some_zero);
274
    
275
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
276
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
277
278
    result = sqrtf4(result);
279
280
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
281
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
282
283
    result = divf4_fast(one, some_zero);
284
    
285
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
286
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
287
288
    result = sqrtf4_fast(fmaxf4(result, zero));
289
290
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
291
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
292
293
    return 0;
294
*/
295
296
297
8. Debugging vectorized code
298
/*
299
    	int err = atlas_spotrf2(real_width, Ca, width);
300
	if (err) printf("Problem with spotrf\n");
301
	else {
302
	    if ((((unsigned int)D)%16)==0) {
303
	    printf("Cholesky Ca\n");
304
	    PRINT_MATRIX("% 6.4f ", Ca, 16, 5, 5)
305
	    }
306
	}
307
308
309
    float C[16*16];    
310
    memset(C, 0, 16*16 * sizeof(MRSESDataType));
311
    memcpy(C, Ca, rww * sizeof(MRSESDataType));
312
    spe_axpy(16*16, 1, Cb, 1, C, 1);
313
    spe_scal(16*16, 0.5, C, 1);
314
315
316
	puts("=====================================");
317
	int c_step = real_width * (width + 1);
318
	atlas_spotrf2(real_width, Ca, width);
319
	atlas_spotrf2(real_width, Cb, width);
320
	atlas_spotrf2(real_width, C, width);
321
        float detC = C[0]; float detAB = Ca[0] * Cb[0];
322
	for (i = width + 1; i < c_step; i+= (width+1)) {
323
	    detAB *= (Ca[i] * Cb[i]);
324
	    detC *= C[i];
325
	}
326
327
	float rcorr = 2 * logf(detC * detC / detAB);
328
	printf("Real det: %e = %e %e\n", rcorr, detC, detAB);
329
*/
330
331
9. Non-vectorized D-recovery
332
static inline int mrses_spe_real_multiply_recover_old(
333
    MRSESDataType *D,
334
    short int pack, short int at_once,
335
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
336
    MRSESDataType *A, MRSESDataType *B,
337
    MRSESDataType *Ca, MRSESDataType *Cb,
338
    MRSESIntType *drp_gen, MRSESIntType *rst_gen,
339
    MRSESDataType *Ea, MRSESDataType *Eb
340
341
) {
342
    short int i, j, k, l;
343
344
/*
345
    if ((((int)D)%16)==0) {
346
    if (*rst_gen > width) {
347
	printf("Orig\n");
348
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
349
    }
350
    }
351
    if (*rst_gen > width) {
352
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
353
    }
354
    if ((((int)D)%16)==0) {
355
    if (*rst_gen > width) {
356
	printf("Update: %i\n", *drp_gen);
357
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
358
    }}
359
360
    if (*rst_gen > width) {
361
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
362
    }
363
*/
364
/*    if (*rst_gen > width) {
365
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
366
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
367
    }*/
368
369
    short int c_step = width * (walloc + 1);
370
    for (i = 0; i < at_once; ++i) {
371
	short int e_offset = i * walloc;
372
	short int c_offset = i * c_step;
373
374
	short int gen = drp_gen[i];
375
376
	MRSESDataType *Ea_cur = Ea + e_offset;
377
	MRSESDataType *Eb_cur = Eb + e_offset;
378
379
	MRSESDataType *Ca_cur = Ca + c_offset;
380
	MRSESDataType *Cb_cur = Cb + c_offset;
381
382
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
383
	MRSESDataType *Db = Da + d_step;
384
385
//	PRINT_MATRIX("% 6f ", Ea_cur, 0, 1, 5);
386
	
387
	if (rst_gen[i] > width) {
388
	    for (j = 0; j < width; j++) {
389
		if (j < gen) {
390
		    l = j; k = gen;
391
		} else {
392
		    l = gen; k = j;
393
		}
394
		
395
//	        if (fabs(Ca[k*walloc+l] - Ea_cur[j])>0.001) {
396
//		    printf("Restoring (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Ca_cur[k*walloc + l], Ea_cur[j]);
397
//		}
398
		Ca_cur[k * walloc + l] = Ea_cur[j];
399
		Cb_cur[k * walloc + l] = Eb_cur[j];
400
	    }
401
	    
402
	}
403
404
	    // Recovering to current stage
405
    	for (k = 0; k < width; ++k) {
406
	    for (l = 0; l < width; ++l) {
407
		Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
408
		Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
409
	    }
410
	}
411
    }
412
413
/*
414
    float Ta[16*16];
415
    vec_ssyrk_rln_11(ralloc, nA, A, nA, Ta, walloc);
416
    int prob = 0;
417
    for (i = 0; i < 5; i++) {
418
	for (j = 0; j < 5; j++) {
419
	    if (fabs(Ta[i*walloc+j] - Ca[i*walloc+j])>0.00001) {
420
		prob = 1;
421
		break;
422
	    }
423
	}
424
    }
425
    if (prob) {
426
	printf("Restoring: %i, gen: %i\n", *rst_gen, *drp_gen);
427
	printf("Computed\n");
428
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
429
	printf("Should be:\n");
430
	PRINT_MATRIX("% 6.4f", Ta, walloc, 5, 5);
431
	exit(1);
432
    } else {
433
//	printf("pass\n");
434
    }
435
    
436
//    vec_ssyrk_rln_11(ralloc, nB, B, nB, Tb, walloc);
437
438
*/
439
    return 0;
440
}
441