This file contains some ideas which was slower or not comed to the end due
their nature or my inabilities or for time saving purposes.

1. The general idea is to computer full 16x16 matrix until the end. We got
several quadratic matrices along the diagonal with which are the actual 
C matrixes and we padding the rest of diagonal with 1. Everything else is 0.
Then we can compute Cholesky and solve equations on all matrices together
using SPE-optimized blas. However, this approach is twice slower compared 
to computing matrix by matrix using non-optimized blas.

    for (j = 1; j < at_once; j++) {
	int offset = j * width * real_width;
	MRSESDataType *C_cur = C + offset;
	MRSESDataType *Ca_cur = Ca + offset;
	MRSESDataType *Cb_cur = Cb + offset;

	for (i = 0; i < real_width; i++) {
	    memset(C_cur + i * width, 0, j * real_width * sizeof(MRSESDataType));
	}
    }
    for (j = at_once * real_width; j < width; j++) {
	memset(C + j * width, 0, width * sizeof(MRSESDataType));
	C[j * width + j] = 1;
    }


Solving equation can be done with following commands, but I have not managed to
get correct results. Anyway the trsv uses neglectable amount of time. Most 
things are done in matrix multiplication and cholesky decomposition.

    transpose_matrix(width, width, C, width, D, width);
    MRSESDataType *D_cur = D + offset;
    solve_upper_1(real_width, D_cur, width, mean_cur);
	
        // multiple of 16
    strsv_spu_lower(width, C, width, mean);	


2. Another idea was to compute [Ca]*[Cb] as [Ca*Cb]. For that we could 
multiply Ca*Cb and then multiply by transpose of it (Ca*Cb)' to get 
symmetric matrix. Finally we should get the square of determinat.
But, the resulting symmetric matrix is not positive-definite (or for
precision reasons), the cholesky decomposition have failed for it.

    for (i = 0; i < width; i++) {
	for (j = 0; j <= i; j++) {
	    Ca[j * width + i] = Ca[i * width + j];
	    Cb[j * width + i] = Cb[i * width + j];
	}
    }

    memset(Cab, 0, width*width*sizeof(MRSESDataType));
    //sgemm_spu(width, width, width, Ca, Cb, Cab);
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, width, width, width, 1, Ca, width, Cb, width, 0, Cab, width);

    memset(Cab2, 0, width*width*sizeof(MRSESDataType));
    blas_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
    //spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
    for ...
	lapack_potrf(&hmode, &real_width, Cab_cur, &width, err);

        detC = C_cur[0]; detAB = Cab_cur[0];
	for (i = width + 1; i < c_step; i+= (width+1)) {
	    detAB *= Cab_cur[i];
	    detC *= C_cur[i];
	}
	rcorr = logf(detC * detC * detC * detC / detAB);
    }


3. Following code is matrix multiplocation using default (non-SPE) blas. Even
if used with appropriate small matrices it is significantly slower than IBM
code on full matrix.

/*    
    char hmode = 'L';

	//original (do not store C)
    //blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
    //blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);

    blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 50, 1, A, nA, 0, Ca, width);
    blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 40, 1, B, nB, 0, Cb, width);
    
    memcpy(C, Ca, width2 * sizeof(MRSESDataType));
    blas_axpy(width2, 1, Cb, 1, C, 1);
    blas_scal(width2, 0.5, C, 1);
*/

4. Other ways to solve equation system. Both works but a bit slower
/*
	    // Faster 82:
	blas_trsm(
	    CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit,
	    1, real_width, 1, C_cur, width, mean_cur, width //1?
	);

	    // Slower 83-84
	blas_trsm(
	    CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
	    real_width, 1, 1, C_cur, width, mean_cur, 1 //width //1?
	);
*/

5. Other ways to perform cholesky decomposition. The lapack version is surely
working. Curretnly implemented seems to work, but who nows about special cases.

	atlas_spotrf2(real_width, C_cur, width);
	atlas_spotrf2(real_width, Ca_cur, width);
	atlas_spotrf2(real_width, Cb_cur, width);

	lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
	if (err) return 1;
	lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
	if (err) return 1;
	lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
	if (err) return 1;


6. This is fully working, but not vectorized version of code
static inline int mrses_spe_real_run(
    MRSESContext mrses, MRSESDataType *result,
    int width, int width2, int nA, int nB,
    MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean,
    MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb
) {
    int i, j, err;
    
    MRSESDataType detAB, detC;
    MRSESDataType rmahal, rcorr;
    
    short int walloc;

    char hmode = 'U';

    int real_width = 5;
//    int iterate_size = mrses->iterate_size;
    int at_once = max(1, SPE_BLOCK / real_width);
//    int times = iterate_size / at_once;

    int rww = at_once * width * real_width;
    int rww_alloc = calc_alloc(rww, 64);

    memset(Ca, 0, rww * sizeof(MRSESDataType));
    memset(Cb, 0, rww * sizeof(MRSESDataType));

    //PRINT_MATRIX("% 6.4f ", A, 16, 16, 16)

	// that works on multiples of 16
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);

	// thats works on multiples of 64 (16x16 - fine)
    memcpy(C, Ca, rww * sizeof(MRSESDataType));
    spe_axpy(rww_alloc, 1, Cb, 1, C, 1);
    spe_scal(rww_alloc, 0.5, C, 1);

    int c_step = real_width * (width + 1);

    for (j = 0; j < at_once; j++) {
	int offset = j * c_step;
	MRSESDataType *C_cur = C + offset;
	MRSESDataType *Ca_cur = Ca + offset;
	MRSESDataType *Cb_cur = Cb + offset;
	MRSESDataType *mean_cur = mean + j * real_width;
	MRSESDataType result;


	err = atlas_spotrf_u(real_width, C_cur, width);
	if (err) return 1;
	err = atlas_spotrf_u(real_width, Ca_cur, width);
	if (err) return 1;
	err = atlas_spotrf_u(real_width, Cb_cur, width);
	if (err) return 1;

/*

	atlas_spotrf2(real_width, C_cur, width);
	atlas_spotrf2(real_width, Ca_cur, width);
	atlas_spotrf2(real_width, Cb_cur, width);

	lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
	if (err) return 1;
	lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
	if (err) return 1;
	lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
	if (err) return 1;
*/        

        detC = C_cur[0]; detAB = Ca_cur[0] * Cb_cur[0];
	for (i = width + 1; i < c_step; i+= (width+1)) {
	    detAB *= (Ca_cur[i] * Cb_cur[i]);
	    detC *= C_cur[i];
	}

	rcorr = 2 * logf(detC * detC / detAB);


	blas_trsv(
	    CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,
	    real_width, C_cur, width, mean_cur, 1
	);

	rmahal = blas_dot(real_width, mean_cur, 1, mean_cur, 1);

	switch (mrses->dist) {
	    case BHATTACHARYYA:
		result = rmahal/8 + rcorr/4;
	    break;
	    case MAHALANOBIS:
		result = rmahal;
	    break;
	    case CORCOR:
		result = rcorr;
	    break;
	    default:
		result = 0;
	}

//	printf("SPU result: %e (mahal: %e, corcor: %e)\n", result, rmahal, rcorr);
    }

    return 0;
}

    data->A = (MRSESDataType*)(allocation + pos);
    pos += walloc * alloc * sizeof(MRSESDataType);

    data->B = (MRSESDataType*)(allocation + pos);
    pos += walloc * alloc * sizeof(MRSESDataType);

    data->C = (MRSESDataType*)(allocation + pos);
    pos += walloc2 * sizeof(MRSESDataType);

    data->Ca = (MRSESDataType*)(allocation + pos);
    pos += walloc2 * sizeof(MRSESDataType);

    data->Cb = (MRSESDataType*)(allocation + pos);
    pos += walloc2 * sizeof(MRSESDataType);

//	memset(data->A + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
//	memset(data->B + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
//	memset(data->C + used_walloc * walloc, 0, (walloc - used_walloc) * walloc * sizeof(MRSESDataType));

//	memset(data->mean + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));
//	memset(data->mean_copy + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));


/*
	err = mrses_spe_real_run(
	    mrses, cur_result,
	    walign, walign2, aA, aB, //width, width2, nA, nB, alloc, 
	    A, B, mean_copy, C, Ca, Cb
	);
*/

/*	
	    err = mrses_spe_real_run(
		mrses, result,
	        walign, walign2, aA, aB, //width, width2, nA, nB, alloc,
		A, B, mean_copy, C, Ca, Cb
	    );
*/


7. Vectorization tests with various invalid operations
/*
    vector float one = {1, -1, 1, 1};
    vector float some_zero = {0, 1, 1, 0};
    vector float zero = {0, 0, 0, 0};
    
    vector float result;
    result = divf4(one, some_zero);
    
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

    result = sqrtf4(result);

    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

    result = divf4_fast(one, some_zero);
    
    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

    result = sqrtf4_fast(fmaxf4(result, zero));

    printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
    printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));

    return 0;
*/


8. Debugging vectorized code
/*
    	int err = atlas_spotrf2(real_width, Ca, width);
	if (err) printf("Problem with spotrf\n");
	else {
	    if ((((unsigned int)D)%16)==0) {
	    printf("Cholesky Ca\n");
	    PRINT_MATRIX("% 6.4f ", Ca, 16, 5, 5)
	    }
	}


    float C[16*16];    
    memset(C, 0, 16*16 * sizeof(MRSESDataType));
    memcpy(C, Ca, rww * sizeof(MRSESDataType));
    spe_axpy(16*16, 1, Cb, 1, C, 1);
    spe_scal(16*16, 0.5, C, 1);


	puts("=====================================");
	int c_step = real_width * (width + 1);
	atlas_spotrf2(real_width, Ca, width);
	atlas_spotrf2(real_width, Cb, width);
	atlas_spotrf2(real_width, C, width);
        float detC = C[0]; float detAB = Ca[0] * Cb[0];
	for (i = width + 1; i < c_step; i+= (width+1)) {
	    detAB *= (Ca[i] * Cb[i]);
	    detC *= C[i];
	}

	float rcorr = 2 * logf(detC * detC / detAB);
	printf("Real det: %e = %e %e\n", rcorr, detC, detAB);
*/

9. Non-vectorized D-recovery
static inline int mrses_spe_real_multiply_recover_old(
    MRSESDataType *D,
    short int pack, short int at_once,
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
    MRSESDataType *A, MRSESDataType *B,
    MRSESDataType *Ca, MRSESDataType *Cb,
    MRSESIntType *drp_gen, MRSESIntType *rst_gen,
    MRSESDataType *Ea, MRSESDataType *Eb

) {
    short int i, j, k, l;

/*
    if ((((int)D)%16)==0) {
    if (*rst_gen > width) {
	printf("Orig\n");
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
    }
    }
    if (*rst_gen > width) {
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
    }
    if ((((int)D)%16)==0) {
    if (*rst_gen > width) {
	printf("Update: %i\n", *drp_gen);
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
    }}

    if (*rst_gen > width) {
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
    }
*/
/*    if (*rst_gen > width) {
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
    }*/

    short int c_step = width * (walloc + 1);
    for (i = 0; i < at_once; ++i) {
	short int e_offset = i * walloc;
	short int c_offset = i * c_step;

	short int gen = drp_gen[i];

	MRSESDataType *Ea_cur = Ea + e_offset;
	MRSESDataType *Eb_cur = Eb + e_offset;

	MRSESDataType *Ca_cur = Ca + c_offset;
	MRSESDataType *Cb_cur = Cb + c_offset;

	MRSESDataType *Da = D + (3 * i + 1) * d_step;
	MRSESDataType *Db = Da + d_step;

//	PRINT_MATRIX("% 6f ", Ea_cur, 0, 1, 5);
	
	if (rst_gen[i] > width) {
	    for (j = 0; j < width; j++) {
		if (j < gen) {
		    l = j; k = gen;
		} else {
		    l = gen; k = j;
		}
		
//	        if (fabs(Ca[k*walloc+l] - Ea_cur[j])>0.001) {
//		    printf("Restoring (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Ca_cur[k*walloc + l], Ea_cur[j]);
//		}
		Ca_cur[k * walloc + l] = Ea_cur[j];
		Cb_cur[k * walloc + l] = Eb_cur[j];
	    }
	    
	}

	    // Recovering to current stage
    	for (k = 0; k < width; ++k) {
	    for (l = 0; l < width; ++l) {
		Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
		Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
	    }
	}
    }

/*
    float Ta[16*16];
    vec_ssyrk_rln_11(ralloc, nA, A, nA, Ta, walloc);
    int prob = 0;
    for (i = 0; i < 5; i++) {
	for (j = 0; j < 5; j++) {
	    if (fabs(Ta[i*walloc+j] - Ca[i*walloc+j])>0.00001) {
		prob = 1;
		break;
	    }
	}
    }
    if (prob) {
	printf("Restoring: %i, gen: %i\n", *rst_gen, *drp_gen);
	printf("Computed\n");
	PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
	printf("Should be:\n");
	PRINT_MATRIX("% 6.4f", Ta, walloc, 5, 5);
	exit(1);
    } else {
//	printf("pass\n");
    }
    
//    vec_ssyrk_rln_11(ralloc, nB, B, nB, Tb, walloc);

*/
    return 0;
}