5
# Print the usage line: the script name ($0) followed by the two expected
# arguments <N> and <K>; "(12 4)" is presumably a sample invocation.
# NOTE(review): the condition guarding this message is not visible here.
print("Usage: $0 <N> <K> (12 4)\n");
12
# Open the generated header for writing (truncating any previous copy) and
# abort with the OS error if that fails, so the macro definitions are never
# silently discarded. The three-argument form keeps the mode separate from the
# file name; the bareword handle `out` is kept because the print statements
# throughout this script refer to it by that name.
open(out, '>', 'vec_potrf_mtxmul.h')
    or die "Cannot open vec_potrf_mtxmul.h for writing: $!\n";
14
# Emit the fixed helper macros, one header line each:
#   T(i,j), R(i,j), V(i)  - token-paste the names T_i_j, R_i_j and V_i;
#   DECLARE_T(i, j, A)    - defines T(i,j), initialised from *((A) + i*lda + j);
#   DECLARE_R(i, j)       - defines R(i,j), initialised with spu_splats(0.0);
#   DECLARE_V(i, A)       - defines V(i), initialised from *((A) + i).
for my $macro_line (
    "#define T(i,j) T_ ## i ## _ ## j\n",
    "#define R(i,j) R_ ## i ## _ ## j\n",
    "#define V(i) V_ ## i\n",
    "#define DECLARE_T(i, j, A) register vector float T(i,j) = *((A) + i*lda + j);\n",
    "#define DECLARE_R(i, j) register vector float R(i,j) = spu_splats((float)0.0);\n",
    "#define DECLARE_V(i, A) register vector float V(i) = *((A) + i);\n",
) {
    print out $macro_line;
}
21
# Begin the DECLARE_TX(i, A) and DECLARE_VX(A) macro definitions; the loop
# below extends both strings with one declaration per generated index.
$out = "#define DECLARE_TX(i, A)";
22
$vout = "#define DECLARE_VX(A)";
23
# For every $i in 0 .. $K-1 append DECLARE_T(i, $i, A) -- the literal `i` is
# the macro's own first parameter, $i is the generated second index -- and
# DECLARE_V($i, A).
# NOTE(review): the end of this loop and the statements that write $out and
# $vout to the header are not part of the reviewed excerpt.
for (my $i=0; $i < $K; $i++) {
24
$out .= " DECLARE_T(i, $i, A)";
25
$vout .= " DECLARE_V($i, A)";
30
#$rout = "#define DECLARE_RX(i)";
31
#for (my $i=0; $i < $N/2; $i++) {
32
# $rout .= " DECLARE_R(i, $i)";
36
# Size-1 declaration macros: DECLARE_T1 expands to DECLARE_TX for row 0, while
# DECLARE_R1 and DECLARE_VR1 both expand to DECLARE_R(0,0).
# A DECLARE_V1 definition remains commented out:
#print out "#define DECLARE_V1(A) DECLARE_VX(A)\n";
print out $_ for (
    "#define DECLARE_T1(A) DECLARE_TX(0, A)\n",
    "#define DECLARE_R1 DECLARE_R(0,0)\n",
    "#define DECLARE_VR1 DECLARE_R(0,0)\n",
);
41
# For $i = 1 .. $N-1, define each size-($i+1) declaration macro in terms of
# the size-$i macro plus the declarations that the extra index adds.
for (my $i=1; $i < $N; $i++) {
42
# DECLARE_T<$i+1>(A) = DECLARE_T<$i>(A) followed by DECLARE_TX($i, A).
print out "#define DECLARE_T". ($i+1) ."(A) DECLARE_T$i(A) DECLARE_TX($i, A)\n";
43
# print out "#define DECLARE_V". ($i+1) ."(A) DECLARE_V$i(A) DECLARE_VX(A)\n";
45
# print out "#define DECLARE_R". ($i+1) ." DECLARE_R$i DECLARE_RX($i)\n";
47
# DECLARE_VR<$i+1> = DECLARE_VR<$i> followed by DECLARE_R($i,0).
print out "#define DECLARE_VR". ($i+1) ." DECLARE_VR$i DECLARE_R($i,0)\n";
49
# DECLARE_R<$i+1> starts with DECLARE_R<$i>; no newline is printed here, so
# the inner loop keeps appending to the same header line.
print out "#define DECLARE_R". ($i+1) ." DECLARE_R$i";
50
# Append DECLARE_R($j, $i) for every $j in 0 .. $i.
# NOTE(review): the terminating newline and both closing braces are not part
# of the reviewed excerpt.
for (my $j=0; $j <= $i; $j++) {
51
print out " DECLARE_R($j, $i)";
57
# COMPUTE_T(i, j, l, var) expands to spu_madd(T(i,l), T(j,l), var).
print out "#define COMPUTE_T(i, j, l, var) spu_madd(T(i,l), T(j,l), var)\n";
59
# COMPUTE_V(A, l, k, var) expands to spu_madd(V(k), A[l*lda + k], var).
print out "#define COMPUTE_V(A, l, k, var) spu_madd(V(k), A[l*lda + k], var)\n";
60
#spu_madd(Arow, A[l * lda + k], temp[l]);
63
#$out = "COMPUTE_T(i, j, 1, spu_mul(T(i,0), T(j,0)))";
64
# Seed the nested expressions: the innermost spu_madd handles index 0 and uses
# the R register as its addend; it is wrapped by the COMPUTE_* call for index 1.
# NOTE(review): indices 0 and 1 are always referenced here, whatever the value
# of $K -- confirm that $K >= 2 is guaranteed by the caller.
$out = "COMPUTE_T(i, j, 1, spu_madd(T(i,0), T(j,0), R(i,j)))";
65
$vout = "COMPUTE_V(A, l, 1, spu_madd(V(0), A[l*lda], R(l,0)))";
66
# Wrap the expression built so far in one more COMPUTE_T / COMPUTE_V call for
# each remaining index 2 .. $K-1, so the previous expression becomes `var`.
# NOTE(review): the closing brace of this loop is not part of the reviewed
# excerpt.
for (my $i = 2; $i < $K; $i++) {
67
$out = "COMPUTE_T(i, j, $i, $out)";
68
$vout = "COMPUTE_V(A, l, $i, $vout)";
71
# Publish the expressions accumulated in $out / $vout as the COMPUTE_TX and
# COMPUTE_VX macros (each assigns its expression to the matching R register),
# add SAVE_TX, which stores sum_across_float4(R(i,j)) to *((C) + i * ldc + j),
# and finish with the size-1 COMPUTE_T1 / COMPUTE_V1 / SAVE_T1 macros.
# Alternative definitions left commented out:
#print out "#define COMPUTE_TX(i, j, C) *((C) + i * ldc + j) += sum_across_float4($out);\n";
#print out "#define SAVE_TX(i, j, C) *((C) + i * ldc + j) = spu_extract(R(i,j),0);\n";
foreach my $definition (
    "#define COMPUTE_TX(i, j, C) R(i,j) = $out;\n",
    "#define COMPUTE_VX(A, l) R(l,0) = $vout;\n",
    "#define SAVE_TX(i, j, C) *((C) + i * ldc + j) = sum_across_float4(R(i,j));\n",
    "#define COMPUTE_T1(C) COMPUTE_TX(0, 0, C)\n",
    "#define COMPUTE_V1(A) COMPUTE_VX(A, 0)\n",
    "#define SAVE_T1(C) SAVE_TX(0, 0, C)\n",
) {
    print out $definition;
}
81
# For $i = 1 .. $N-1, build the size-($i+1) COMPUTE_T, COMPUTE_V and SAVE_T
# macro definitions as strings, each starting from the size-$i macro.
for (my $i = 1; $i < $N; $i++) {
82
# Each definition begins with the previous size plus the new ($i,$i) entry
# (or, for COMPUTE_V, the new index $i).
$out = "#define COMPUTE_T".($i+1)."(C) COMPUTE_T$i(C) COMPUTE_TX($i,$i,C)";
83
$vout = "#define COMPUTE_V".($i+1)."(A) COMPUTE_V$i(A) COMPUTE_VX(A,$i)";
84
$sout = "#define SAVE_T".($i+1)."(C) SAVE_T$i(C) SAVE_TX($i,$i,C)";
85
# Add the ($j,$i) entries for every $j below $i; the mirrored ($i,$j) entry
# appears only in the commented-out variant.
# NOTE(review): the closing braces and the statements that print $out, $vout
# and $sout are not part of the reviewed excerpt.
for (my $j = 0; $j < $i; $j++) {
86
# $out .= " COMPUTE_TX($j,$i,C) COMPUTE_TX($i,$j,C)";
87
$out .= " COMPUTE_TX($j,$i,C)";
88
$sout .= " SAVE_TX($j,$i,C)";
103
# For each index $i of @c, append "<$c[$i]> = spu_extract(tmp5,$i);" to $out,
# i.e. one assignment per collected destination, taking element $i of tmp5.
# NOTE(review): $i is not declared in this statement, and the enclosing
# definition is not visible -- presumably this is the body of save(); confirm.
for ($i = 0; $i <= $#c; $i++) {
104
$out.=$c[$i]." = spu_extract(tmp5,$i);";
109
# For $i = 0 .. $N-1, build the SUM_T<$i+1>(C) macro definition in $out.
for (my $i = 0; $i < $N; $i++) {
110
$out = "#define SUM_T" . ($i+1) . "(C)";
114
# Visit every pair ($m, $l) with 0 <= $m <= $l <= $i, collecting the register
# name R($m, $l) in @el and its destination *((C) + $l * ldc + $m) in @c.
for (my $l = 0; $l <= $i; $l++) {
115
for (my $m = 0; $m <= $l; $m++) {
116
push @el, "R($m, $l)";
117
push @c, "*((C) + $l * ldc + $m)";
119
# Emit an HSUM over the first four collected registers, then the text that
# save() returns for the collected registers and destinations.
# NOTE(review): the condition under which this runs, and any resetting of
# @el / @c, are not part of the reviewed excerpt.
$out .= " HSUM($el[0], $el[1], $el[2], $el[3]);";
120
$out .= save(\@el, \@c);
127
# Pad @el with the name "zero" until it holds at least four entries, then emit
# the same HSUM + save() pair -- presumably for a final, partially filled group.
while ($#el < 3) { push @el, "zero"; }
128
$out .= " HSUM($el[0], $el[1], $el[2], $el[3]);";
129
$out .= save(\@el, \@c);
135
# Build the SUM_V<$i+1>(C) macro definition in $out.
# NOTE(review): the loop that supplies $i is not visible in this excerpt.
$out = "#define SUM_V" . ($i+1) . "(C)";
138
# Collect the register names R($l, 0) for $l = 0 .. $i in @el.
for (my $l = 0; $l <= $i; $l++) {
139
push @el, "R($l, 0)";
141
# Emit an HSUM over the first four collected registers and store tmp5 into
# C[$cidx].
# NOTE(review): where $cidx is set and when this group is flushed are not part
# of the reviewed excerpt.
$out .= " HSUM($el[0], $el[1], $el[2], $el[3]);";
142
$out .= " C[$cidx] = tmp5;";
149
# Pad @el with the name "zero" until it holds at least four entries, then emit
# the same HSUM + store pair -- presumably for a final, partially filled group.
while ($#el < 3) { push @el, "zero"; }
150
$out .= " HSUM($el[0], $el[1], $el[2], $el[3]);";
151
$out .= " C[$cidx] = tmp5;";