/*
 * /ani/mrses
 *
 * To get this branch, use:
 *   bzr branch http://darksoft.org/webbzr/ani/mrses
 *
 * Revision 1 by Suren A. Chilingaryan: Initial import
 */
/*
 * Name builders.  The indices are pasted into the identifier, so every
 * element gets its own variable: T(2,1) is T_2_1, R(0,3) is R_0_3 and V(1)
 * is V_1.  Because of the token pasting the indices must be plain integer
 * literals (or macros that expand to one), never run-time expressions.
 */
#define T(i,j) T_ ## i ## _ ## j
#define R(i,j) R_ ## i ## _ ## j
#define V(i) V_ ## i

/*
 * Single-variable declarations.  Each macro expands to one complete
 * declaration INCLUDING the trailing semicolon, so invocations are written
 * without one and can be chained back to back.
 *
 *   DECLARE_T(i, j, A)  declares T(i,j), initialised from *(A + i*lda + j).
 *                       `lda` is not a parameter: a variable of that name
 *                       must be in scope where the macro is expanded.
 *   DECLARE_R(i, j)     declares R(i,j), initialised with
 *                       spu_splats((float)0.0).
 *   DECLARE_V(i, A)     declares V(i), initialised from *(A + i).
 *
 * All variables are `register vector float`; A is presumably a pointer to
 * `vector float` (confirm against the call sites).
 */
#define DECLARE_T(i, j, A) register vector float T(i,j) = *((A) + (i) * lda + (j));
#define DECLARE_R(i, j) register vector float R(i,j) = spu_splats((float)0.0);
#define DECLARE_V(i, A) register vector float V(i) = *((A) + (i));

/*
 * Row helpers: DECLARE_TX(i, A) declares T(i,0)..T(i,3) (four consecutive
 * elements of row i of A), DECLARE_VX(A) declares V(0)..V(3) (the first
 * four elements of A).
 */
#define DECLARE_TX(i, A) DECLARE_T(i, 0, A) DECLARE_T(i, 1, A) DECLARE_T(i, 2, A) DECLARE_T(i, 3, A)
#define DECLARE_VX(A) DECLARE_V(0, A) DECLARE_V(1, A) DECLARE_V(2, A) DECLARE_V(3, A)
/*
 * DECLARE_Tn(A): declare the input variables T(i,0)..T(i,3) for the rows
 * i = 0..n-1.  Every size is the previous size plus one more DECLARE_TX row;
 * A is forwarded unchanged.
 */
#define DECLARE_T1(A) DECLARE_TX(0, A)
#define DECLARE_T2(A) DECLARE_T1(A) DECLARE_TX(1, A)
#define DECLARE_T3(A) DECLARE_T2(A) DECLARE_TX(2, A)
#define DECLARE_T4(A) DECLARE_T3(A) DECLARE_TX(3, A)
#define DECLARE_T5(A) DECLARE_T4(A) DECLARE_TX(4, A)
#define DECLARE_T6(A) DECLARE_T5(A) DECLARE_TX(5, A)
#define DECLARE_T7(A) DECLARE_T6(A) DECLARE_TX(6, A)
#define DECLARE_T8(A) DECLARE_T7(A) DECLARE_TX(7, A)
#define DECLARE_T9(A) DECLARE_T8(A) DECLARE_TX(8, A)
#define DECLARE_T10(A) DECLARE_T9(A) DECLARE_TX(9, A)
#define DECLARE_T11(A) DECLARE_T10(A) DECLARE_TX(10, A)
#define DECLARE_T12(A) DECLARE_T11(A) DECLARE_TX(11, A)
#define DECLARE_T13(A) DECLARE_T12(A) DECLARE_TX(12, A)
#define DECLARE_T14(A) DECLARE_T13(A) DECLARE_TX(13, A)
#define DECLARE_T15(A) DECLARE_T14(A) DECLARE_TX(14, A)
#define DECLARE_T16(A) DECLARE_T15(A) DECLARE_TX(15, A)

/*
 * DECLARE_VRn: declare the n accumulators R(0,0), R(1,0), .., R(n-1,0).
 * DECLARE_VRn and DECLARE_Rn both declare R(0,0), so only one of the two
 * families can be expanded in a given scope.
 */
#define DECLARE_VR1 DECLARE_R(0,0)
#define DECLARE_VR2 DECLARE_VR1 DECLARE_R(1,0)
#define DECLARE_VR3 DECLARE_VR2 DECLARE_R(2,0)
#define DECLARE_VR4 DECLARE_VR3 DECLARE_R(3,0)
#define DECLARE_VR5 DECLARE_VR4 DECLARE_R(4,0)
#define DECLARE_VR6 DECLARE_VR5 DECLARE_R(5,0)
#define DECLARE_VR7 DECLARE_VR6 DECLARE_R(6,0)
#define DECLARE_VR8 DECLARE_VR7 DECLARE_R(7,0)
#define DECLARE_VR9 DECLARE_VR8 DECLARE_R(8,0)
#define DECLARE_VR10 DECLARE_VR9 DECLARE_R(9,0)
#define DECLARE_VR11 DECLARE_VR10 DECLARE_R(10,0)
#define DECLARE_VR12 DECLARE_VR11 DECLARE_R(11,0)
#define DECLARE_VR13 DECLARE_VR12 DECLARE_R(12,0)
#define DECLARE_VR14 DECLARE_VR13 DECLARE_R(13,0)
#define DECLARE_VR15 DECLARE_VR14 DECLARE_R(14,0)
#define DECLARE_VR16 DECLARE_VR15 DECLARE_R(15,0)

/*
 * DECLARE_Rn: declare one accumulator R(i,j) for every pair
 * 0 <= i <= j <= n-1 (a triangle including the diagonal).  Size n is size
 * n-1 plus the n accumulators R(0,n-1)..R(n-1,n-1).
 */
#define DECLARE_R1 DECLARE_R(0,0)
#define DECLARE_R2 DECLARE_R1 DECLARE_R(0, 1) DECLARE_R(1, 1)
#define DECLARE_R3 DECLARE_R2 DECLARE_R(0, 2) DECLARE_R(1, 2) DECLARE_R(2, 2)
#define DECLARE_R4 DECLARE_R3 DECLARE_R(0, 3) DECLARE_R(1, 3) DECLARE_R(2, 3) DECLARE_R(3, 3)
#define DECLARE_R5 DECLARE_R4 \
    DECLARE_R(0, 4) DECLARE_R(1, 4) DECLARE_R(2, 4) DECLARE_R(3, 4) DECLARE_R(4, 4)
#define DECLARE_R6 DECLARE_R5 \
    DECLARE_R(0, 5) DECLARE_R(1, 5) DECLARE_R(2, 5) DECLARE_R(3, 5) DECLARE_R(4, 5) DECLARE_R(5, 5)
#define DECLARE_R7 DECLARE_R6 \
    DECLARE_R(0, 6) DECLARE_R(1, 6) DECLARE_R(2, 6) DECLARE_R(3, 6) \
    DECLARE_R(4, 6) DECLARE_R(5, 6) DECLARE_R(6, 6)
#define DECLARE_R8 DECLARE_R7 \
    DECLARE_R(0, 7) DECLARE_R(1, 7) DECLARE_R(2, 7) DECLARE_R(3, 7) \
    DECLARE_R(4, 7) DECLARE_R(5, 7) DECLARE_R(6, 7) DECLARE_R(7, 7)
#define DECLARE_R9 DECLARE_R8 \
    DECLARE_R(0, 8) DECLARE_R(1, 8) DECLARE_R(2, 8) DECLARE_R(3, 8) \
    DECLARE_R(4, 8) DECLARE_R(5, 8) DECLARE_R(6, 8) DECLARE_R(7, 8) \
    DECLARE_R(8, 8)
#define DECLARE_R10 DECLARE_R9 \
    DECLARE_R(0, 9) DECLARE_R(1, 9) DECLARE_R(2, 9) DECLARE_R(3, 9) \
    DECLARE_R(4, 9) DECLARE_R(5, 9) DECLARE_R(6, 9) DECLARE_R(7, 9) \
    DECLARE_R(8, 9) DECLARE_R(9, 9)
#define DECLARE_R11 DECLARE_R10 \
    DECLARE_R(0, 10) DECLARE_R(1, 10) DECLARE_R(2, 10) DECLARE_R(3, 10) \
    DECLARE_R(4, 10) DECLARE_R(5, 10) DECLARE_R(6, 10) DECLARE_R(7, 10) \
    DECLARE_R(8, 10) DECLARE_R(9, 10) DECLARE_R(10, 10)
#define DECLARE_R12 DECLARE_R11 \
    DECLARE_R(0, 11) DECLARE_R(1, 11) DECLARE_R(2, 11) DECLARE_R(3, 11) \
    DECLARE_R(4, 11) DECLARE_R(5, 11) DECLARE_R(6, 11) DECLARE_R(7, 11) \
    DECLARE_R(8, 11) DECLARE_R(9, 11) DECLARE_R(10, 11) DECLARE_R(11, 11)
#define DECLARE_R13 DECLARE_R12 \
    DECLARE_R(0, 12) DECLARE_R(1, 12) DECLARE_R(2, 12) DECLARE_R(3, 12) \
    DECLARE_R(4, 12) DECLARE_R(5, 12) DECLARE_R(6, 12) DECLARE_R(7, 12) \
    DECLARE_R(8, 12) DECLARE_R(9, 12) DECLARE_R(10, 12) DECLARE_R(11, 12) \
    DECLARE_R(12, 12)
#define DECLARE_R14 DECLARE_R13 \
    DECLARE_R(0, 13) DECLARE_R(1, 13) DECLARE_R(2, 13) DECLARE_R(3, 13) \
    DECLARE_R(4, 13) DECLARE_R(5, 13) DECLARE_R(6, 13) DECLARE_R(7, 13) \
    DECLARE_R(8, 13) DECLARE_R(9, 13) DECLARE_R(10, 13) DECLARE_R(11, 13) \
    DECLARE_R(12, 13) DECLARE_R(13, 13)
#define DECLARE_R15 DECLARE_R14 \
    DECLARE_R(0, 14) DECLARE_R(1, 14) DECLARE_R(2, 14) DECLARE_R(3, 14) \
    DECLARE_R(4, 14) DECLARE_R(5, 14) DECLARE_R(6, 14) DECLARE_R(7, 14) \
    DECLARE_R(8, 14) DECLARE_R(9, 14) DECLARE_R(10, 14) DECLARE_R(11, 14) \
    DECLARE_R(12, 14) DECLARE_R(13, 14) DECLARE_R(14, 14)
#define DECLARE_R16 DECLARE_R15 \
    DECLARE_R(0, 15) DECLARE_R(1, 15) DECLARE_R(2, 15) DECLARE_R(3, 15) \
    DECLARE_R(4, 15) DECLARE_R(5, 15) DECLARE_R(6, 15) DECLARE_R(7, 15) \
    DECLARE_R(8, 15) DECLARE_R(9, 15) DECLARE_R(10, 15) DECLARE_R(11, 15) \
    DECLARE_R(12, 15) DECLARE_R(13, 15) DECLARE_R(14, 15) DECLARE_R(15, 15)
/*
 * Multiply-add building blocks (both are expressions, no trailing ';'):
 *
 *   COMPUTE_T(i, j, l, var)  spu_madd(T(i,l), T(j,l), var)
 *   COMPUTE_V(A, l, k, var)  spu_madd(V(k), A[l*lda + k], var)
 *
 * `lda` is taken from the expansion site.  A is parenthesized before it is
 * indexed so that an expression argument such as `base + 4` indexes the
 * whole expression, matching how the DECLARE_* macros treat A.
 */
#define COMPUTE_T(i, j, l, var) spu_madd(T(i,l), T(j,l), var)
#define COMPUTE_V(A, l, k, var) spu_madd(V(k), (A)[(l) * lda + (k)], var)

/*
 * Accumulating statements (these DO carry their trailing ';'):
 *
 *   COMPUTE_TX(i, j, C)  R(i,j) += T(i,0)*T(j,0) + .. + T(i,3)*T(j,3),
 *                        evaluated as four nested spu_madd calls starting
 *                        from index 0.  C is accepted but not used; it keeps
 *                        the signature parallel to SAVE_TX.
 *   COMPUTE_VX(A, l)     R(l,0) += V(0)*A[l*lda] + .. + V(3)*A[l*lda + 3],
 *                        same nesting order.
 *
 * "+=" and "*" above describe spu_madd(a, b, c) as a*b + c, which is the
 * usual meaning of that SPU intrinsic (see the SPU language extensions).
 */
#define COMPUTE_TX(i, j, C) R(i,j) = COMPUTE_T(i, j, 3, COMPUTE_T(i, j, 2, COMPUTE_T(i, j, 1, spu_madd(T(i,0), T(j,0), R(i,j)))));
#define COMPUTE_VX(A, l) R(l,0) = COMPUTE_V(A, l, 3, COMPUTE_V(A, l, 2, COMPUTE_V(A, l, 1, spu_madd(V(0), (A)[(l) * lda], R(l,0)))));

/*
 * SAVE_TX(i, j, C): store sum_across_float4(R(i,j)) to C[i*ldc + j].
 * `ldc` and sum_across_float4 must be available at the expansion site.
 */
#define SAVE_TX(i, j, C) *((C) + (i) * ldc + (j)) = sum_across_float4(R(i,j));
/*
 * COMPUTE_Tn(C): run COMPUTE_TX(i, j, C) once for every pair
 * 0 <= i <= j <= n-1.  Size n is size n-1 followed by the diagonal pair
 * (n-1,n-1) and then the pairs (0,n-1)..(n-2,n-1), in that order.
 */
#define COMPUTE_T1(C) COMPUTE_TX(0, 0, C)
#define COMPUTE_T2(C) COMPUTE_T1(C) COMPUTE_TX(1,1,C) COMPUTE_TX(0,1,C)
#define COMPUTE_T3(C) COMPUTE_T2(C) COMPUTE_TX(2,2,C) COMPUTE_TX(0,2,C) COMPUTE_TX(1,2,C)
#define COMPUTE_T4(C) COMPUTE_T3(C) COMPUTE_TX(3,3,C) \
    COMPUTE_TX(0,3,C) COMPUTE_TX(1,3,C) COMPUTE_TX(2,3,C)
#define COMPUTE_T5(C) COMPUTE_T4(C) COMPUTE_TX(4,4,C) \
    COMPUTE_TX(0,4,C) COMPUTE_TX(1,4,C) COMPUTE_TX(2,4,C) COMPUTE_TX(3,4,C)
#define COMPUTE_T6(C) COMPUTE_T5(C) COMPUTE_TX(5,5,C) \
    COMPUTE_TX(0,5,C) COMPUTE_TX(1,5,C) COMPUTE_TX(2,5,C) COMPUTE_TX(3,5,C) \
    COMPUTE_TX(4,5,C)
#define COMPUTE_T7(C) COMPUTE_T6(C) COMPUTE_TX(6,6,C) \
    COMPUTE_TX(0,6,C) COMPUTE_TX(1,6,C) COMPUTE_TX(2,6,C) COMPUTE_TX(3,6,C) \
    COMPUTE_TX(4,6,C) COMPUTE_TX(5,6,C)
#define COMPUTE_T8(C) COMPUTE_T7(C) COMPUTE_TX(7,7,C) \
    COMPUTE_TX(0,7,C) COMPUTE_TX(1,7,C) COMPUTE_TX(2,7,C) COMPUTE_TX(3,7,C) \
    COMPUTE_TX(4,7,C) COMPUTE_TX(5,7,C) COMPUTE_TX(6,7,C)
#define COMPUTE_T9(C) COMPUTE_T8(C) COMPUTE_TX(8,8,C) \
    COMPUTE_TX(0,8,C) COMPUTE_TX(1,8,C) COMPUTE_TX(2,8,C) COMPUTE_TX(3,8,C) \
    COMPUTE_TX(4,8,C) COMPUTE_TX(5,8,C) COMPUTE_TX(6,8,C) COMPUTE_TX(7,8,C)
#define COMPUTE_T10(C) COMPUTE_T9(C) COMPUTE_TX(9,9,C) \
    COMPUTE_TX(0,9,C) COMPUTE_TX(1,9,C) COMPUTE_TX(2,9,C) COMPUTE_TX(3,9,C) \
    COMPUTE_TX(4,9,C) COMPUTE_TX(5,9,C) COMPUTE_TX(6,9,C) COMPUTE_TX(7,9,C) \
    COMPUTE_TX(8,9,C)
#define COMPUTE_T11(C) COMPUTE_T10(C) COMPUTE_TX(10,10,C) \
    COMPUTE_TX(0,10,C) COMPUTE_TX(1,10,C) COMPUTE_TX(2,10,C) COMPUTE_TX(3,10,C) \
    COMPUTE_TX(4,10,C) COMPUTE_TX(5,10,C) COMPUTE_TX(6,10,C) COMPUTE_TX(7,10,C) \
    COMPUTE_TX(8,10,C) COMPUTE_TX(9,10,C)
#define COMPUTE_T12(C) COMPUTE_T11(C) COMPUTE_TX(11,11,C) \
    COMPUTE_TX(0,11,C) COMPUTE_TX(1,11,C) COMPUTE_TX(2,11,C) COMPUTE_TX(3,11,C) \
    COMPUTE_TX(4,11,C) COMPUTE_TX(5,11,C) COMPUTE_TX(6,11,C) COMPUTE_TX(7,11,C) \
    COMPUTE_TX(8,11,C) COMPUTE_TX(9,11,C) COMPUTE_TX(10,11,C)
#define COMPUTE_T13(C) COMPUTE_T12(C) COMPUTE_TX(12,12,C) \
    COMPUTE_TX(0,12,C) COMPUTE_TX(1,12,C) COMPUTE_TX(2,12,C) COMPUTE_TX(3,12,C) \
    COMPUTE_TX(4,12,C) COMPUTE_TX(5,12,C) COMPUTE_TX(6,12,C) COMPUTE_TX(7,12,C) \
    COMPUTE_TX(8,12,C) COMPUTE_TX(9,12,C) COMPUTE_TX(10,12,C) COMPUTE_TX(11,12,C)
#define COMPUTE_T14(C) COMPUTE_T13(C) COMPUTE_TX(13,13,C) \
    COMPUTE_TX(0,13,C) COMPUTE_TX(1,13,C) COMPUTE_TX(2,13,C) COMPUTE_TX(3,13,C) \
    COMPUTE_TX(4,13,C) COMPUTE_TX(5,13,C) COMPUTE_TX(6,13,C) COMPUTE_TX(7,13,C) \
    COMPUTE_TX(8,13,C) COMPUTE_TX(9,13,C) COMPUTE_TX(10,13,C) COMPUTE_TX(11,13,C) \
    COMPUTE_TX(12,13,C)
#define COMPUTE_T15(C) COMPUTE_T14(C) COMPUTE_TX(14,14,C) \
    COMPUTE_TX(0,14,C) COMPUTE_TX(1,14,C) COMPUTE_TX(2,14,C) COMPUTE_TX(3,14,C) \
    COMPUTE_TX(4,14,C) COMPUTE_TX(5,14,C) COMPUTE_TX(6,14,C) COMPUTE_TX(7,14,C) \
    COMPUTE_TX(8,14,C) COMPUTE_TX(9,14,C) COMPUTE_TX(10,14,C) COMPUTE_TX(11,14,C) \
    COMPUTE_TX(12,14,C) COMPUTE_TX(13,14,C)
#define COMPUTE_T16(C) COMPUTE_T15(C) COMPUTE_TX(15,15,C) \
    COMPUTE_TX(0,15,C) COMPUTE_TX(1,15,C) COMPUTE_TX(2,15,C) COMPUTE_TX(3,15,C) \
    COMPUTE_TX(4,15,C) COMPUTE_TX(5,15,C) COMPUTE_TX(6,15,C) COMPUTE_TX(7,15,C) \
    COMPUTE_TX(8,15,C) COMPUTE_TX(9,15,C) COMPUTE_TX(10,15,C) COMPUTE_TX(11,15,C) \
    COMPUTE_TX(12,15,C) COMPUTE_TX(13,15,C) COMPUTE_TX(14,15,C)

/*
 * COMPUTE_Vn(A): run COMPUTE_VX(A, l) for l = 0..n-1, in increasing order.
 */
#define COMPUTE_V1(A) COMPUTE_VX(A, 0)
#define COMPUTE_V2(A) COMPUTE_V1(A) COMPUTE_VX(A,1)
#define COMPUTE_V3(A) COMPUTE_V2(A) COMPUTE_VX(A,2)
#define COMPUTE_V4(A) COMPUTE_V3(A) COMPUTE_VX(A,3)
#define COMPUTE_V5(A) COMPUTE_V4(A) COMPUTE_VX(A,4)
#define COMPUTE_V6(A) COMPUTE_V5(A) COMPUTE_VX(A,5)
#define COMPUTE_V7(A) COMPUTE_V6(A) COMPUTE_VX(A,6)
#define COMPUTE_V8(A) COMPUTE_V7(A) COMPUTE_VX(A,7)
#define COMPUTE_V9(A) COMPUTE_V8(A) COMPUTE_VX(A,8)
#define COMPUTE_V10(A) COMPUTE_V9(A) COMPUTE_VX(A,9)
#define COMPUTE_V11(A) COMPUTE_V10(A) COMPUTE_VX(A,10)
#define COMPUTE_V12(A) COMPUTE_V11(A) COMPUTE_VX(A,11)
#define COMPUTE_V13(A) COMPUTE_V12(A) COMPUTE_VX(A,12)
#define COMPUTE_V14(A) COMPUTE_V13(A) COMPUTE_VX(A,13)
#define COMPUTE_V15(A) COMPUTE_V14(A) COMPUTE_VX(A,14)
#define COMPUTE_V16(A) COMPUTE_V15(A) COMPUTE_VX(A,15)

/*
 * SAVE_Tn(C): run SAVE_TX(i, j, C) for the same pairs, in the same order,
 * as COMPUTE_Tn.
 */
#define SAVE_T1(C) SAVE_TX(0, 0, C)
#define SAVE_T2(C) SAVE_T1(C) SAVE_TX(1,1,C) SAVE_TX(0,1,C)
#define SAVE_T3(C) SAVE_T2(C) SAVE_TX(2,2,C) SAVE_TX(0,2,C) SAVE_TX(1,2,C)
#define SAVE_T4(C) SAVE_T3(C) SAVE_TX(3,3,C) \
    SAVE_TX(0,3,C) SAVE_TX(1,3,C) SAVE_TX(2,3,C)
#define SAVE_T5(C) SAVE_T4(C) SAVE_TX(4,4,C) \
    SAVE_TX(0,4,C) SAVE_TX(1,4,C) SAVE_TX(2,4,C) SAVE_TX(3,4,C)
#define SAVE_T6(C) SAVE_T5(C) SAVE_TX(5,5,C) \
    SAVE_TX(0,5,C) SAVE_TX(1,5,C) SAVE_TX(2,5,C) SAVE_TX(3,5,C) \
    SAVE_TX(4,5,C)
#define SAVE_T7(C) SAVE_T6(C) SAVE_TX(6,6,C) \
    SAVE_TX(0,6,C) SAVE_TX(1,6,C) SAVE_TX(2,6,C) SAVE_TX(3,6,C) \
    SAVE_TX(4,6,C) SAVE_TX(5,6,C)
#define SAVE_T8(C) SAVE_T7(C) SAVE_TX(7,7,C) \
    SAVE_TX(0,7,C) SAVE_TX(1,7,C) SAVE_TX(2,7,C) SAVE_TX(3,7,C) \
    SAVE_TX(4,7,C) SAVE_TX(5,7,C) SAVE_TX(6,7,C)
#define SAVE_T9(C) SAVE_T8(C) SAVE_TX(8,8,C) \
    SAVE_TX(0,8,C) SAVE_TX(1,8,C) SAVE_TX(2,8,C) SAVE_TX(3,8,C) \
    SAVE_TX(4,8,C) SAVE_TX(5,8,C) SAVE_TX(6,8,C) SAVE_TX(7,8,C)
#define SAVE_T10(C) SAVE_T9(C) SAVE_TX(9,9,C) \
    SAVE_TX(0,9,C) SAVE_TX(1,9,C) SAVE_TX(2,9,C) SAVE_TX(3,9,C) \
    SAVE_TX(4,9,C) SAVE_TX(5,9,C) SAVE_TX(6,9,C) SAVE_TX(7,9,C) \
    SAVE_TX(8,9,C)
#define SAVE_T11(C) SAVE_T10(C) SAVE_TX(10,10,C) \
    SAVE_TX(0,10,C) SAVE_TX(1,10,C) SAVE_TX(2,10,C) SAVE_TX(3,10,C) \
    SAVE_TX(4,10,C) SAVE_TX(5,10,C) SAVE_TX(6,10,C) SAVE_TX(7,10,C) \
    SAVE_TX(8,10,C) SAVE_TX(9,10,C)
#define SAVE_T12(C) SAVE_T11(C) SAVE_TX(11,11,C) \
    SAVE_TX(0,11,C) SAVE_TX(1,11,C) SAVE_TX(2,11,C) SAVE_TX(3,11,C) \
    SAVE_TX(4,11,C) SAVE_TX(5,11,C) SAVE_TX(6,11,C) SAVE_TX(7,11,C) \
    SAVE_TX(8,11,C) SAVE_TX(9,11,C) SAVE_TX(10,11,C)
#define SAVE_T13(C) SAVE_T12(C) SAVE_TX(12,12,C) \
    SAVE_TX(0,12,C) SAVE_TX(1,12,C) SAVE_TX(2,12,C) SAVE_TX(3,12,C) \
    SAVE_TX(4,12,C) SAVE_TX(5,12,C) SAVE_TX(6,12,C) SAVE_TX(7,12,C) \
    SAVE_TX(8,12,C) SAVE_TX(9,12,C) SAVE_TX(10,12,C) SAVE_TX(11,12,C)
#define SAVE_T14(C) SAVE_T13(C) SAVE_TX(13,13,C) \
    SAVE_TX(0,13,C) SAVE_TX(1,13,C) SAVE_TX(2,13,C) SAVE_TX(3,13,C) \
    SAVE_TX(4,13,C) SAVE_TX(5,13,C) SAVE_TX(6,13,C) SAVE_TX(7,13,C) \
    SAVE_TX(8,13,C) SAVE_TX(9,13,C) SAVE_TX(10,13,C) SAVE_TX(11,13,C) \
    SAVE_TX(12,13,C)
#define SAVE_T15(C) SAVE_T14(C) SAVE_TX(14,14,C) \
    SAVE_TX(0,14,C) SAVE_TX(1,14,C) SAVE_TX(2,14,C) SAVE_TX(3,14,C) \
    SAVE_TX(4,14,C) SAVE_TX(5,14,C) SAVE_TX(6,14,C) SAVE_TX(7,14,C) \
    SAVE_TX(8,14,C) SAVE_TX(9,14,C) SAVE_TX(10,14,C) SAVE_TX(11,14,C) \
    SAVE_TX(12,14,C) SAVE_TX(13,14,C)
#define SAVE_T16(C) SAVE_T15(C) SAVE_TX(15,15,C) \
    SAVE_TX(0,15,C) SAVE_TX(1,15,C) SAVE_TX(2,15,C) SAVE_TX(3,15,C) \
    SAVE_TX(4,15,C) SAVE_TX(5,15,C) SAVE_TX(6,15,C) SAVE_TX(7,15,C) \
    SAVE_TX(8,15,C) SAVE_TX(9,15,C) SAVE_TX(10,15,C) SAVE_TX(11,15,C) \
    SAVE_TX(12,15,C) SAVE_TX(13,15,C) SAVE_TX(14,15,C)
/*
 * SUM_Tn(C) / SUM_Vn(C): reduce the accumulators four at a time and store
 * the results.
 *
 * Everything here relies on names supplied by the expansion site: the HSUM
 * macro/function, the variables `tmp5`, `zero` and `ldc`, and spu_extract.
 * HSUM's definition is not part of this section; from the way its result is
 * consumed, lane k of tmp5 is presumably the reduced value of HSUM's k-th
 * argument -- confirm against HSUM itself.  Unused slots are filled with
 * `zero`.
 *
 * SUM_Tn walks the accumulators R(i,j), 0 <= i <= j <= n-1, in the order
 * (0,0) (0,1) (1,1) (0,2) (1,2) (2,2) (0,3) ... and stores the value for
 * R(i,j) to C[j*ldc + i].  Note that this is the transposed position
 * compared with SAVE_TX, which writes C[i*ldc + j].
 *
 * The walk is cut into groups of four.  Groups that are completely filled
 * are identical for every n that reaches them, so they are written once as
 * SUM_T_QUADk and shared; each SUM_Tn then only adds its own zero-padded
 * tail.  The expansion of every SUM_Tn is the same statement sequence as a
 * fully spelled-out version.
 */

/* Store lane `lane` of tmp5 to C[row*ldc + col]; carries its own ';'. */
#define SUM_T_STORE(C, row, col, lane) *((C) + (row) * ldc + (col)) = spu_extract(tmp5, lane);

/* Completely filled groups 1..16 of the walk described above. */
#define SUM_T_QUAD1(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    SUM_T_STORE(C, 0, 0, 0) SUM_T_STORE(C, 1, 0, 1) SUM_T_STORE(C, 1, 1, 2) SUM_T_STORE(C, 2, 0, 3)
#define SUM_T_QUAD2(C) HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    SUM_T_STORE(C, 2, 1, 0) SUM_T_STORE(C, 2, 2, 1) SUM_T_STORE(C, 3, 0, 2) SUM_T_STORE(C, 3, 1, 3)
#define SUM_T_QUAD3(C) HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    SUM_T_STORE(C, 3, 2, 0) SUM_T_STORE(C, 3, 3, 1) SUM_T_STORE(C, 4, 0, 2) SUM_T_STORE(C, 4, 1, 3)
#define SUM_T_QUAD4(C) HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    SUM_T_STORE(C, 4, 2, 0) SUM_T_STORE(C, 4, 3, 1) SUM_T_STORE(C, 4, 4, 2) SUM_T_STORE(C, 5, 0, 3)
#define SUM_T_QUAD5(C) HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    SUM_T_STORE(C, 5, 1, 0) SUM_T_STORE(C, 5, 2, 1) SUM_T_STORE(C, 5, 3, 2) SUM_T_STORE(C, 5, 4, 3)
#define SUM_T_QUAD6(C) HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    SUM_T_STORE(C, 5, 5, 0) SUM_T_STORE(C, 6, 0, 1) SUM_T_STORE(C, 6, 1, 2) SUM_T_STORE(C, 6, 2, 3)
#define SUM_T_QUAD7(C) HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    SUM_T_STORE(C, 6, 3, 0) SUM_T_STORE(C, 6, 4, 1) SUM_T_STORE(C, 6, 5, 2) SUM_T_STORE(C, 6, 6, 3)
#define SUM_T_QUAD8(C) HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    SUM_T_STORE(C, 7, 0, 0) SUM_T_STORE(C, 7, 1, 1) SUM_T_STORE(C, 7, 2, 2) SUM_T_STORE(C, 7, 3, 3)
#define SUM_T_QUAD9(C) HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    SUM_T_STORE(C, 7, 4, 0) SUM_T_STORE(C, 7, 5, 1) SUM_T_STORE(C, 7, 6, 2) SUM_T_STORE(C, 7, 7, 3)
#define SUM_T_QUAD10(C) HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    SUM_T_STORE(C, 8, 0, 0) SUM_T_STORE(C, 8, 1, 1) SUM_T_STORE(C, 8, 2, 2) SUM_T_STORE(C, 8, 3, 3)
#define SUM_T_QUAD11(C) HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    SUM_T_STORE(C, 8, 4, 0) SUM_T_STORE(C, 8, 5, 1) SUM_T_STORE(C, 8, 6, 2) SUM_T_STORE(C, 8, 7, 3)
#define SUM_T_QUAD12(C) HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    SUM_T_STORE(C, 8, 8, 0) SUM_T_STORE(C, 9, 0, 1) SUM_T_STORE(C, 9, 1, 2) SUM_T_STORE(C, 9, 2, 3)
#define SUM_T_QUAD13(C) HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    SUM_T_STORE(C, 9, 3, 0) SUM_T_STORE(C, 9, 4, 1) SUM_T_STORE(C, 9, 5, 2) SUM_T_STORE(C, 9, 6, 3)
#define SUM_T_QUAD14(C) HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    SUM_T_STORE(C, 9, 7, 0) SUM_T_STORE(C, 9, 8, 1) SUM_T_STORE(C, 9, 9, 2) SUM_T_STORE(C, 10, 0, 3)
#define SUM_T_QUAD15(C) HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    SUM_T_STORE(C, 10, 1, 0) SUM_T_STORE(C, 10, 2, 1) SUM_T_STORE(C, 10, 3, 2) SUM_T_STORE(C, 10, 4, 3)
#define SUM_T_QUAD16(C) HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    SUM_T_STORE(C, 10, 5, 0) SUM_T_STORE(C, 10, 6, 1) SUM_T_STORE(C, 10, 7, 2) SUM_T_STORE(C, 10, 8, 3)

/* n = 1 and n = 2 fit into a single, partly filled group. */
#define SUM_T1(C) HSUM(R(0, 0), zero, zero, zero); \
    SUM_T_STORE(C, 0, 0, 0)
#define SUM_T2(C) HSUM(R(0, 0), R(0, 1), R(1, 1), zero); \
    SUM_T_STORE(C, 0, 0, 0) SUM_T_STORE(C, 1, 0, 1) SUM_T_STORE(C, 1, 1, 2)

/* n = 3..6: shared full groups followed by the zero-padded tail. */
#define SUM_T3(C) SUM_T_QUAD1(C) \
    HSUM(R(1, 2), R(2, 2), zero, zero); \
    SUM_T_STORE(C, 2, 1, 0) SUM_T_STORE(C, 2, 2, 1)
#define SUM_T4(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) \
    HSUM(R(2, 3), R(3, 3), zero, zero); \
    SUM_T_STORE(C, 3, 2, 0) SUM_T_STORE(C, 3, 3, 1)
#define SUM_T5(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) SUM_T_QUAD3(C) \
    HSUM(R(2, 4), R(3, 4), R(4, 4), zero); \
    SUM_T_STORE(C, 4, 2, 0) SUM_T_STORE(C, 4, 3, 1) SUM_T_STORE(C, 4, 4, 2)
#define SUM_T6(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) SUM_T_QUAD3(C) SUM_T_QUAD4(C) SUM_T_QUAD5(C) \
    HSUM(R(5, 5), zero, zero, zero); \
    SUM_T_STORE(C, 5, 5, 0)

/* n = 7 (28 values) and n = 8 (36 values) end exactly on a group boundary. */
#define SUM_T7(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) SUM_T_QUAD3(C) SUM_T_QUAD4(C) \
    SUM_T_QUAD5(C) SUM_T_QUAD6(C) SUM_T_QUAD7(C)
#define SUM_T8(C) SUM_T7(C) SUM_T_QUAD8(C) SUM_T_QUAD9(C)

/* n = 9..11 continue after the 36 values handled by SUM_T8. */
#define SUM_T9(C) SUM_T8(C) SUM_T_QUAD10(C) SUM_T_QUAD11(C) \
    HSUM(R(8, 8), zero, zero, zero); \
    SUM_T_STORE(C, 8, 8, 0)
#define SUM_T10(C) SUM_T8(C) SUM_T_QUAD10(C) SUM_T_QUAD11(C) SUM_T_QUAD12(C) SUM_T_QUAD13(C) \
    HSUM(R(7, 9), R(8, 9), R(9, 9), zero); \
    SUM_T_STORE(C, 9, 7, 0) SUM_T_STORE(C, 9, 8, 1) SUM_T_STORE(C, 9, 9, 2)
#define SUM_T11(C) SUM_T8(C) SUM_T_QUAD10(C) SUM_T_QUAD11(C) SUM_T_QUAD12(C) SUM_T_QUAD13(C) \
    SUM_T_QUAD14(C) SUM_T_QUAD15(C) SUM_T_QUAD16(C) \
    HSUM(R(9, 10), R(10, 10), zero, zero); \
    SUM_T_STORE(C, 10, 9, 0) SUM_T_STORE(C, 10, 10, 1)

/*
 * SUM_Vn(C): reduce R(0,0), R(1,0), .., R(n-1,0) four at a time and store
 * each whole tmp5 result to C[0], C[1], C[2].  C is parenthesized before it
 * is indexed so that an expression argument is indexed as a whole.
 */
#define SUM_V1(C) HSUM(R(0, 0), zero, zero, zero); (C)[0] = tmp5;
#define SUM_V2(C) HSUM(R(0, 0), R(1, 0), zero, zero); (C)[0] = tmp5;
#define SUM_V3(C) HSUM(R(0, 0), R(1, 0), R(2, 0), zero); (C)[0] = tmp5;
#define SUM_V4(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5;
#define SUM_V5(C) SUM_V4(C) HSUM(R(4, 0), zero, zero, zero); (C)[1] = tmp5;
#define SUM_V6(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), zero, zero); (C)[1] = tmp5;
#define SUM_V7(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), R(6, 0), zero); (C)[1] = tmp5;
#define SUM_V8(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5;
#define SUM_V9(C) SUM_V8(C) HSUM(R(8, 0), zero, zero, zero); (C)[2] = tmp5;
#define SUM_V10(C) SUM_V8(C) HSUM(R(8, 0), R(9, 0), zero, zero); (C)[2] = tmp5;
#define SUM_V11(C) SUM_V8(C) HSUM(R(8, 0), R(9, 0), R(10, 0), zero); (C)[2] = tmp5;
#ifndef SUM_T_QUAD
/*
 * SUM_T_QUAD: one full step of the SUM_Tn macros. Hands the four accumulators
 * R(i0,j0) .. R(i3,j3) to HSUM, then stores element k of tmp5 at
 * C[jk * ldc + ik], in the same order the accumulators were listed.
 * Guarded so that each SUM_Tn definition can carry its own copy without a
 * redefinition conflict.
 */
#define SUM_T_QUAD(C, i0, j0, i1, j1, i2, j2, i3, j3) \
    HSUM(R(i0, j0), R(i1, j1), R(i2, j2), R(i3, j3)); \
    *((C) + (j0) * ldc + (i0)) = spu_extract(tmp5,0); \
    *((C) + (j1) * ldc + (i1)) = spu_extract(tmp5,1); \
    *((C) + (j2) * ldc + (i2)) = spu_extract(tmp5,2); \
    *((C) + (j3) * ldc + (i3)) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T12(C): for every pair 0 <= i <= j < 12, stores one scalar derived from
 * the accumulator R(i,j) at C[j * ldc + i] (78 stores in total).
 *
 * The accumulators are walked in the order j = 0..11, i = 0..j and handed to
 * HSUM four at a time; element k of tmp5 is then stored for the k-th operand.
 * The final group has only two live accumulators, so `zero` fills the unused
 * HSUM operands and only two elements are stored.
 *
 * HSUM, tmp5, zero and ldc are not parameters and must be in scope where the
 * macro is expanded. Presumably HSUM leaves one per-operand sum in each
 * element of tmp5 -- confirm against its definition.
 *
 * C is evaluated once per store. The expansion is a sequence of statements
 * (already ';'-terminated), so brace it when it forms the body of an if/loop.
 */
#define SUM_T12(C) \
    SUM_T_QUAD((C), 0,0, 0,1, 1,1, 0,2) \
    SUM_T_QUAD((C), 1,2, 2,2, 0,3, 1,3) \
    SUM_T_QUAD((C), 2,3, 3,3, 0,4, 1,4) \
    SUM_T_QUAD((C), 2,4, 3,4, 4,4, 0,5) \
    SUM_T_QUAD((C), 1,5, 2,5, 3,5, 4,5) \
    SUM_T_QUAD((C), 5,5, 0,6, 1,6, 2,6) \
    SUM_T_QUAD((C), 3,6, 4,6, 5,6, 6,6) \
    SUM_T_QUAD((C), 0,7, 1,7, 2,7, 3,7) \
    SUM_T_QUAD((C), 4,7, 5,7, 6,7, 7,7) \
    SUM_T_QUAD((C), 0,8, 1,8, 2,8, 3,8) \
    SUM_T_QUAD((C), 4,8, 5,8, 6,8, 7,8) \
    SUM_T_QUAD((C), 8,8, 0,9, 1,9, 2,9) \
    SUM_T_QUAD((C), 3,9, 4,9, 5,9, 6,9) \
    SUM_T_QUAD((C), 7,9, 8,9, 9,9, 0,10) \
    SUM_T_QUAD((C), 1,10, 2,10, 3,10, 4,10) \
    SUM_T_QUAD((C), 5,10, 6,10, 7,10, 8,10) \
    SUM_T_QUAD((C), 9,10, 10,10, 0,11, 1,11) \
    SUM_T_QUAD((C), 2,11, 3,11, 4,11, 5,11) \
    SUM_T_QUAD((C), 6,11, 7,11, 8,11, 9,11) \
    HSUM(R(10, 11), R(11, 11), zero, zero); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); \
    *((C) + 11 * ldc + 11) = spu_extract(tmp5,1);
135
/*
 * SUM_V12(C): hands the twelve accumulators R(0,0) .. R(11,0) to HSUM four at
 * a time and stores the resulting tmp5, as a whole, into (C)[0], (C)[1] and
 * (C)[2].
 *
 * HSUM and tmp5 are not parameters and must be in scope where the macro is
 * expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is parenthesised so that expression arguments such as `out + 1` index
 * correctly. The expansion is a sequence of ';'-terminated statements, so
 * brace it when it forms the body of an if/loop.
 */
#define SUM_V12(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5;
136
#ifndef SUM_T_QUAD
/*
 * SUM_T_QUAD: one full step of the SUM_Tn macros. Hands the four accumulators
 * R(i0,j0) .. R(i3,j3) to HSUM, then stores element k of tmp5 at
 * C[jk * ldc + ik], in the same order the accumulators were listed.
 * Guarded so that each SUM_Tn definition can carry its own copy without a
 * redefinition conflict.
 */
#define SUM_T_QUAD(C, i0, j0, i1, j1, i2, j2, i3, j3) \
    HSUM(R(i0, j0), R(i1, j1), R(i2, j2), R(i3, j3)); \
    *((C) + (j0) * ldc + (i0)) = spu_extract(tmp5,0); \
    *((C) + (j1) * ldc + (i1)) = spu_extract(tmp5,1); \
    *((C) + (j2) * ldc + (i2)) = spu_extract(tmp5,2); \
    *((C) + (j3) * ldc + (i3)) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T13(C): for every pair 0 <= i <= j < 13, stores one scalar derived from
 * the accumulator R(i,j) at C[j * ldc + i] (91 stores in total).
 *
 * The accumulators are walked in the order j = 0..12, i = 0..j and handed to
 * HSUM four at a time; element k of tmp5 is then stored for the k-th operand.
 * The final group has only three live accumulators, so `zero` fills the
 * unused HSUM operand and only three elements are stored.
 *
 * HSUM, tmp5, zero and ldc are not parameters and must be in scope where the
 * macro is expanded. Presumably HSUM leaves one per-operand sum in each
 * element of tmp5 -- confirm against its definition.
 *
 * C is evaluated once per store. The expansion is a sequence of statements
 * (already ';'-terminated), so brace it when it forms the body of an if/loop.
 */
#define SUM_T13(C) \
    SUM_T_QUAD((C), 0,0, 0,1, 1,1, 0,2) \
    SUM_T_QUAD((C), 1,2, 2,2, 0,3, 1,3) \
    SUM_T_QUAD((C), 2,3, 3,3, 0,4, 1,4) \
    SUM_T_QUAD((C), 2,4, 3,4, 4,4, 0,5) \
    SUM_T_QUAD((C), 1,5, 2,5, 3,5, 4,5) \
    SUM_T_QUAD((C), 5,5, 0,6, 1,6, 2,6) \
    SUM_T_QUAD((C), 3,6, 4,6, 5,6, 6,6) \
    SUM_T_QUAD((C), 0,7, 1,7, 2,7, 3,7) \
    SUM_T_QUAD((C), 4,7, 5,7, 6,7, 7,7) \
    SUM_T_QUAD((C), 0,8, 1,8, 2,8, 3,8) \
    SUM_T_QUAD((C), 4,8, 5,8, 6,8, 7,8) \
    SUM_T_QUAD((C), 8,8, 0,9, 1,9, 2,9) \
    SUM_T_QUAD((C), 3,9, 4,9, 5,9, 6,9) \
    SUM_T_QUAD((C), 7,9, 8,9, 9,9, 0,10) \
    SUM_T_QUAD((C), 1,10, 2,10, 3,10, 4,10) \
    SUM_T_QUAD((C), 5,10, 6,10, 7,10, 8,10) \
    SUM_T_QUAD((C), 9,10, 10,10, 0,11, 1,11) \
    SUM_T_QUAD((C), 2,11, 3,11, 4,11, 5,11) \
    SUM_T_QUAD((C), 6,11, 7,11, 8,11, 9,11) \
    SUM_T_QUAD((C), 10,11, 11,11, 0,12, 1,12) \
    SUM_T_QUAD((C), 2,12, 3,12, 4,12, 5,12) \
    SUM_T_QUAD((C), 6,12, 7,12, 8,12, 9,12) \
    HSUM(R(10, 12), R(11, 12), R(12, 12), zero); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); \
    *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2);
137
/*
 * SUM_V13(C): hands the thirteen accumulators R(0,0) .. R(12,0) to HSUM four
 * at a time and stores the resulting tmp5, as a whole, into (C)[0] .. (C)[3].
 * The last group has one live accumulator; `zero` fills the remaining three
 * HSUM operands.
 *
 * HSUM, tmp5 and zero are not parameters and must be in scope where the macro
 * is expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is parenthesised so that expression arguments such as `out + 1` index
 * correctly. The expansion is a sequence of ';'-terminated statements, so
 * brace it when it forms the body of an if/loop.
 */
#define SUM_V13(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), zero, zero, zero); (C)[3] = tmp5;
138
#ifndef SUM_T_QUAD
/*
 * SUM_T_QUAD: one full step of the SUM_Tn macros. Hands the four accumulators
 * R(i0,j0) .. R(i3,j3) to HSUM, then stores element k of tmp5 at
 * C[jk * ldc + ik], in the same order the accumulators were listed.
 * Guarded so that each SUM_Tn definition can carry its own copy without a
 * redefinition conflict.
 */
#define SUM_T_QUAD(C, i0, j0, i1, j1, i2, j2, i3, j3) \
    HSUM(R(i0, j0), R(i1, j1), R(i2, j2), R(i3, j3)); \
    *((C) + (j0) * ldc + (i0)) = spu_extract(tmp5,0); \
    *((C) + (j1) * ldc + (i1)) = spu_extract(tmp5,1); \
    *((C) + (j2) * ldc + (i2)) = spu_extract(tmp5,2); \
    *((C) + (j3) * ldc + (i3)) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T14(C): for every pair 0 <= i <= j < 14, stores one scalar derived from
 * the accumulator R(i,j) at C[j * ldc + i] (105 stores in total).
 *
 * The accumulators are walked in the order j = 0..13, i = 0..j and handed to
 * HSUM four at a time; element k of tmp5 is then stored for the k-th operand.
 * The final group has a single live accumulator, so `zero` fills the other
 * three HSUM operands and only one element is stored.
 *
 * HSUM, tmp5, zero and ldc are not parameters and must be in scope where the
 * macro is expanded. Presumably HSUM leaves one per-operand sum in each
 * element of tmp5 -- confirm against its definition.
 *
 * C is evaluated once per store. The expansion is a sequence of statements
 * (already ';'-terminated), so brace it when it forms the body of an if/loop.
 */
#define SUM_T14(C) \
    SUM_T_QUAD((C), 0,0, 0,1, 1,1, 0,2) \
    SUM_T_QUAD((C), 1,2, 2,2, 0,3, 1,3) \
    SUM_T_QUAD((C), 2,3, 3,3, 0,4, 1,4) \
    SUM_T_QUAD((C), 2,4, 3,4, 4,4, 0,5) \
    SUM_T_QUAD((C), 1,5, 2,5, 3,5, 4,5) \
    SUM_T_QUAD((C), 5,5, 0,6, 1,6, 2,6) \
    SUM_T_QUAD((C), 3,6, 4,6, 5,6, 6,6) \
    SUM_T_QUAD((C), 0,7, 1,7, 2,7, 3,7) \
    SUM_T_QUAD((C), 4,7, 5,7, 6,7, 7,7) \
    SUM_T_QUAD((C), 0,8, 1,8, 2,8, 3,8) \
    SUM_T_QUAD((C), 4,8, 5,8, 6,8, 7,8) \
    SUM_T_QUAD((C), 8,8, 0,9, 1,9, 2,9) \
    SUM_T_QUAD((C), 3,9, 4,9, 5,9, 6,9) \
    SUM_T_QUAD((C), 7,9, 8,9, 9,9, 0,10) \
    SUM_T_QUAD((C), 1,10, 2,10, 3,10, 4,10) \
    SUM_T_QUAD((C), 5,10, 6,10, 7,10, 8,10) \
    SUM_T_QUAD((C), 9,10, 10,10, 0,11, 1,11) \
    SUM_T_QUAD((C), 2,11, 3,11, 4,11, 5,11) \
    SUM_T_QUAD((C), 6,11, 7,11, 8,11, 9,11) \
    SUM_T_QUAD((C), 10,11, 11,11, 0,12, 1,12) \
    SUM_T_QUAD((C), 2,12, 3,12, 4,12, 5,12) \
    SUM_T_QUAD((C), 6,12, 7,12, 8,12, 9,12) \
    SUM_T_QUAD((C), 10,12, 11,12, 12,12, 0,13) \
    SUM_T_QUAD((C), 1,13, 2,13, 3,13, 4,13) \
    SUM_T_QUAD((C), 5,13, 6,13, 7,13, 8,13) \
    SUM_T_QUAD((C), 9,13, 10,13, 11,13, 12,13) \
    HSUM(R(13, 13), zero, zero, zero); \
    *((C) + 13 * ldc + 13) = spu_extract(tmp5,0);
139
/*
 * SUM_V14(C): hands the fourteen accumulators R(0,0) .. R(13,0) to HSUM four
 * at a time and stores the resulting tmp5, as a whole, into (C)[0] .. (C)[3].
 * The last group has two live accumulators; `zero` fills the remaining two
 * HSUM operands.
 *
 * HSUM, tmp5 and zero are not parameters and must be in scope where the macro
 * is expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is parenthesised so that expression arguments such as `out + 1` index
 * correctly. The expansion is a sequence of ';'-terminated statements, so
 * brace it when it forms the body of an if/loop.
 */
#define SUM_V14(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), zero, zero); (C)[3] = tmp5;
140
#ifndef SUM_T_QUAD
/*
 * SUM_T_QUAD: one full step of the SUM_Tn macros. Hands the four accumulators
 * R(i0,j0) .. R(i3,j3) to HSUM, then stores element k of tmp5 at
 * C[jk * ldc + ik], in the same order the accumulators were listed.
 * Guarded so that each SUM_Tn definition can carry its own copy without a
 * redefinition conflict.
 */
#define SUM_T_QUAD(C, i0, j0, i1, j1, i2, j2, i3, j3) \
    HSUM(R(i0, j0), R(i1, j1), R(i2, j2), R(i3, j3)); \
    *((C) + (j0) * ldc + (i0)) = spu_extract(tmp5,0); \
    *((C) + (j1) * ldc + (i1)) = spu_extract(tmp5,1); \
    *((C) + (j2) * ldc + (i2)) = spu_extract(tmp5,2); \
    *((C) + (j3) * ldc + (i3)) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T15(C): for every pair 0 <= i <= j < 15, stores one scalar derived from
 * the accumulator R(i,j) at C[j * ldc + i] (120 stores in total).
 *
 * The accumulators are walked in the order j = 0..14, i = 0..j and handed to
 * HSUM four at a time; element k of tmp5 is then stored for the k-th operand.
 * 120 is a multiple of four, so every group is full and no padding is needed.
 *
 * HSUM, tmp5 and ldc are not parameters and must be in scope where the macro
 * is expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is evaluated once per store. The expansion is a sequence of statements
 * (already ';'-terminated), so brace it when it forms the body of an if/loop.
 */
#define SUM_T15(C) \
    SUM_T_QUAD((C), 0,0, 0,1, 1,1, 0,2) \
    SUM_T_QUAD((C), 1,2, 2,2, 0,3, 1,3) \
    SUM_T_QUAD((C), 2,3, 3,3, 0,4, 1,4) \
    SUM_T_QUAD((C), 2,4, 3,4, 4,4, 0,5) \
    SUM_T_QUAD((C), 1,5, 2,5, 3,5, 4,5) \
    SUM_T_QUAD((C), 5,5, 0,6, 1,6, 2,6) \
    SUM_T_QUAD((C), 3,6, 4,6, 5,6, 6,6) \
    SUM_T_QUAD((C), 0,7, 1,7, 2,7, 3,7) \
    SUM_T_QUAD((C), 4,7, 5,7, 6,7, 7,7) \
    SUM_T_QUAD((C), 0,8, 1,8, 2,8, 3,8) \
    SUM_T_QUAD((C), 4,8, 5,8, 6,8, 7,8) \
    SUM_T_QUAD((C), 8,8, 0,9, 1,9, 2,9) \
    SUM_T_QUAD((C), 3,9, 4,9, 5,9, 6,9) \
    SUM_T_QUAD((C), 7,9, 8,9, 9,9, 0,10) \
    SUM_T_QUAD((C), 1,10, 2,10, 3,10, 4,10) \
    SUM_T_QUAD((C), 5,10, 6,10, 7,10, 8,10) \
    SUM_T_QUAD((C), 9,10, 10,10, 0,11, 1,11) \
    SUM_T_QUAD((C), 2,11, 3,11, 4,11, 5,11) \
    SUM_T_QUAD((C), 6,11, 7,11, 8,11, 9,11) \
    SUM_T_QUAD((C), 10,11, 11,11, 0,12, 1,12) \
    SUM_T_QUAD((C), 2,12, 3,12, 4,12, 5,12) \
    SUM_T_QUAD((C), 6,12, 7,12, 8,12, 9,12) \
    SUM_T_QUAD((C), 10,12, 11,12, 12,12, 0,13) \
    SUM_T_QUAD((C), 1,13, 2,13, 3,13, 4,13) \
    SUM_T_QUAD((C), 5,13, 6,13, 7,13, 8,13) \
    SUM_T_QUAD((C), 9,13, 10,13, 11,13, 12,13) \
    SUM_T_QUAD((C), 13,13, 0,14, 1,14, 2,14) \
    SUM_T_QUAD((C), 3,14, 4,14, 5,14, 6,14) \
    SUM_T_QUAD((C), 7,14, 8,14, 9,14, 10,14) \
    SUM_T_QUAD((C), 11,14, 12,14, 13,14, 14,14)
141
/*
 * SUM_V15(C): hands the fifteen accumulators R(0,0) .. R(14,0) to HSUM four
 * at a time and stores the resulting tmp5, as a whole, into (C)[0] .. (C)[3].
 * The last group has three live accumulators; `zero` fills the remaining
 * HSUM operand.
 *
 * HSUM, tmp5 and zero are not parameters and must be in scope where the macro
 * is expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is parenthesised so that expression arguments such as `out + 1` index
 * correctly. The expansion is a sequence of ';'-terminated statements, so
 * brace it when it forms the body of an if/loop.
 */
#define SUM_V15(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), zero); (C)[3] = tmp5;
142
#ifndef SUM_T_QUAD
/*
 * SUM_T_QUAD: one full step of the SUM_Tn macros. Hands the four accumulators
 * R(i0,j0) .. R(i3,j3) to HSUM, then stores element k of tmp5 at
 * C[jk * ldc + ik], in the same order the accumulators were listed.
 * Guarded so that each SUM_Tn definition can carry its own copy without a
 * redefinition conflict.
 */
#define SUM_T_QUAD(C, i0, j0, i1, j1, i2, j2, i3, j3) \
    HSUM(R(i0, j0), R(i1, j1), R(i2, j2), R(i3, j3)); \
    *((C) + (j0) * ldc + (i0)) = spu_extract(tmp5,0); \
    *((C) + (j1) * ldc + (i1)) = spu_extract(tmp5,1); \
    *((C) + (j2) * ldc + (i2)) = spu_extract(tmp5,2); \
    *((C) + (j3) * ldc + (i3)) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T16(C): for every pair 0 <= i <= j < 16, stores one scalar derived from
 * the accumulator R(i,j) at C[j * ldc + i] (136 stores in total).
 *
 * The accumulators are walked in the order j = 0..15, i = 0..j and handed to
 * HSUM four at a time; element k of tmp5 is then stored for the k-th operand.
 * 136 is a multiple of four, so every group is full and no padding is needed.
 *
 * HSUM, tmp5 and ldc are not parameters and must be in scope where the macro
 * is expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is evaluated once per store. The expansion is a sequence of statements
 * (already ';'-terminated), so brace it when it forms the body of an if/loop.
 */
#define SUM_T16(C) \
    SUM_T_QUAD((C), 0,0, 0,1, 1,1, 0,2) \
    SUM_T_QUAD((C), 1,2, 2,2, 0,3, 1,3) \
    SUM_T_QUAD((C), 2,3, 3,3, 0,4, 1,4) \
    SUM_T_QUAD((C), 2,4, 3,4, 4,4, 0,5) \
    SUM_T_QUAD((C), 1,5, 2,5, 3,5, 4,5) \
    SUM_T_QUAD((C), 5,5, 0,6, 1,6, 2,6) \
    SUM_T_QUAD((C), 3,6, 4,6, 5,6, 6,6) \
    SUM_T_QUAD((C), 0,7, 1,7, 2,7, 3,7) \
    SUM_T_QUAD((C), 4,7, 5,7, 6,7, 7,7) \
    SUM_T_QUAD((C), 0,8, 1,8, 2,8, 3,8) \
    SUM_T_QUAD((C), 4,8, 5,8, 6,8, 7,8) \
    SUM_T_QUAD((C), 8,8, 0,9, 1,9, 2,9) \
    SUM_T_QUAD((C), 3,9, 4,9, 5,9, 6,9) \
    SUM_T_QUAD((C), 7,9, 8,9, 9,9, 0,10) \
    SUM_T_QUAD((C), 1,10, 2,10, 3,10, 4,10) \
    SUM_T_QUAD((C), 5,10, 6,10, 7,10, 8,10) \
    SUM_T_QUAD((C), 9,10, 10,10, 0,11, 1,11) \
    SUM_T_QUAD((C), 2,11, 3,11, 4,11, 5,11) \
    SUM_T_QUAD((C), 6,11, 7,11, 8,11, 9,11) \
    SUM_T_QUAD((C), 10,11, 11,11, 0,12, 1,12) \
    SUM_T_QUAD((C), 2,12, 3,12, 4,12, 5,12) \
    SUM_T_QUAD((C), 6,12, 7,12, 8,12, 9,12) \
    SUM_T_QUAD((C), 10,12, 11,12, 12,12, 0,13) \
    SUM_T_QUAD((C), 1,13, 2,13, 3,13, 4,13) \
    SUM_T_QUAD((C), 5,13, 6,13, 7,13, 8,13) \
    SUM_T_QUAD((C), 9,13, 10,13, 11,13, 12,13) \
    SUM_T_QUAD((C), 13,13, 0,14, 1,14, 2,14) \
    SUM_T_QUAD((C), 3,14, 4,14, 5,14, 6,14) \
    SUM_T_QUAD((C), 7,14, 8,14, 9,14, 10,14) \
    SUM_T_QUAD((C), 11,14, 12,14, 13,14, 14,14) \
    SUM_T_QUAD((C), 0,15, 1,15, 2,15, 3,15) \
    SUM_T_QUAD((C), 4,15, 5,15, 6,15, 7,15) \
    SUM_T_QUAD((C), 8,15, 9,15, 10,15, 11,15) \
    SUM_T_QUAD((C), 12,15, 13,15, 14,15, 15,15)
143
/*
 * SUM_V16(C): hands the sixteen accumulators R(0,0) .. R(15,0) to HSUM four
 * at a time and stores the resulting tmp5, as a whole, into (C)[0] .. (C)[3].
 *
 * HSUM and tmp5 are not parameters and must be in scope where the macro is
 * expanded. Presumably HSUM leaves one per-operand sum in each element of
 * tmp5 -- confirm against its definition.
 *
 * C is parenthesised so that expression arguments such as `out + 1` index
 * correctly. The expansion is a sequence of ';'-terminated statements, so
 * brace it when it forms the body of an if/loop.
 */
#define SUM_V16(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), R(15, 0)); (C)[3] = tmp5;