/*
 * Source: bzr branch http://darksoft.org/webbzr/ani/mrses
 * Revision 1 by Suren A. Chilingaryan: "Initial import"
 */
/*
 * Name-building helpers.  Each one expands, by token pasting, to the
 * identifier of one of the per-element variables used by the macros below:
 *
 *     T(i,j) -> T_i_j      R(i,j) -> R_i_j      V(i) -> V_i
 *
 * The indices are pasted into the name, not evaluated, so they have to be
 * literal tokens (T(1,2) works; T(a+1,2) does not).
 */
#define T(i,j) T_##i##_##j
#define R(i,j) R_##i##_##j
#define V(i) V_##i
/*
 * Variable declarations.
 *
 * Every macro in this group expands to one or more complete declarations
 * (the terminating ';' is part of the expansion), so it is written without
 * a trailing semicolon at the point of use.  `lda` is not a parameter: it is
 * picked up from the scope in which the macro is expanded.
 */

/* Declare T(i,j) and initialise it from element i*lda + j of A. */
#define DECLARE_T(i, j, A) register vector float T(i,j) = *((A) + (i) * lda + (j));

/* Declare R(i,j) and initialise it with spu_splats(0.0f). */
#define DECLARE_R(i, j) register vector float R(i,j) = spu_splats(0.0f);

/* Declare V(i) and initialise it from element i of A. */
#define DECLARE_V(i, A) register vector float V(i) = *((A) + (i));

/* Declare T(i,0) .. T(i,3) from A (elements i*lda + 0 .. i*lda + 3). */
#define DECLARE_TX(i, A) \
    DECLARE_T(i, 0, A) DECLARE_T(i, 1, A) DECLARE_T(i, 2, A) DECLARE_T(i, 3, A)

/* Declare V(0) .. V(3) from the first four elements of A. */
#define DECLARE_VX(A) \
    DECLARE_V(0, A) DECLARE_V(1, A) DECLARE_V(2, A) DECLARE_V(3, A)
/*
 * Declaration ladders for sizes n = 1 .. 16.  Each level expands to the
 * previous level plus the declarations that are new for that size:
 *
 *   DECLARE_Tn(A)  DECLARE_TX(i, A) for i = 0 .. n-1
 *                  (declares T(i,0) .. T(i,3) for every i < n)
 *   DECLARE_Rn     DECLARE_R(i, j) for every pair 0 <= i <= j < n
 *   DECLARE_VRn    DECLARE_R(i, 0) for i = 0 .. n-1
 *
 * DECLARE_Rn and DECLARE_VRn both declare R(0,0), so they cannot be
 * expanded in the same scope.
 */
#define DECLARE_T1(A) DECLARE_TX(0, A)
#define DECLARE_R1 DECLARE_R(0, 0)
#define DECLARE_VR1 DECLARE_R(0, 0)

#define DECLARE_T2(A) DECLARE_T1(A) DECLARE_TX(1, A)
#define DECLARE_VR2 DECLARE_VR1 DECLARE_R(1, 0)
#define DECLARE_R2 DECLARE_R1 DECLARE_R(0, 1) DECLARE_R(1, 1)

#define DECLARE_T3(A) DECLARE_T2(A) DECLARE_TX(2, A)
#define DECLARE_VR3 DECLARE_VR2 DECLARE_R(2, 0)
#define DECLARE_R3 DECLARE_R2 DECLARE_R(0, 2) DECLARE_R(1, 2) DECLARE_R(2, 2)

#define DECLARE_T4(A) DECLARE_T3(A) DECLARE_TX(3, A)
#define DECLARE_VR4 DECLARE_VR3 DECLARE_R(3, 0)
#define DECLARE_R4 DECLARE_R3 \
    DECLARE_R(0, 3) DECLARE_R(1, 3) DECLARE_R(2, 3) DECLARE_R(3, 3)

#define DECLARE_T5(A) DECLARE_T4(A) DECLARE_TX(4, A)
#define DECLARE_VR5 DECLARE_VR4 DECLARE_R(4, 0)
#define DECLARE_R5 DECLARE_R4 \
    DECLARE_R(0, 4) DECLARE_R(1, 4) DECLARE_R(2, 4) DECLARE_R(3, 4) \
    DECLARE_R(4, 4)

#define DECLARE_T6(A) DECLARE_T5(A) DECLARE_TX(5, A)
#define DECLARE_VR6 DECLARE_VR5 DECLARE_R(5, 0)
#define DECLARE_R6 DECLARE_R5 \
    DECLARE_R(0, 5) DECLARE_R(1, 5) DECLARE_R(2, 5) DECLARE_R(3, 5) \
    DECLARE_R(4, 5) DECLARE_R(5, 5)

#define DECLARE_T7(A) DECLARE_T6(A) DECLARE_TX(6, A)
#define DECLARE_VR7 DECLARE_VR6 DECLARE_R(6, 0)
#define DECLARE_R7 DECLARE_R6 \
    DECLARE_R(0, 6) DECLARE_R(1, 6) DECLARE_R(2, 6) DECLARE_R(3, 6) \
    DECLARE_R(4, 6) DECLARE_R(5, 6) DECLARE_R(6, 6)

#define DECLARE_T8(A) DECLARE_T7(A) DECLARE_TX(7, A)
#define DECLARE_VR8 DECLARE_VR7 DECLARE_R(7, 0)
#define DECLARE_R8 DECLARE_R7 \
    DECLARE_R(0, 7) DECLARE_R(1, 7) DECLARE_R(2, 7) DECLARE_R(3, 7) \
    DECLARE_R(4, 7) DECLARE_R(5, 7) DECLARE_R(6, 7) DECLARE_R(7, 7)

#define DECLARE_T9(A) DECLARE_T8(A) DECLARE_TX(8, A)
#define DECLARE_VR9 DECLARE_VR8 DECLARE_R(8, 0)
#define DECLARE_R9 DECLARE_R8 \
    DECLARE_R(0, 8) DECLARE_R(1, 8) DECLARE_R(2, 8) DECLARE_R(3, 8) \
    DECLARE_R(4, 8) DECLARE_R(5, 8) DECLARE_R(6, 8) DECLARE_R(7, 8) \
    DECLARE_R(8, 8)

#define DECLARE_T10(A) DECLARE_T9(A) DECLARE_TX(9, A)
#define DECLARE_VR10 DECLARE_VR9 DECLARE_R(9, 0)
#define DECLARE_R10 DECLARE_R9 \
    DECLARE_R(0, 9) DECLARE_R(1, 9) DECLARE_R(2, 9) DECLARE_R(3, 9) \
    DECLARE_R(4, 9) DECLARE_R(5, 9) DECLARE_R(6, 9) DECLARE_R(7, 9) \
    DECLARE_R(8, 9) DECLARE_R(9, 9)

#define DECLARE_T11(A) DECLARE_T10(A) DECLARE_TX(10, A)
#define DECLARE_VR11 DECLARE_VR10 DECLARE_R(10, 0)
#define DECLARE_R11 DECLARE_R10 \
    DECLARE_R(0, 10) DECLARE_R(1, 10) DECLARE_R(2, 10) DECLARE_R(3, 10) \
    DECLARE_R(4, 10) DECLARE_R(5, 10) DECLARE_R(6, 10) DECLARE_R(7, 10) \
    DECLARE_R(8, 10) DECLARE_R(9, 10) DECLARE_R(10, 10)

#define DECLARE_T12(A) DECLARE_T11(A) DECLARE_TX(11, A)
#define DECLARE_VR12 DECLARE_VR11 DECLARE_R(11, 0)
#define DECLARE_R12 DECLARE_R11 \
    DECLARE_R(0, 11) DECLARE_R(1, 11) DECLARE_R(2, 11) DECLARE_R(3, 11) \
    DECLARE_R(4, 11) DECLARE_R(5, 11) DECLARE_R(6, 11) DECLARE_R(7, 11) \
    DECLARE_R(8, 11) DECLARE_R(9, 11) DECLARE_R(10, 11) DECLARE_R(11, 11)

#define DECLARE_T13(A) DECLARE_T12(A) DECLARE_TX(12, A)
#define DECLARE_VR13 DECLARE_VR12 DECLARE_R(12, 0)
#define DECLARE_R13 DECLARE_R12 \
    DECLARE_R(0, 12) DECLARE_R(1, 12) DECLARE_R(2, 12) DECLARE_R(3, 12) \
    DECLARE_R(4, 12) DECLARE_R(5, 12) DECLARE_R(6, 12) DECLARE_R(7, 12) \
    DECLARE_R(8, 12) DECLARE_R(9, 12) DECLARE_R(10, 12) DECLARE_R(11, 12) \
    DECLARE_R(12, 12)

#define DECLARE_T14(A) DECLARE_T13(A) DECLARE_TX(13, A)
#define DECLARE_VR14 DECLARE_VR13 DECLARE_R(13, 0)
#define DECLARE_R14 DECLARE_R13 \
    DECLARE_R(0, 13) DECLARE_R(1, 13) DECLARE_R(2, 13) DECLARE_R(3, 13) \
    DECLARE_R(4, 13) DECLARE_R(5, 13) DECLARE_R(6, 13) DECLARE_R(7, 13) \
    DECLARE_R(8, 13) DECLARE_R(9, 13) DECLARE_R(10, 13) DECLARE_R(11, 13) \
    DECLARE_R(12, 13) DECLARE_R(13, 13)

#define DECLARE_T15(A) DECLARE_T14(A) DECLARE_TX(14, A)
#define DECLARE_VR15 DECLARE_VR14 DECLARE_R(14, 0)
#define DECLARE_R15 DECLARE_R14 \
    DECLARE_R(0, 14) DECLARE_R(1, 14) DECLARE_R(2, 14) DECLARE_R(3, 14) \
    DECLARE_R(4, 14) DECLARE_R(5, 14) DECLARE_R(6, 14) DECLARE_R(7, 14) \
    DECLARE_R(8, 14) DECLARE_R(9, 14) DECLARE_R(10, 14) DECLARE_R(11, 14) \
    DECLARE_R(12, 14) DECLARE_R(13, 14) DECLARE_R(14, 14)

#define DECLARE_T16(A) DECLARE_T15(A) DECLARE_TX(15, A)
#define DECLARE_VR16 DECLARE_VR15 DECLARE_R(15, 0)
#define DECLARE_R16 DECLARE_R15 \
    DECLARE_R(0, 15) DECLARE_R(1, 15) DECLARE_R(2, 15) DECLARE_R(3, 15) \
    DECLARE_R(4, 15) DECLARE_R(5, 15) DECLARE_R(6, 15) DECLARE_R(7, 15) \
    DECLARE_R(8, 15) DECLARE_R(9, 15) DECLARE_R(10, 15) DECLARE_R(11, 15) \
    DECLARE_R(12, 15) DECLARE_R(13, 15) DECLARE_R(14, 15) DECLARE_R(15, 15)
/*
 * Single accumulation / store steps.  `lda` and `ldc` are taken from the
 * scope in which the macros are expanded.
 */

/* Expression: spu_madd(T(i,l), T(j,l), var), i.e. one multiply-add step. */
#define COMPUTE_T(i, j, l, var) spu_madd(T(i,l), T(j,l), var)

/*
 * Expression: spu_madd(V(k), A[l*lda + k], var).
 * A is parenthesized so that a pointer expression such as `base + off`
 * can be passed.
 */
#define COMPUTE_V(A, l, k, var) spu_madd(V(k), (A)[(l) * lda + (k)], var)

/*
 * Statement: R(i,j) is updated by chaining COMPUTE_T for l = 0, 1, 2, 3
 * (l = 0 innermost, starting from the current R(i,j)).
 * The C parameter is accepted but not used.
 */
#define COMPUTE_TX(i, j, C) \
    R(i,j) = COMPUTE_T(i, j, 3, COMPUTE_T(i, j, 2, COMPUTE_T(i, j, 1, COMPUTE_T(i, j, 0, R(i,j)))));

/*
 * Statement: R(l,0) is updated by chaining COMPUTE_V for k = 0, 1, 2, 3
 * (k = 0 innermost, starting from the current R(l,0)).
 */
#define COMPUTE_VX(A, l) \
    R(l,0) = COMPUTE_V(A, l, 3, COMPUTE_V(A, l, 2, COMPUTE_V(A, l, 1, COMPUTE_V(A, l, 0, R(l,0)))));

/*
 * Statement: store sum_across_float4(R(i,j)) at element i*ldc + j of C.
 * sum_across_float4 is defined outside this section; presumably it adds the
 * four lanes of its argument -- confirm against its definition.
 */
#define SAVE_TX(i, j, C) *((C) + (i) * ldc + (j)) = sum_across_float4(R(i,j));
/*
 * Accumulate / store ladders for sizes n = 1 .. 16.  Each level expands to
 * the previous level followed by the steps that are new for that size:
 *
 *   COMPUTE_Tn(C)  COMPUTE_TX(i, j, C) for every pair 0 <= i <= j < n
 *   SAVE_Tn(C)     SAVE_TX(i, j, C)    for every pair 0 <= i <= j < n
 *   COMPUTE_Vn(A)  COMPUTE_VX(A, l)    for l = 0 .. n-1
 *
 * Within one level the diagonal pair (n-1, n-1) comes first, then
 * (0, n-1) .. (n-2, n-1).  The expansions already end in ';'.
 */
#define COMPUTE_T1(C) COMPUTE_TX(0, 0, C)
#define COMPUTE_V1(A) COMPUTE_VX(A, 0)
#define SAVE_T1(C) SAVE_TX(0, 0, C)

#define COMPUTE_T2(C) COMPUTE_T1(C) COMPUTE_TX(1, 1, C) COMPUTE_TX(0, 1, C)
#define COMPUTE_V2(A) COMPUTE_V1(A) COMPUTE_VX(A, 1)
#define SAVE_T2(C) SAVE_T1(C) SAVE_TX(1, 1, C) SAVE_TX(0, 1, C)

#define COMPUTE_T3(C) COMPUTE_T2(C) COMPUTE_TX(2, 2, C) \
    COMPUTE_TX(0, 2, C) COMPUTE_TX(1, 2, C)
#define COMPUTE_V3(A) COMPUTE_V2(A) COMPUTE_VX(A, 2)
#define SAVE_T3(C) SAVE_T2(C) SAVE_TX(2, 2, C) \
    SAVE_TX(0, 2, C) SAVE_TX(1, 2, C)

#define COMPUTE_T4(C) COMPUTE_T3(C) COMPUTE_TX(3, 3, C) \
    COMPUTE_TX(0, 3, C) COMPUTE_TX(1, 3, C) COMPUTE_TX(2, 3, C)
#define COMPUTE_V4(A) COMPUTE_V3(A) COMPUTE_VX(A, 3)
#define SAVE_T4(C) SAVE_T3(C) SAVE_TX(3, 3, C) \
    SAVE_TX(0, 3, C) SAVE_TX(1, 3, C) SAVE_TX(2, 3, C)

#define COMPUTE_T5(C) COMPUTE_T4(C) COMPUTE_TX(4, 4, C) \
    COMPUTE_TX(0, 4, C) COMPUTE_TX(1, 4, C) COMPUTE_TX(2, 4, C) COMPUTE_TX(3, 4, C)
#define COMPUTE_V5(A) COMPUTE_V4(A) COMPUTE_VX(A, 4)
#define SAVE_T5(C) SAVE_T4(C) SAVE_TX(4, 4, C) \
    SAVE_TX(0, 4, C) SAVE_TX(1, 4, C) SAVE_TX(2, 4, C) SAVE_TX(3, 4, C)

#define COMPUTE_T6(C) COMPUTE_T5(C) COMPUTE_TX(5, 5, C) \
    COMPUTE_TX(0, 5, C) COMPUTE_TX(1, 5, C) COMPUTE_TX(2, 5, C) COMPUTE_TX(3, 5, C) \
    COMPUTE_TX(4, 5, C)
#define COMPUTE_V6(A) COMPUTE_V5(A) COMPUTE_VX(A, 5)
#define SAVE_T6(C) SAVE_T5(C) SAVE_TX(5, 5, C) \
    SAVE_TX(0, 5, C) SAVE_TX(1, 5, C) SAVE_TX(2, 5, C) SAVE_TX(3, 5, C) \
    SAVE_TX(4, 5, C)

#define COMPUTE_T7(C) COMPUTE_T6(C) COMPUTE_TX(6, 6, C) \
    COMPUTE_TX(0, 6, C) COMPUTE_TX(1, 6, C) COMPUTE_TX(2, 6, C) COMPUTE_TX(3, 6, C) \
    COMPUTE_TX(4, 6, C) COMPUTE_TX(5, 6, C)
#define COMPUTE_V7(A) COMPUTE_V6(A) COMPUTE_VX(A, 6)
#define SAVE_T7(C) SAVE_T6(C) SAVE_TX(6, 6, C) \
    SAVE_TX(0, 6, C) SAVE_TX(1, 6, C) SAVE_TX(2, 6, C) SAVE_TX(3, 6, C) \
    SAVE_TX(4, 6, C) SAVE_TX(5, 6, C)

#define COMPUTE_T8(C) COMPUTE_T7(C) COMPUTE_TX(7, 7, C) \
    COMPUTE_TX(0, 7, C) COMPUTE_TX(1, 7, C) COMPUTE_TX(2, 7, C) COMPUTE_TX(3, 7, C) \
    COMPUTE_TX(4, 7, C) COMPUTE_TX(5, 7, C) COMPUTE_TX(6, 7, C)
#define COMPUTE_V8(A) COMPUTE_V7(A) COMPUTE_VX(A, 7)
#define SAVE_T8(C) SAVE_T7(C) SAVE_TX(7, 7, C) \
    SAVE_TX(0, 7, C) SAVE_TX(1, 7, C) SAVE_TX(2, 7, C) SAVE_TX(3, 7, C) \
    SAVE_TX(4, 7, C) SAVE_TX(5, 7, C) SAVE_TX(6, 7, C)

#define COMPUTE_T9(C) COMPUTE_T8(C) COMPUTE_TX(8, 8, C) \
    COMPUTE_TX(0, 8, C) COMPUTE_TX(1, 8, C) COMPUTE_TX(2, 8, C) COMPUTE_TX(3, 8, C) \
    COMPUTE_TX(4, 8, C) COMPUTE_TX(5, 8, C) COMPUTE_TX(6, 8, C) COMPUTE_TX(7, 8, C)
#define COMPUTE_V9(A) COMPUTE_V8(A) COMPUTE_VX(A, 8)
#define SAVE_T9(C) SAVE_T8(C) SAVE_TX(8, 8, C) \
    SAVE_TX(0, 8, C) SAVE_TX(1, 8, C) SAVE_TX(2, 8, C) SAVE_TX(3, 8, C) \
    SAVE_TX(4, 8, C) SAVE_TX(5, 8, C) SAVE_TX(6, 8, C) SAVE_TX(7, 8, C)

#define COMPUTE_T10(C) COMPUTE_T9(C) COMPUTE_TX(9, 9, C) \
    COMPUTE_TX(0, 9, C) COMPUTE_TX(1, 9, C) COMPUTE_TX(2, 9, C) COMPUTE_TX(3, 9, C) \
    COMPUTE_TX(4, 9, C) COMPUTE_TX(5, 9, C) COMPUTE_TX(6, 9, C) COMPUTE_TX(7, 9, C) \
    COMPUTE_TX(8, 9, C)
#define COMPUTE_V10(A) COMPUTE_V9(A) COMPUTE_VX(A, 9)
#define SAVE_T10(C) SAVE_T9(C) SAVE_TX(9, 9, C) \
    SAVE_TX(0, 9, C) SAVE_TX(1, 9, C) SAVE_TX(2, 9, C) SAVE_TX(3, 9, C) \
    SAVE_TX(4, 9, C) SAVE_TX(5, 9, C) SAVE_TX(6, 9, C) SAVE_TX(7, 9, C) \
    SAVE_TX(8, 9, C)

#define COMPUTE_T11(C) COMPUTE_T10(C) COMPUTE_TX(10, 10, C) \
    COMPUTE_TX(0, 10, C) COMPUTE_TX(1, 10, C) COMPUTE_TX(2, 10, C) COMPUTE_TX(3, 10, C) \
    COMPUTE_TX(4, 10, C) COMPUTE_TX(5, 10, C) COMPUTE_TX(6, 10, C) COMPUTE_TX(7, 10, C) \
    COMPUTE_TX(8, 10, C) COMPUTE_TX(9, 10, C)
#define COMPUTE_V11(A) COMPUTE_V10(A) COMPUTE_VX(A, 10)
#define SAVE_T11(C) SAVE_T10(C) SAVE_TX(10, 10, C) \
    SAVE_TX(0, 10, C) SAVE_TX(1, 10, C) SAVE_TX(2, 10, C) SAVE_TX(3, 10, C) \
    SAVE_TX(4, 10, C) SAVE_TX(5, 10, C) SAVE_TX(6, 10, C) SAVE_TX(7, 10, C) \
    SAVE_TX(8, 10, C) SAVE_TX(9, 10, C)

#define COMPUTE_T12(C) COMPUTE_T11(C) COMPUTE_TX(11, 11, C) \
    COMPUTE_TX(0, 11, C) COMPUTE_TX(1, 11, C) COMPUTE_TX(2, 11, C) COMPUTE_TX(3, 11, C) \
    COMPUTE_TX(4, 11, C) COMPUTE_TX(5, 11, C) COMPUTE_TX(6, 11, C) COMPUTE_TX(7, 11, C) \
    COMPUTE_TX(8, 11, C) COMPUTE_TX(9, 11, C) COMPUTE_TX(10, 11, C)
#define COMPUTE_V12(A) COMPUTE_V11(A) COMPUTE_VX(A, 11)
#define SAVE_T12(C) SAVE_T11(C) SAVE_TX(11, 11, C) \
    SAVE_TX(0, 11, C) SAVE_TX(1, 11, C) SAVE_TX(2, 11, C) SAVE_TX(3, 11, C) \
    SAVE_TX(4, 11, C) SAVE_TX(5, 11, C) SAVE_TX(6, 11, C) SAVE_TX(7, 11, C) \
    SAVE_TX(8, 11, C) SAVE_TX(9, 11, C) SAVE_TX(10, 11, C)

#define COMPUTE_T13(C) COMPUTE_T12(C) COMPUTE_TX(12, 12, C) \
    COMPUTE_TX(0, 12, C) COMPUTE_TX(1, 12, C) COMPUTE_TX(2, 12, C) COMPUTE_TX(3, 12, C) \
    COMPUTE_TX(4, 12, C) COMPUTE_TX(5, 12, C) COMPUTE_TX(6, 12, C) COMPUTE_TX(7, 12, C) \
    COMPUTE_TX(8, 12, C) COMPUTE_TX(9, 12, C) COMPUTE_TX(10, 12, C) COMPUTE_TX(11, 12, C)
#define COMPUTE_V13(A) COMPUTE_V12(A) COMPUTE_VX(A, 12)
#define SAVE_T13(C) SAVE_T12(C) SAVE_TX(12, 12, C) \
    SAVE_TX(0, 12, C) SAVE_TX(1, 12, C) SAVE_TX(2, 12, C) SAVE_TX(3, 12, C) \
    SAVE_TX(4, 12, C) SAVE_TX(5, 12, C) SAVE_TX(6, 12, C) SAVE_TX(7, 12, C) \
    SAVE_TX(8, 12, C) SAVE_TX(9, 12, C) SAVE_TX(10, 12, C) SAVE_TX(11, 12, C)

#define COMPUTE_T14(C) COMPUTE_T13(C) COMPUTE_TX(13, 13, C) \
    COMPUTE_TX(0, 13, C) COMPUTE_TX(1, 13, C) COMPUTE_TX(2, 13, C) COMPUTE_TX(3, 13, C) \
    COMPUTE_TX(4, 13, C) COMPUTE_TX(5, 13, C) COMPUTE_TX(6, 13, C) COMPUTE_TX(7, 13, C) \
    COMPUTE_TX(8, 13, C) COMPUTE_TX(9, 13, C) COMPUTE_TX(10, 13, C) COMPUTE_TX(11, 13, C) \
    COMPUTE_TX(12, 13, C)
#define COMPUTE_V14(A) COMPUTE_V13(A) COMPUTE_VX(A, 13)
#define SAVE_T14(C) SAVE_T13(C) SAVE_TX(13, 13, C) \
    SAVE_TX(0, 13, C) SAVE_TX(1, 13, C) SAVE_TX(2, 13, C) SAVE_TX(3, 13, C) \
    SAVE_TX(4, 13, C) SAVE_TX(5, 13, C) SAVE_TX(6, 13, C) SAVE_TX(7, 13, C) \
    SAVE_TX(8, 13, C) SAVE_TX(9, 13, C) SAVE_TX(10, 13, C) SAVE_TX(11, 13, C) \
    SAVE_TX(12, 13, C)

#define COMPUTE_T15(C) COMPUTE_T14(C) COMPUTE_TX(14, 14, C) \
    COMPUTE_TX(0, 14, C) COMPUTE_TX(1, 14, C) COMPUTE_TX(2, 14, C) COMPUTE_TX(3, 14, C) \
    COMPUTE_TX(4, 14, C) COMPUTE_TX(5, 14, C) COMPUTE_TX(6, 14, C) COMPUTE_TX(7, 14, C) \
    COMPUTE_TX(8, 14, C) COMPUTE_TX(9, 14, C) COMPUTE_TX(10, 14, C) COMPUTE_TX(11, 14, C) \
    COMPUTE_TX(12, 14, C) COMPUTE_TX(13, 14, C)
#define COMPUTE_V15(A) COMPUTE_V14(A) COMPUTE_VX(A, 14)
#define SAVE_T15(C) SAVE_T14(C) SAVE_TX(14, 14, C) \
    SAVE_TX(0, 14, C) SAVE_TX(1, 14, C) SAVE_TX(2, 14, C) SAVE_TX(3, 14, C) \
    SAVE_TX(4, 14, C) SAVE_TX(5, 14, C) SAVE_TX(6, 14, C) SAVE_TX(7, 14, C) \
    SAVE_TX(8, 14, C) SAVE_TX(9, 14, C) SAVE_TX(10, 14, C) SAVE_TX(11, 14, C) \
    SAVE_TX(12, 14, C) SAVE_TX(13, 14, C)

#define COMPUTE_T16(C) COMPUTE_T15(C) COMPUTE_TX(15, 15, C) \
    COMPUTE_TX(0, 15, C) COMPUTE_TX(1, 15, C) COMPUTE_TX(2, 15, C) COMPUTE_TX(3, 15, C) \
    COMPUTE_TX(4, 15, C) COMPUTE_TX(5, 15, C) COMPUTE_TX(6, 15, C) COMPUTE_TX(7, 15, C) \
    COMPUTE_TX(8, 15, C) COMPUTE_TX(9, 15, C) COMPUTE_TX(10, 15, C) COMPUTE_TX(11, 15, C) \
    COMPUTE_TX(12, 15, C) COMPUTE_TX(13, 15, C) COMPUTE_TX(14, 15, C)
#define COMPUTE_V16(A) COMPUTE_V15(A) COMPUTE_VX(A, 15)
#define SAVE_T16(C) SAVE_T15(C) SAVE_TX(15, 15, C) \
    SAVE_TX(0, 15, C) SAVE_TX(1, 15, C) SAVE_TX(2, 15, C) SAVE_TX(3, 15, C) \
    SAVE_TX(4, 15, C) SAVE_TX(5, 15, C) SAVE_TX(6, 15, C) SAVE_TX(7, 15, C) \
    SAVE_TX(8, 15, C) SAVE_TX(9, 15, C) SAVE_TX(10, 15, C) SAVE_TX(11, 15, C) \
    SAVE_TX(12, 15, C) SAVE_TX(13, 15, C) SAVE_TX(14, 15, C)
/*
 * Reduction-and-store macros for sizes n = 1 .. 11.
 *
 * HSUM, tmp5, zero and ldc are not defined in this section; they are taken
 * from the scope in which the macros are expanded.  The code relies on
 * HSUM(a, b, c, d) leaving its result in tmp5 -- presumably the horizontal
 * sums of a, b, c and d in lanes 0..3; confirm against HSUM's definition.
 *
 *   SUM_Tn(C)  walks the pairs 0 <= i <= j < n in the order
 *              (0,0) (0,1) (1,1) (0,2) (1,2) (2,2) ..., feeds R(i,j) to HSUM
 *              four at a time (padding the last call with `zero`), and stores
 *              each lane at element j*ldc + i of C via spu_extract.
 *              NOTE(review): SAVE_TX in this file stores R(i,j) at
 *              i*ldc + j, i.e. the transposed position -- confirm this is
 *              intended.
 *   SUM_Vn(C)  feeds R(0,0) .. R(n-1,0) to HSUM four at a time (padded with
 *              `zero`) and stores the whole tmp5 to C[0], C[1], C[2].
 *
 * All expansions already end in ';'.
 */

/* Private helper: store lane `lane` of tmp5 at element row*ldc + col of C. */
#define SUM_T_PUT(C, row, col, lane) *((C) + row * ldc + col) = spu_extract(tmp5, lane);

/*
 * Private helpers: the k-th complete group of four pairs in the SUM_T walk,
 * i.e. one HSUM call followed by its four stores.
 */
#define SUM_T_QUAD0(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    SUM_T_PUT(C, 0, 0, 0) SUM_T_PUT(C, 1, 0, 1) SUM_T_PUT(C, 1, 1, 2) SUM_T_PUT(C, 2, 0, 3)
#define SUM_T_QUAD1(C) HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    SUM_T_PUT(C, 2, 1, 0) SUM_T_PUT(C, 2, 2, 1) SUM_T_PUT(C, 3, 0, 2) SUM_T_PUT(C, 3, 1, 3)
#define SUM_T_QUAD2(C) HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    SUM_T_PUT(C, 3, 2, 0) SUM_T_PUT(C, 3, 3, 1) SUM_T_PUT(C, 4, 0, 2) SUM_T_PUT(C, 4, 1, 3)
#define SUM_T_QUAD3(C) HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    SUM_T_PUT(C, 4, 2, 0) SUM_T_PUT(C, 4, 3, 1) SUM_T_PUT(C, 4, 4, 2) SUM_T_PUT(C, 5, 0, 3)
#define SUM_T_QUAD4(C) HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    SUM_T_PUT(C, 5, 1, 0) SUM_T_PUT(C, 5, 2, 1) SUM_T_PUT(C, 5, 3, 2) SUM_T_PUT(C, 5, 4, 3)
#define SUM_T_QUAD5(C) HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    SUM_T_PUT(C, 5, 5, 0) SUM_T_PUT(C, 6, 0, 1) SUM_T_PUT(C, 6, 1, 2) SUM_T_PUT(C, 6, 2, 3)
#define SUM_T_QUAD6(C) HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    SUM_T_PUT(C, 6, 3, 0) SUM_T_PUT(C, 6, 4, 1) SUM_T_PUT(C, 6, 5, 2) SUM_T_PUT(C, 6, 6, 3)
#define SUM_T_QUAD7(C) HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    SUM_T_PUT(C, 7, 0, 0) SUM_T_PUT(C, 7, 1, 1) SUM_T_PUT(C, 7, 2, 2) SUM_T_PUT(C, 7, 3, 3)
#define SUM_T_QUAD8(C) HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    SUM_T_PUT(C, 7, 4, 0) SUM_T_PUT(C, 7, 5, 1) SUM_T_PUT(C, 7, 6, 2) SUM_T_PUT(C, 7, 7, 3)
#define SUM_T_QUAD9(C) HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    SUM_T_PUT(C, 8, 0, 0) SUM_T_PUT(C, 8, 1, 1) SUM_T_PUT(C, 8, 2, 2) SUM_T_PUT(C, 8, 3, 3)
#define SUM_T_QUAD10(C) HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    SUM_T_PUT(C, 8, 4, 0) SUM_T_PUT(C, 8, 5, 1) SUM_T_PUT(C, 8, 6, 2) SUM_T_PUT(C, 8, 7, 3)
#define SUM_T_QUAD11(C) HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    SUM_T_PUT(C, 8, 8, 0) SUM_T_PUT(C, 9, 0, 1) SUM_T_PUT(C, 9, 1, 2) SUM_T_PUT(C, 9, 2, 3)
#define SUM_T_QUAD12(C) HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    SUM_T_PUT(C, 9, 3, 0) SUM_T_PUT(C, 9, 4, 1) SUM_T_PUT(C, 9, 5, 2) SUM_T_PUT(C, 9, 6, 3)
#define SUM_T_QUAD13(C) HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    SUM_T_PUT(C, 9, 7, 0) SUM_T_PUT(C, 9, 8, 1) SUM_T_PUT(C, 9, 9, 2) SUM_T_PUT(C, 10, 0, 3)
#define SUM_T_QUAD14(C) HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    SUM_T_PUT(C, 10, 1, 0) SUM_T_PUT(C, 10, 2, 1) SUM_T_PUT(C, 10, 3, 2) SUM_T_PUT(C, 10, 4, 3)
#define SUM_T_QUAD15(C) HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    SUM_T_PUT(C, 10, 5, 0) SUM_T_PUT(C, 10, 6, 1) SUM_T_PUT(C, 10, 7, 2) SUM_T_PUT(C, 10, 8, 3)

/* SUM_Tn: the complete groups for size n, then the zero-padded tail (if any). */
#define SUM_T1(C) \
    HSUM(R(0, 0), zero, zero, zero); SUM_T_PUT(C, 0, 0, 0)
#define SUM_T2(C) \
    HSUM(R(0, 0), R(0, 1), R(1, 1), zero); \
    SUM_T_PUT(C, 0, 0, 0) SUM_T_PUT(C, 1, 0, 1) SUM_T_PUT(C, 1, 1, 2)
#define SUM_T3(C) SUM_T_QUAD0(C) \
    HSUM(R(1, 2), R(2, 2), zero, zero); \
    SUM_T_PUT(C, 2, 1, 0) SUM_T_PUT(C, 2, 2, 1)
#define SUM_T4(C) SUM_T_QUAD0(C) SUM_T_QUAD1(C) \
    HSUM(R(2, 3), R(3, 3), zero, zero); \
    SUM_T_PUT(C, 3, 2, 0) SUM_T_PUT(C, 3, 3, 1)
#define SUM_T5(C) SUM_T_QUAD0(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) \
    HSUM(R(2, 4), R(3, 4), R(4, 4), zero); \
    SUM_T_PUT(C, 4, 2, 0) SUM_T_PUT(C, 4, 3, 1) SUM_T_PUT(C, 4, 4, 2)
#define SUM_T6(C) SUM_T_QUAD0(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) SUM_T_QUAD3(C) SUM_T_QUAD4(C) \
    HSUM(R(5, 5), zero, zero, zero); \
    SUM_T_PUT(C, 5, 5, 0)
#define SUM_T7(C) SUM_T_QUAD0(C) SUM_T_QUAD1(C) SUM_T_QUAD2(C) SUM_T_QUAD3(C) \
    SUM_T_QUAD4(C) SUM_T_QUAD5(C) SUM_T_QUAD6(C)
#define SUM_T8(C) SUM_T7(C) SUM_T_QUAD7(C) SUM_T_QUAD8(C)
#define SUM_T9(C) SUM_T8(C) SUM_T_QUAD9(C) SUM_T_QUAD10(C) \
    HSUM(R(8, 8), zero, zero, zero); \
    SUM_T_PUT(C, 8, 8, 0)
#define SUM_T10(C) SUM_T8(C) SUM_T_QUAD9(C) SUM_T_QUAD10(C) SUM_T_QUAD11(C) SUM_T_QUAD12(C) \
    HSUM(R(7, 9), R(8, 9), R(9, 9), zero); \
    SUM_T_PUT(C, 9, 7, 0) SUM_T_PUT(C, 9, 8, 1) SUM_T_PUT(C, 9, 9, 2)
#define SUM_T11(C) SUM_T8(C) SUM_T_QUAD9(C) SUM_T_QUAD10(C) SUM_T_QUAD11(C) SUM_T_QUAD12(C) \
    SUM_T_QUAD13(C) SUM_T_QUAD14(C) SUM_T_QUAD15(C) \
    HSUM(R(9, 10), R(10, 10), zero, zero); \
    SUM_T_PUT(C, 10, 9, 0) SUM_T_PUT(C, 10, 10, 1)

/* SUM_Vn: C is parenthesized so a pointer expression can be passed. */
#define SUM_V1(C) HSUM(R(0, 0), zero, zero, zero); (C)[0] = tmp5;
#define SUM_V2(C) HSUM(R(0, 0), R(1, 0), zero, zero); (C)[0] = tmp5;
#define SUM_V3(C) HSUM(R(0, 0), R(1, 0), R(2, 0), zero); (C)[0] = tmp5;
#define SUM_V4(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5;
#define SUM_V5(C) SUM_V4(C) HSUM(R(4, 0), zero, zero, zero); (C)[1] = tmp5;
#define SUM_V6(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), zero, zero); (C)[1] = tmp5;
#define SUM_V7(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), R(6, 0), zero); (C)[1] = tmp5;
#define SUM_V8(C) SUM_V4(C) HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5;
#define SUM_V9(C) SUM_V8(C) HSUM(R(8, 0), zero, zero, zero); (C)[2] = tmp5;
#define SUM_V10(C) SUM_V8(C) HSUM(R(8, 0), R(9, 0), zero, zero); (C)[2] = tmp5;
#define SUM_V11(C) SUM_V8(C) HSUM(R(8, 0), R(9, 0), R(10, 0), zero); (C)[2] = tmp5;
134 |
#ifndef SUM_T_STORE4
/*
 * SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd): one HSUM over the four
 * accumulators R(ia,ja) .. R(id,jd), followed by stores of tmp5 lanes 0..3
 * (read with spu_extract) to *((P) + j*ldc + i) for the matching (i, j) pair.
 * The indices must be plain integer literals because R() token-pastes them.
 */
#define SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd) \
    HSUM(R(ia, ja), R(ib, jb), R(ic, jc), R(id, jd)); \
    *((P) + ja * ldc + ia) = spu_extract(tmp5,0); \
    *((P) + jb * ldc + ib) = spu_extract(tmp5,1); \
    *((P) + jc * ldc + ic) = spu_extract(tmp5,2); \
    *((P) + jd * ldc + id) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T12(C): reduce the accumulators R(i,j) with 0 <= i <= j <= 11 and
 * store one extracted lane per accumulator at *((C) + j*ldc + i).
 *
 * Accumulators are visited by increasing j, then increasing i
 * ((0,0), (0,1), (1,1), (0,2), ...) and handed to HSUM four at a time;
 * after each HSUM, lane k of tmp5 is stored for the k-th argument.
 * The last group holds only two accumulators and is padded with `zero`;
 * the padded lanes are not stored.
 *
 * HSUM, R, tmp5, zero and ldc must be visible where the macro is expanded.
 * NOTE(review): HSUM is defined elsewhere; it presumably leaves the
 * horizontal sum of its k-th argument in lane k of tmp5 -- confirm.
 *
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_T12(C) \
    SUM_T_STORE4(C, 0,0, 0,1, 1,1, 0,2) \
    SUM_T_STORE4(C, 1,2, 2,2, 0,3, 1,3) \
    SUM_T_STORE4(C, 2,3, 3,3, 0,4, 1,4) \
    SUM_T_STORE4(C, 2,4, 3,4, 4,4, 0,5) \
    SUM_T_STORE4(C, 1,5, 2,5, 3,5, 4,5) \
    SUM_T_STORE4(C, 5,5, 0,6, 1,6, 2,6) \
    SUM_T_STORE4(C, 3,6, 4,6, 5,6, 6,6) \
    SUM_T_STORE4(C, 0,7, 1,7, 2,7, 3,7) \
    SUM_T_STORE4(C, 4,7, 5,7, 6,7, 7,7) \
    SUM_T_STORE4(C, 0,8, 1,8, 2,8, 3,8) \
    SUM_T_STORE4(C, 4,8, 5,8, 6,8, 7,8) \
    SUM_T_STORE4(C, 8,8, 0,9, 1,9, 2,9) \
    SUM_T_STORE4(C, 3,9, 4,9, 5,9, 6,9) \
    SUM_T_STORE4(C, 7,9, 8,9, 9,9, 0,10) \
    SUM_T_STORE4(C, 1,10, 2,10, 3,10, 4,10) \
    SUM_T_STORE4(C, 5,10, 6,10, 7,10, 8,10) \
    SUM_T_STORE4(C, 9,10, 10,10, 0,11, 1,11) \
    SUM_T_STORE4(C, 2,11, 3,11, 4,11, 5,11) \
    SUM_T_STORE4(C, 6,11, 7,11, 8,11, 9,11) \
    HSUM(R(10, 11), R(11, 11), zero, zero); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); \
    *((C) + 11 * ldc + 11) = spu_extract(tmp5,1);
|
|
135 |
/*
 * SUM_V12(C): reduce R(0,0) .. R(11,0) four at a time with HSUM and copy
 * the whole tmp5 vector to (C)[0], (C)[1] and (C)[2], one per reduction.
 *
 * C is parenthesized in the expansion so that pointer expressions such as
 * `base + 1` are indexed as a whole.
 * HSUM, R and tmp5 must be visible where the macro is expanded.
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_V12(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5;
|
|
136 |
#ifndef SUM_T_STORE4
/*
 * SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd): one HSUM over the four
 * accumulators R(ia,ja) .. R(id,jd), followed by stores of tmp5 lanes 0..3
 * (read with spu_extract) to *((P) + j*ldc + i) for the matching (i, j) pair.
 * The indices must be plain integer literals because R() token-pastes them.
 */
#define SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd) \
    HSUM(R(ia, ja), R(ib, jb), R(ic, jc), R(id, jd)); \
    *((P) + ja * ldc + ia) = spu_extract(tmp5,0); \
    *((P) + jb * ldc + ib) = spu_extract(tmp5,1); \
    *((P) + jc * ldc + ic) = spu_extract(tmp5,2); \
    *((P) + jd * ldc + id) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T13(C): reduce the accumulators R(i,j) with 0 <= i <= j <= 12 and
 * store one extracted lane per accumulator at *((C) + j*ldc + i).
 *
 * Accumulators are visited by increasing j, then increasing i
 * ((0,0), (0,1), (1,1), (0,2), ...) and handed to HSUM four at a time;
 * after each HSUM, lane k of tmp5 is stored for the k-th argument.
 * The last group holds only three accumulators and is padded with `zero`;
 * the padded lane is not stored.
 *
 * HSUM, R, tmp5, zero and ldc must be visible where the macro is expanded.
 * NOTE(review): HSUM is defined elsewhere; it presumably leaves the
 * horizontal sum of its k-th argument in lane k of tmp5 -- confirm.
 *
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_T13(C) \
    SUM_T_STORE4(C, 0,0, 0,1, 1,1, 0,2) \
    SUM_T_STORE4(C, 1,2, 2,2, 0,3, 1,3) \
    SUM_T_STORE4(C, 2,3, 3,3, 0,4, 1,4) \
    SUM_T_STORE4(C, 2,4, 3,4, 4,4, 0,5) \
    SUM_T_STORE4(C, 1,5, 2,5, 3,5, 4,5) \
    SUM_T_STORE4(C, 5,5, 0,6, 1,6, 2,6) \
    SUM_T_STORE4(C, 3,6, 4,6, 5,6, 6,6) \
    SUM_T_STORE4(C, 0,7, 1,7, 2,7, 3,7) \
    SUM_T_STORE4(C, 4,7, 5,7, 6,7, 7,7) \
    SUM_T_STORE4(C, 0,8, 1,8, 2,8, 3,8) \
    SUM_T_STORE4(C, 4,8, 5,8, 6,8, 7,8) \
    SUM_T_STORE4(C, 8,8, 0,9, 1,9, 2,9) \
    SUM_T_STORE4(C, 3,9, 4,9, 5,9, 6,9) \
    SUM_T_STORE4(C, 7,9, 8,9, 9,9, 0,10) \
    SUM_T_STORE4(C, 1,10, 2,10, 3,10, 4,10) \
    SUM_T_STORE4(C, 5,10, 6,10, 7,10, 8,10) \
    SUM_T_STORE4(C, 9,10, 10,10, 0,11, 1,11) \
    SUM_T_STORE4(C, 2,11, 3,11, 4,11, 5,11) \
    SUM_T_STORE4(C, 6,11, 7,11, 8,11, 9,11) \
    SUM_T_STORE4(C, 10,11, 11,11, 0,12, 1,12) \
    SUM_T_STORE4(C, 2,12, 3,12, 4,12, 5,12) \
    SUM_T_STORE4(C, 6,12, 7,12, 8,12, 9,12) \
    HSUM(R(10, 12), R(11, 12), R(12, 12), zero); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); \
    *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2);
|
|
137 |
/*
 * SUM_V13(C): reduce R(0,0) .. R(12,0) four at a time with HSUM and copy
 * the whole tmp5 vector to (C)[0] .. (C)[3], one per reduction.
 * The last group holds only R(12,0) and is padded with `zero` in its
 * remaining three slots.
 *
 * C is parenthesized in the expansion so that pointer expressions such as
 * `base + 1` are indexed as a whole.
 * HSUM, R, tmp5 and zero must be visible where the macro is expanded.
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_V13(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), zero, zero, zero); (C)[3] = tmp5;
|
|
138 |
#ifndef SUM_T_STORE4
/*
 * SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd): one HSUM over the four
 * accumulators R(ia,ja) .. R(id,jd), followed by stores of tmp5 lanes 0..3
 * (read with spu_extract) to *((P) + j*ldc + i) for the matching (i, j) pair.
 * The indices must be plain integer literals because R() token-pastes them.
 */
#define SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd) \
    HSUM(R(ia, ja), R(ib, jb), R(ic, jc), R(id, jd)); \
    *((P) + ja * ldc + ia) = spu_extract(tmp5,0); \
    *((P) + jb * ldc + ib) = spu_extract(tmp5,1); \
    *((P) + jc * ldc + ic) = spu_extract(tmp5,2); \
    *((P) + jd * ldc + id) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T14(C): reduce the accumulators R(i,j) with 0 <= i <= j <= 13 and
 * store one extracted lane per accumulator at *((C) + j*ldc + i).
 *
 * Accumulators are visited by increasing j, then increasing i
 * ((0,0), (0,1), (1,1), (0,2), ...) and handed to HSUM four at a time;
 * after each HSUM, lane k of tmp5 is stored for the k-th argument.
 * The last group holds only R(13,13) and is padded with `zero`;
 * the padded lanes are not stored.
 *
 * HSUM, R, tmp5, zero and ldc must be visible where the macro is expanded.
 * NOTE(review): HSUM is defined elsewhere; it presumably leaves the
 * horizontal sum of its k-th argument in lane k of tmp5 -- confirm.
 *
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_T14(C) \
    SUM_T_STORE4(C, 0,0, 0,1, 1,1, 0,2) \
    SUM_T_STORE4(C, 1,2, 2,2, 0,3, 1,3) \
    SUM_T_STORE4(C, 2,3, 3,3, 0,4, 1,4) \
    SUM_T_STORE4(C, 2,4, 3,4, 4,4, 0,5) \
    SUM_T_STORE4(C, 1,5, 2,5, 3,5, 4,5) \
    SUM_T_STORE4(C, 5,5, 0,6, 1,6, 2,6) \
    SUM_T_STORE4(C, 3,6, 4,6, 5,6, 6,6) \
    SUM_T_STORE4(C, 0,7, 1,7, 2,7, 3,7) \
    SUM_T_STORE4(C, 4,7, 5,7, 6,7, 7,7) \
    SUM_T_STORE4(C, 0,8, 1,8, 2,8, 3,8) \
    SUM_T_STORE4(C, 4,8, 5,8, 6,8, 7,8) \
    SUM_T_STORE4(C, 8,8, 0,9, 1,9, 2,9) \
    SUM_T_STORE4(C, 3,9, 4,9, 5,9, 6,9) \
    SUM_T_STORE4(C, 7,9, 8,9, 9,9, 0,10) \
    SUM_T_STORE4(C, 1,10, 2,10, 3,10, 4,10) \
    SUM_T_STORE4(C, 5,10, 6,10, 7,10, 8,10) \
    SUM_T_STORE4(C, 9,10, 10,10, 0,11, 1,11) \
    SUM_T_STORE4(C, 2,11, 3,11, 4,11, 5,11) \
    SUM_T_STORE4(C, 6,11, 7,11, 8,11, 9,11) \
    SUM_T_STORE4(C, 10,11, 11,11, 0,12, 1,12) \
    SUM_T_STORE4(C, 2,12, 3,12, 4,12, 5,12) \
    SUM_T_STORE4(C, 6,12, 7,12, 8,12, 9,12) \
    SUM_T_STORE4(C, 10,12, 11,12, 12,12, 0,13) \
    SUM_T_STORE4(C, 1,13, 2,13, 3,13, 4,13) \
    SUM_T_STORE4(C, 5,13, 6,13, 7,13, 8,13) \
    SUM_T_STORE4(C, 9,13, 10,13, 11,13, 12,13) \
    HSUM(R(13, 13), zero, zero, zero); \
    *((C) + 13 * ldc + 13) = spu_extract(tmp5,0);
|
|
139 |
/*
 * SUM_V14(C): reduce R(0,0) .. R(13,0) four at a time with HSUM and copy
 * the whole tmp5 vector to (C)[0] .. (C)[3], one per reduction.
 * The last group holds R(12,0) and R(13,0) and is padded with `zero` in
 * its remaining two slots.
 *
 * C is parenthesized in the expansion so that pointer expressions such as
 * `base + 1` are indexed as a whole.
 * HSUM, R, tmp5 and zero must be visible where the macro is expanded.
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_V14(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), zero, zero); (C)[3] = tmp5;
|
|
140 |
#ifndef SUM_T_STORE4
/*
 * SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd): one HSUM over the four
 * accumulators R(ia,ja) .. R(id,jd), followed by stores of tmp5 lanes 0..3
 * (read with spu_extract) to *((P) + j*ldc + i) for the matching (i, j) pair.
 * The indices must be plain integer literals because R() token-pastes them.
 */
#define SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd) \
    HSUM(R(ia, ja), R(ib, jb), R(ic, jc), R(id, jd)); \
    *((P) + ja * ldc + ia) = spu_extract(tmp5,0); \
    *((P) + jb * ldc + ib) = spu_extract(tmp5,1); \
    *((P) + jc * ldc + ic) = spu_extract(tmp5,2); \
    *((P) + jd * ldc + id) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T15(C): reduce the accumulators R(i,j) with 0 <= i <= j <= 14 and
 * store one extracted lane per accumulator at *((C) + j*ldc + i).
 *
 * Accumulators are visited by increasing j, then increasing i
 * ((0,0), (0,1), (1,1), (0,2), ...) and handed to HSUM four at a time;
 * after each HSUM, lane k of tmp5 is stored for the k-th argument.
 * The 120 accumulators fill exactly 30 groups, so no padding is needed.
 *
 * HSUM, R, tmp5 and ldc must be visible where the macro is expanded.
 * NOTE(review): HSUM is defined elsewhere; it presumably leaves the
 * horizontal sum of its k-th argument in lane k of tmp5 -- confirm.
 *
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_T15(C) \
    SUM_T_STORE4(C, 0,0, 0,1, 1,1, 0,2) \
    SUM_T_STORE4(C, 1,2, 2,2, 0,3, 1,3) \
    SUM_T_STORE4(C, 2,3, 3,3, 0,4, 1,4) \
    SUM_T_STORE4(C, 2,4, 3,4, 4,4, 0,5) \
    SUM_T_STORE4(C, 1,5, 2,5, 3,5, 4,5) \
    SUM_T_STORE4(C, 5,5, 0,6, 1,6, 2,6) \
    SUM_T_STORE4(C, 3,6, 4,6, 5,6, 6,6) \
    SUM_T_STORE4(C, 0,7, 1,7, 2,7, 3,7) \
    SUM_T_STORE4(C, 4,7, 5,7, 6,7, 7,7) \
    SUM_T_STORE4(C, 0,8, 1,8, 2,8, 3,8) \
    SUM_T_STORE4(C, 4,8, 5,8, 6,8, 7,8) \
    SUM_T_STORE4(C, 8,8, 0,9, 1,9, 2,9) \
    SUM_T_STORE4(C, 3,9, 4,9, 5,9, 6,9) \
    SUM_T_STORE4(C, 7,9, 8,9, 9,9, 0,10) \
    SUM_T_STORE4(C, 1,10, 2,10, 3,10, 4,10) \
    SUM_T_STORE4(C, 5,10, 6,10, 7,10, 8,10) \
    SUM_T_STORE4(C, 9,10, 10,10, 0,11, 1,11) \
    SUM_T_STORE4(C, 2,11, 3,11, 4,11, 5,11) \
    SUM_T_STORE4(C, 6,11, 7,11, 8,11, 9,11) \
    SUM_T_STORE4(C, 10,11, 11,11, 0,12, 1,12) \
    SUM_T_STORE4(C, 2,12, 3,12, 4,12, 5,12) \
    SUM_T_STORE4(C, 6,12, 7,12, 8,12, 9,12) \
    SUM_T_STORE4(C, 10,12, 11,12, 12,12, 0,13) \
    SUM_T_STORE4(C, 1,13, 2,13, 3,13, 4,13) \
    SUM_T_STORE4(C, 5,13, 6,13, 7,13, 8,13) \
    SUM_T_STORE4(C, 9,13, 10,13, 11,13, 12,13) \
    SUM_T_STORE4(C, 13,13, 0,14, 1,14, 2,14) \
    SUM_T_STORE4(C, 3,14, 4,14, 5,14, 6,14) \
    SUM_T_STORE4(C, 7,14, 8,14, 9,14, 10,14) \
    SUM_T_STORE4(C, 11,14, 12,14, 13,14, 14,14)
|
|
141 |
/*
 * SUM_V15(C): reduce R(0,0) .. R(14,0) four at a time with HSUM and copy
 * the whole tmp5 vector to (C)[0] .. (C)[3], one per reduction.
 * The last group is padded with `zero` in its fourth slot.
 *
 * C is parenthesized in the expansion so that pointer expressions such as
 * `base + 1` are indexed as a whole.
 * HSUM, R, tmp5 and zero must be visible where the macro is expanded.
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_V15(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), zero); (C)[3] = tmp5;
|
|
142 |
#ifndef SUM_T_STORE4
/*
 * SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd): one HSUM over the four
 * accumulators R(ia,ja) .. R(id,jd), followed by stores of tmp5 lanes 0..3
 * (read with spu_extract) to *((P) + j*ldc + i) for the matching (i, j) pair.
 * The indices must be plain integer literals because R() token-pastes them.
 */
#define SUM_T_STORE4(P, ia, ja, ib, jb, ic, jc, id, jd) \
    HSUM(R(ia, ja), R(ib, jb), R(ic, jc), R(id, jd)); \
    *((P) + ja * ldc + ia) = spu_extract(tmp5,0); \
    *((P) + jb * ldc + ib) = spu_extract(tmp5,1); \
    *((P) + jc * ldc + ic) = spu_extract(tmp5,2); \
    *((P) + jd * ldc + id) = spu_extract(tmp5,3);
#endif

/*
 * SUM_T16(C): reduce the accumulators R(i,j) with 0 <= i <= j <= 15 and
 * store one extracted lane per accumulator at *((C) + j*ldc + i).
 *
 * Accumulators are visited by increasing j, then increasing i
 * ((0,0), (0,1), (1,1), (0,2), ...) and handed to HSUM four at a time;
 * after each HSUM, lane k of tmp5 is stored for the k-th argument.
 * The 136 accumulators fill exactly 34 groups, so no padding is needed.
 *
 * HSUM, R, tmp5 and ldc must be visible where the macro is expanded.
 * NOTE(review): HSUM is defined elsewhere; it presumably leaves the
 * horizontal sum of its k-th argument in lane k of tmp5 -- confirm.
 *
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_T16(C) \
    SUM_T_STORE4(C, 0,0, 0,1, 1,1, 0,2) \
    SUM_T_STORE4(C, 1,2, 2,2, 0,3, 1,3) \
    SUM_T_STORE4(C, 2,3, 3,3, 0,4, 1,4) \
    SUM_T_STORE4(C, 2,4, 3,4, 4,4, 0,5) \
    SUM_T_STORE4(C, 1,5, 2,5, 3,5, 4,5) \
    SUM_T_STORE4(C, 5,5, 0,6, 1,6, 2,6) \
    SUM_T_STORE4(C, 3,6, 4,6, 5,6, 6,6) \
    SUM_T_STORE4(C, 0,7, 1,7, 2,7, 3,7) \
    SUM_T_STORE4(C, 4,7, 5,7, 6,7, 7,7) \
    SUM_T_STORE4(C, 0,8, 1,8, 2,8, 3,8) \
    SUM_T_STORE4(C, 4,8, 5,8, 6,8, 7,8) \
    SUM_T_STORE4(C, 8,8, 0,9, 1,9, 2,9) \
    SUM_T_STORE4(C, 3,9, 4,9, 5,9, 6,9) \
    SUM_T_STORE4(C, 7,9, 8,9, 9,9, 0,10) \
    SUM_T_STORE4(C, 1,10, 2,10, 3,10, 4,10) \
    SUM_T_STORE4(C, 5,10, 6,10, 7,10, 8,10) \
    SUM_T_STORE4(C, 9,10, 10,10, 0,11, 1,11) \
    SUM_T_STORE4(C, 2,11, 3,11, 4,11, 5,11) \
    SUM_T_STORE4(C, 6,11, 7,11, 8,11, 9,11) \
    SUM_T_STORE4(C, 10,11, 11,11, 0,12, 1,12) \
    SUM_T_STORE4(C, 2,12, 3,12, 4,12, 5,12) \
    SUM_T_STORE4(C, 6,12, 7,12, 8,12, 9,12) \
    SUM_T_STORE4(C, 10,12, 11,12, 12,12, 0,13) \
    SUM_T_STORE4(C, 1,13, 2,13, 3,13, 4,13) \
    SUM_T_STORE4(C, 5,13, 6,13, 7,13, 8,13) \
    SUM_T_STORE4(C, 9,13, 10,13, 11,13, 12,13) \
    SUM_T_STORE4(C, 13,13, 0,14, 1,14, 2,14) \
    SUM_T_STORE4(C, 3,14, 4,14, 5,14, 6,14) \
    SUM_T_STORE4(C, 7,14, 8,14, 9,14, 10,14) \
    SUM_T_STORE4(C, 11,14, 12,14, 13,14, 14,14) \
    SUM_T_STORE4(C, 0,15, 1,15, 2,15, 3,15) \
    SUM_T_STORE4(C, 4,15, 5,15, 6,15, 7,15) \
    SUM_T_STORE4(C, 8,15, 9,15, 10,15, 11,15) \
    SUM_T_STORE4(C, 12,15, 13,15, 14,15, 15,15)
|
|
143 |
/*
 * SUM_V16(C): reduce R(0,0) .. R(15,0) four at a time with HSUM and copy
 * the whole tmp5 vector to (C)[0] .. (C)[3], one per reduction.
 *
 * C is parenthesized in the expansion so that pointer expressions such as
 * `base + 1` are indexed as a whole.
 * HSUM, R and tmp5 must be visible where the macro is expanded.
 * The expansion is a bare sequence of statements whose last statement
 * already ends in ';', so it is not safe as an unbraced if/else/loop body.
 */
#define SUM_V16(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), R(15, 0)); (C)[3] = tmp5;
|