1
// Disassembling __globfunc__Z6vecAddPfS_S_ (0)
2
// Kernel entry point. The mangled name decodes (Itanium C++ ABI) to
// vecAdd(float*, float*, float*).
// Visible data flow: an index taken from the low half of $r0 is scaled by 4
// and added to the 32-bit words at s[0x0010], s[0x0018] and s[0x0020];
// a float sum is stored through the address built from s[0x0010].
// NOTE(review): s[0x0010]/s[0x0018]/s[0x0020] are presumably the three
// pointer arguments and g[] presumably global memory -- confirm against the
// decuda documentation.
// NOTE(review): the bare numbers between instructions look like line numbers
// of the original listing; they skip 12-13 and 17, so some lines are missing
// from this view.
.entry __globfunc__Z6vecAddPfS_S_
// $r0 = low 16 bits of $r0, zero-extended to 32 bits.
// NOTE(review): presumably the thread index within the block -- confirm.
8
cvt.u32.u16 $r0, $r0.lo
// $r2 = $r0 << 2, i.e. the index times 4 (byte offset of a 4-byte element).
9
shl.u32 $r2, $r0, 0x00000002
// $r0 = s[0x0018] + byte offset.
10
add.half.b32 $r0, s[0x0018], $r2
// $r3 = s[0x0020] + byte offset.
11
add.half.b32 $r3, s[0x0020], $r2
// $r1 = $r1 + $r0 as f32.
// NOTE(review): $r1 has no visible definition and $r0/$r3 still hold
// addresses here; the instructions numbered 12-13 (presumably loads through
// $r0 and $r3) are not part of this listing.
14
add.half.rn.f32 $r1, $r1, $r0
// $r0 = s[0x0010] + byte offset (address used by the store below).
15
add.half.b32 $r0, s[0x0010], $r2
// Store $r1 to g[$r0]. NOTE(review): ".end" presumably marks the last
// instruction of the kernel -- confirm.
16
mov.end.u32 g[$r0], $r1
18
// Disassembling __globfunc__Z10transpose1PfS_ii (1)
19
// Kernel entry point. The mangled name decodes (Itanium C++ ABI) to
// transpose1(float*, float*, int, int).
// Visible data flow:
//   $r2 = s[0x000e] * %nctaid.x + ($r0.hi & c1[0x0000])
//   $r3 = s[0x000c] * %ntid.y   + $r0.lo
//   $r2 (final) = s[0x0018] + 4 * (s[0x0020] * $r2 + $r3)
//   $r1 (final) = s[0x0010] + 4 * (s[0x0024] * $r3 + $r2)   (store address)
// The two addresses use the two indices with their roles swapped.
// NOTE(review): s[0x0010]/s[0x0018] are presumably the two pointer arguments,
// s[0x0020]/s[0x0024] the two int arguments, and s[0x000c]/s[0x000e] the
// block indices -- confirm against the decuda / G80 documentation.
// NOTE(review): the bare numbers between instructions look like line numbers
// of the original listing; they skip 20-24, 48, 51 and 53-54, so some lines
// are missing from this view.
.entry __globfunc__Z10transpose1PfS_ii
// $r1.hi = $r0.hi masked with constant-segment word c1[0x0000]
// (listed as 0x000003ff at the end of this kernel).
25
and.b16 $r1.hi, $r0.hi, c1[0x0000]
// $r1.lo = %nctaid.x
26
mov.half.b16 $r1.lo, %nctaid.x
// $r0.hi = %ntid.y (overwrites the half that was just masked into $r1.hi).
27
mov.half.b16 $r0.hi, %ntid.y
// $r2 = masked value, zero-extended.
28
cvt.u32.u16 $r2, $r1.hi
// $r3 = $r0.lo, zero-extended.
29
cvt.u32.u16 $r3, $r0.lo
// $r2 = s[0x000e] * $r1.lo + $r2
30
mad24.lo.u32.u16.u16.u32 $r2, s[0x000e], $r1.lo, $r2
// $r3 = s[0x000c] * $r0.hi + $r3
31
mad24.lo.u32.u16.u16.u32 $r3, s[0x000c], $r0.hi, $r3
// Fetch the 32-bit words at s[0x0020] and s[0x0024].
32
mov.b32 $r1, s[0x0020]
33
mov.b32 $r0, s[0x0024]
// The next eight instructions form two interleaved 32-bit multiplies built
// from 16-bit partial products: (lo*hi + hi*lo) << 16, then + lo*lo.
// Results: $r1 = s[0x0020] * $r2 and $r0 = s[0x0024] * $r3 (low 32 bits).
34
mul24.half.lo.u32.u16.u16 $r5, $r1.lo, $r2.hi
35
mul24.half.lo.u32.u16.u16 $r4, $r0.lo, $r3.hi
36
mad24.lo.u32.u16.u16.u32 $r5, $r1.hi, $r2.lo, $r5
37
mad24.lo.u32.u16.u16.u32 $r4, $r0.hi, $r3.lo, $r4
38
shl.u32 $r5, $r5, 0x00000010
39
shl.u32 $r4, $r4, 0x00000010
40
mad24.lo.u32.u16.u16.u32 $r1, $r1.lo, $r2.lo, $r5
41
mad24.lo.u32.u16.u16.u32 $r0, $r0.lo, $r3.lo, $r4
// $r1 = $r3 + s[0x0020] * $r2
42
add.half.b32 $r1, $r3, $r1
// $r0 = s[0x0024] * $r3 + $r2
43
add.half.b32 $r0, $r0, $r2
// Scale both element indices by 4 to get byte offsets.
44
shl.u32 $r1, $r1, 0x00000002
45
shl.u32 $r0, $r0, 0x00000002
// $r2 = s[0x0018] + first byte offset.
46
add.half.b32 $r2, s[0x0018], $r1
// $r1 = s[0x0010] + second byte offset (address used by the store below).
47
add.half.b32 $r1, s[0x0010], $r0
// Store $r0 to g[$r1].
// NOTE(review): in the visible code $r0 still holds a byte offset and the
// address in $r2 is never read; the instruction numbered 48 (presumably a
// load through $r2 into $r0) is not part of this listing.
49
mov.end.u32 g[$r1], $r0
// Constant segment 1, offset 0x0000: the 32-bit word 0x000003ff read above
// as c1[0x0000] (a mask keeping the low 10 bits).
50
#.constseg 1:0x0000 const
52
#d.32 0x000003ff // 0000
55
// Disassembling __globfunc__Z10transpose2PfS_ii (2)
56
// Kernel entry point. The mangled name decodes (Itanium C++ ABI) to
// transpose2(float*, float*, int, int).
// Visible data flow:
//   $r6 = $r1.hi & c1[0x0000], $r7 = $r1.lo
//   before the barrier: a word is written to s[0x0030 + 4 * (%ntid.y * $r6 + $r7)]
//   after the barrier:  a word is read from s[0x0030 + 4 * ($r7 * %nctaid.x + $r6)]
//                       and stored through an address based on s[0x0010]
// so the s[] array is written and read with $r6/$r7 in swapped roles.
// NOTE(review): s[] is presumably shared memory with the kernel arguments at
// 0x0010..0x0027 and a staging tile from 0x0030; g[] presumably global
// memory -- confirm against the decuda / G80 documentation.
// NOTE(review): the bare numbers between instructions look like line numbers
// of the original listing; they skip 57-62, 68, 82, 91, 93, 100 and 102-103,
// so some lines are missing from this view. Registers read without a visible
// definition are flagged below.
.entry __globfunc__Z10transpose2PfS_ii
// $r2 = %nctaid.x, zero-extended.
63
cvt.u32.u16 $r2, %nctaid.x
// $r0.lo = $r1.hi masked with c1[0x0000] (0x000003ff, listed at the end).
// NOTE(review): $r1 has no visible definition; lines 57-62 are missing.
64
and.b16 $r0.lo, $r1.hi, c1[0x0000]
// $r6 = masked value, zero-extended.
65
cvt.u32.u16 $r6, $r0.lo
// $r5 = s[0x000e] * $r2.lo
66
mul24.half.lo.u32.u16.u16 $r5, s[0x000e], $r2.lo
// $r0 = 32-bit word at s[0x0020].
67
mov.half.b32 $r0, s[0x0020]
// 32-bit multiply $r0 * $r3 from 16-bit partial products (this instruction,
// the mad24 after it, the shl by 16 and the mad24 numbered 75).
// NOTE(review): $r3 has no visible definition; line 68 is missing
// (presumably $r3 = $r5 + $r6 -- confirm).
69
mul24.lo.u32.u16.u16 $r4, $r0.lo, $r3.hi
70
mad24.lo.u32.u16.u16.u32 $r8, $r0.hi, $r3.lo, $r4
// $r4 = %ntid.y, zero-extended.
71
cvt.u32.u16 $r4, %ntid.y
// $r7 = $r1.lo, zero-extended.
72
cvt.u32.u16 $r7, $r1.lo
73
shl.u32 $r8, $r8, 0x00000010
// $r1 = s[0x000c] * $r4.lo
74
mul24.lo.u32.u16.u16 $r1, s[0x000c], $r4.lo
// $r0 = s[0x0020] * $r3 (low 32 bits).
75
mad24.lo.u32.u16.u16.u32 $r0, $r0.lo, $r3.lo, $r8
// $r3 = $r1 + $r7
76
add.half.b32 $r3, $r1, $r7
// $r0 = s[0x0020] * (old $r3) + ($r1 + $r7)
77
add.half.b32 $r0, $r0, $r3
// $r3 = element index * 4 (byte offset).
78
shl.u32 $r3, $r0, 0x00000002
// $r0 = $r4 * $r6 + $r7 (index into the s[] array).
79
mad24.lo.u32 $r0, $r4, $r6, $r7
// $r3 = s[0x0018] + byte offset.
80
add.u32 $r3, s[0x0018], $r3
// $ofs1 = $r0 shifted left by 2 (byte offset into s[]).
// NOTE(review): shift direction assumed from the "movsh" mnemonic -- confirm.
81
movsh.b32 $ofs1, $r0, 0x00000002
// Write $r0 to s[$ofs1 + 0x0030].
// NOTE(review): in the visible code $r0 still holds the index and the address
// in $r3 is never read; line 82 (presumably a load through $r3 into $r0) is
// missing.
83
mov.b32 s[$ofs1+0x0030], $r0
// Synchronise on barrier 0.
// NOTE(review): presumably __syncthreads() in the CUDA source -- confirm.
84
bar.sync.u32 0x00000000
// $r1 = s[0x000c] * %ntid.y + $r6
85
add.half.b32 $r1, $r1, $r6
// $r0 = 32-bit word at s[0x0024].
86
mov.half.b32 $r0, s[0x0024]
// 32-bit multiply $r1 = s[0x0024] * $r1 from 16-bit partial products.
87
mul24.lo.u32.u16.u16 $r3, $r0.lo, $r1.hi
88
mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r3
89
shl.u32 $r3, $r3, 0x00000010
90
mad24.lo.u32.u16.u16.u32 $r1, $r0.lo, $r1.lo, $r3
// $r2 = $r7 * $r2 + $r6 (index into s[] with $r6/$r7 swapped relative to
// the index used for the write above).
92
mad24.lo.u32 $r2, $r7, $r2, $r6
94
movsh.b32 $ofs1, $r2, 0x00000002
// $r1 = $r0 * 4.
// NOTE(review): in the visible code $r0 is still the word from s[0x0024] and
// the product in $r1 is discarded; lines 91 and 93 are missing (transpose3
// has two add.u32 instructions writing $r0 at the corresponding positions).
95
shl.u32 $r1, $r0, 0x00000002
// $r0 = word at s[$ofs1 + 0x0030].
96
mov.half.b32 $r0, s[$ofs1+0x0030]
// $r1 = s[0x0010] + byte offset (address used by the store below).
97
add.half.b32 $r1, s[0x0010], $r1
// Store $r0 to g[$r1].
98
mov.end.u32 g[$r1], $r0
// Constant segment 1, offset 0x0000: the 32-bit word 0x000003ff read above
// as c1[0x0000] (a mask keeping the low 10 bits).
99
#.constseg 1:0x0000 const
101
#d.32 0x000003ff // 0000
104
// Disassembling __globfunc__Z10transpose3PfS_ii (3)
105
// Kernel entry point. The mangled name decodes (Itanium C++ ABI) to
// transpose3(float*, float*, int, int).
// Visible data flow:
//   $r5 = $r1.hi & c1[0x0000], $r6 = $r1.lo
//   $r4 = s[0x000e] * %nctaid.x, $r1 = s[0x000c] * %ntid.y
//   $r2 = %ntid.y + 1   (row pitch of the s[] array)
//   before the barrier: a word is written to s[0x0030 + 4 * ($r2 * $r5 + $r6)]
//   after the barrier:  a word is read from s[0x0030 + 4 * ($r6 * $r2 + $r5)]
//                       and stored to g[s[0x0010] + 4 * (s[0x0024] * ($r1 + $r5) + $r4 + $r6)]
// NOTE(review): the "+ 1" on the row pitch is presumably padding to avoid
// shared-memory bank conflicts -- confirm against the CUDA source.
// NOTE(review): s[] is presumably shared memory with the kernel arguments at
// 0x0010..0x0027 and a staging tile from 0x0030; g[] presumably global
// memory -- confirm against the decuda / G80 documentation.
// NOTE(review): the bare numbers between instructions look like line numbers
// of the original listing; they skip 106-111, 132 and 150, so some lines are
// missing from this view.
.entry __globfunc__Z10transpose3PfS_ii
// $r0.lo = %nctaid.x
112
mov.b16 $r0.lo, %nctaid.x
// $r0.hi = $r1.hi masked with c1[0x0000] (0x000003ff, listed at the end).
// NOTE(review): $r1 has no visible definition; lines 106-111 are missing.
113
and.b16 $r0.hi, $r1.hi, c1[0x0000]
// $r4 = s[0x000e] * $r0.lo
114
mul24.lo.u32.u16.u16 $r4, s[0x000e], $r0.lo
// $r5 = masked value, zero-extended.
115
cvt.u32.u16 $r5, $r0.hi
// $r0 = 32-bit word at s[0x0020].
116
mov.b32 $r0, s[0x0020]
// $r3 = $r4 + $r5
117
add.u32 $r3, $r4, $r5
// 32-bit multiply $r0 * $r3 from 16-bit partial products (this instruction,
// the mad24 after it, the shl by 16 and the mad24 numbered 124).
118
mul24.lo.u32.u16.u16 $r2, $r0.lo, $r3.hi
119
mad24.lo.u32.u16.u16.u32 $r7, $r0.hi, $r3.lo, $r2
// $r2 = %ntid.y, zero-extended.
120
cvt.u32.u16 $r2, %ntid.y
// $r6 = $r1.lo, zero-extended.
121
cvt.u32.u16 $r6, $r1.lo
122
shl.u32 $r7, $r7, 0x00000010
// $r1 = s[0x000c] * $r2.lo
123
mul24.lo.u32.u16.u16 $r1, s[0x000c], $r2.lo
// $r0 = s[0x0020] * $r3 (low 32 bits).
124
mad24.lo.u32.u16.u16.u32 $r0, $r0.lo, $r3.lo, $r7
// $r3 = $r1 + $r6
125
add.half.b32 $r3, $r1, $r6
// $r0 = s[0x0020] * ($r4 + $r5) + ($r1 + $r6)
126
add.half.b32 $r0, $r0, $r3
// $r2 = %ntid.y + 1
127
add.b32 $r2, $r2, 0x00000001
// $r3 = element index * 4 (byte offset).
128
shl.u32 $r3, $r0, 0x00000002
// $r0 = $r2 * $r5 + $r6 (index into the s[] array).
129
mad24.lo.u32 $r0, $r2, $r5, $r6
// $r3 = s[0x0018] + byte offset.
130
add.u32 $r3, s[0x0018], $r3
// $ofs1 = $r0 shifted left by 2 (byte offset into s[]).
// NOTE(review): shift direction assumed from the "movsh" mnemonic -- confirm.
131
movsh.b32 $ofs1, $r0, 0x00000002
// Write $r0 to s[$ofs1 + 0x0030].
// NOTE(review): in the visible code $r0 still holds the index and the address
// in $r3 is never read; line 132 (presumably a load through $r3 into $r0) is
// missing.
133
mov.b32 s[$ofs1+0x0030], $r0
// Synchronise on barrier 0.
// NOTE(review): presumably __syncthreads() in the CUDA source -- confirm.
134
bar.sync.u32 0x00000000
// $r1 = s[0x000c] * %ntid.y + $r5
135
add.half.b32 $r1, $r1, $r5
// $r0 = 32-bit word at s[0x0024].
136
mov.half.b32 $r0, s[0x0024]
// 32-bit multiply $r1 = s[0x0024] * $r1 from 16-bit partial products.
137
mul24.lo.u32.u16.u16 $r3, $r0.lo, $r1.hi
138
mad24.lo.u32.u16.u16.u32 $r3, $r0.hi, $r1.lo, $r3
139
shl.u32 $r3, $r3, 0x00000010
140
mad24.lo.u32.u16.u16.u32 $r1, $r0.lo, $r1.lo, $r3
// $r0 = $r4 + $r6
141
add.u32 $r0, $r4, $r6
// $r2 = $r6 * $r2 + $r5 (index into s[] with $r5/$r6 swapped relative to
// the index used for the write above; same pitch $r2).
142
mad24.lo.u32 $r2, $r6, $r2, $r5
// $r0 = s[0x0024] * ($r1 + $r5) + ($r4 + $r6)
143
add.u32 $r0, $r1, $r0
144
movsh.b32 $ofs1, $r2, 0x00000002
// $r1 = element index * 4 (byte offset).
145
shl.u32 $r1, $r0, 0x00000002
// $r0 = word at s[$ofs1 + 0x0030].
146
mov.half.b32 $r0, s[$ofs1+0x0030]
// $r1 = s[0x0010] + byte offset (address used by the store below).
147
add.half.b32 $r1, s[0x0010], $r1
// Store $r0 to g[$r1].
148
mov.end.u32 g[$r1], $r0
// Constant segment 1, offset 0x0000: the 32-bit word 0x000003ff read above
// as c1[0x0000] (a mask keeping the low 10 bits).
149
#.constseg 1:0x0000 const
151
#d.32 0x000003ff // 0000