/* Idealized SIMD Operations with SSE versions
   Copyright (C) 2006, 2007, 2008, Robert D. Cameron
   Licensed to the public under the Open Software License 3.0.
   Licensed to International Characters Inc.
   under the Academic Free License version 3.0.
*/
/*------------------------------------------------------------*/
/* Endianness constants.  Guarded: system headers such as <endian.h>
   or <sys/param.h> may already define these names. */
#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN 1234
#endif
#ifndef BIG_ENDIAN
#define BIG_ENDIAN 4321
#endif
#ifndef BYTE_ORDER
#define BYTE_ORDER LITTLE_ENDIAN
#endif

/* Bits in an unsigned long.  Guarded: POSIX <limits.h> may define LONG_BIT. */
#ifndef LONG_BIT
#define LONG_BIT (8 * sizeof(unsigned long))
#endif
24
#include <emmintrin.h>
26
#include <pmmintrin.h>
28
typedef __m128i SIMD_type;
29
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */

#define simd_or(b1, b2) _mm_or_si128(b1, b2)
#define simd_and(b1, b2) _mm_and_si128(b1, b2)
#define simd_xor(b1, b2) _mm_xor_si128(b1, b2)
/* and-complement: b1 & ~b2.  Note the argument swap: _mm_andnot_si128
   complements its FIRST operand. */
#define simd_andc(b1, b2) _mm_andnot_si128(b2, b1)
/* Bitwise select: (cond & then_val) | (~cond & else_val). */
#define simd_if(cond, then_val, else_val) \
  simd_or(simd_and(then_val, cond), simd_andc(else_val, cond))
#define simd_not(b) (simd_xor(b, _mm_set1_epi32(0xFFFFFFFF)))
#define simd_nor(a,b) (simd_not(simd_or(a,b)))

/* Specific constants: masks selecting the high half of every
   n-bit field across the 128-bit register. */
#define simd_himask_2 _mm_set1_epi8(0xAA)
#define simd_himask_4 _mm_set1_epi8(0xCC)
#define simd_himask_8 _mm_set1_epi8(0xF0)
#define simd_himask_16 _mm_set1_epi16(0xFF00)
#define simd_himask_32 _mm_set1_epi32(0xFFFF0000)
#define simd_himask_64 _mm_set_epi32(-1,0,-1,0)
#define simd_himask_128 _mm_set_epi32(-1,-1,0,0)
/* Idealized operations with direct implementation by built-in
   operations for various target architectures. */

/* Field-wise add/subtract at 8/16/32/64-bit widths. */
#define simd_add_8(a, b) _mm_add_epi8(a, b)
#define simd_add_16(a, b) _mm_add_epi16(a, b)
#define simd_add_32(a, b) _mm_add_epi32(a, b)
#define simd_add_64(a, b) _mm_add_epi64(a, b)
#define simd_sub_8(a, b) _mm_sub_epi8(a, b)
#define simd_sub_16(a, b) _mm_sub_epi16(a, b)
#define simd_sub_32(a, b) _mm_sub_epi32(a, b)
#define simd_sub_64(a, b) _mm_sub_epi64(a, b)
/* Low 16 bits of the 16x16 product in each field. */
#define simd_mult_16(a, b) _mm_mullo_epi16(a, b)

/* Field-wise shifts by an immediate count (slli/srli/srai) and by a
   count held in a register (sll/srl). */
#define simd_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define simd_srli_16(r, shft) _mm_srli_epi16(r, shft)
#define simd_srai_16(r, shft) _mm_srai_epi16(r, shft)
#define simd_slli_32(r, shft) _mm_slli_epi32(r, shft)
#define simd_srli_32(r, shft) _mm_srli_epi32(r, shft)
#define simd_srai_32(r, shft) _mm_srai_epi32(r, shft)
#define simd_slli_64(r, shft) _mm_slli_epi64(r, shft)
#define simd_srli_64(r, shft) _mm_srli_epi64(r, shft)
#define simd_sll_64(r, shft_reg) _mm_sll_epi64(r, shft_reg)
#define simd_srl_64(r, shft_reg) _mm_srl_epi64(r, shft_reg)

/* Narrow 16-bit fields to 8 bits, keeping the low byte of each field.
   High bytes are masked off first so packus cannot saturate. */
#define simd_pack_16(a, b) \
  _mm_packus_epi16(simd_andc(b, simd_himask_16), simd_andc(a, simd_himask_16))

/* Interleave ("merge") fields.  Argument order is deliberately (b, a):
   this library numbers fields in the opposite direction from the Intel
   unpack intrinsics. */
#define simd_mergeh_8(a, b) _mm_unpackhi_epi8(b, a)
#define simd_mergeh_16(a, b) _mm_unpackhi_epi16(b, a)
#define simd_mergeh_32(a, b) _mm_unpackhi_epi32(b, a)
#define simd_mergeh_64(a, b) _mm_unpackhi_epi64(b, a)
#define simd_mergel_8(a, b) _mm_unpacklo_epi8(b, a)
#define simd_mergel_16(a, b) _mm_unpacklo_epi16(b, a)
#define simd_mergel_32(a, b) _mm_unpacklo_epi32(b, a)
#define simd_mergel_64(a, b) _mm_unpacklo_epi64(b, a)

/* Field-wise equality: all-ones in equal fields, zero otherwise. */
#define simd_eq_8(a, b) _mm_cmpeq_epi8(a, b)
#define simd_eq_16(a, b) _mm_cmpeq_epi16(a, b)
#define simd_eq_32(a, b) _mm_cmpeq_epi32(a, b)

/* Unsigned byte-wise maximum. */
#define simd_max_8(a, b) _mm_max_epu8(a, b)
/* Full 128-bit shifts by an immediate bit count (shft must be a
   compile-time constant for the _mm_s{l,r}li_si128 byte shifts).
   Three cases: whole bytes, count >= 64, and sub-64 counts that must
   carry bits across the 64-bit lane boundary. */
#define simd_slli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_slli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_slli_64(_mm_slli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_slli_64(r, shft), _mm_slli_si128(simd_srli_64(r, 64-(shft)), 8)))

#define simd_srli_128(r, shft) \
  ((shft) % 8 == 0 ? _mm_srli_si128(r, (shft)/8) : \
   (shft) >= 64 ? simd_srli_64(_mm_srli_si128(r, 8), (shft) - 64) : \
   simd_or(simd_srli_64(r, shft), _mm_srli_si128(simd_slli_64(r, 64-(shft)), 8)))
/* Full 128-bit shifts by a count held in the low bits of a register.
   SSE 64-bit shifts yield zero for out-of-range counts, so for any
   count in [0,128) at most two of the or-ed terms are non-zero: the
   direct per-lane shift, the ">= 64" case, and the cross-lane carry. */
#define simd_sll_128(r, shft) \
  simd_or(simd_sll_64(r, shft), \
          simd_or(_mm_slli_si128(simd_sll_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                  _mm_slli_si128(simd_srl_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

#define simd_srl_128(r, shft) \
  simd_or(simd_srl_64(r, shft), \
          simd_or(_mm_srli_si128(simd_srl_64(r, simd_sub_32(shft, sisd_from_int(64))), 8), \
                  _mm_srli_si128(simd_sll_64(r, simd_sub_32(sisd_from_int(64), shft)), 8)))

/* Whole-register ("single data") aliases. */
#define sisd_sll(r, shft) simd_sll_128(r, shft)
#define sisd_srl(r, shft) simd_srl_128(r, shft)
#define sisd_slli(r, shft) simd_slli_128(r, shft)
#define sisd_srli(r, shft) simd_srli_128(r, shft)
/* NOTE(review): simd_add_128/simd_sub_128 are not defined anywhere in
   this copy of the file -- confirm they are supplied by an including
   header before relying on sisd_add/sisd_sub. */
#define sisd_add(a, b) simd_add_128(a, b)
#define sisd_sub(a, b) simd_sub_128(a, b)
/* 128-bit loads and stores.  addr is a SIMD_type pointer. */
#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
/* Unaligned load: use SSE3 lddqu when available, otherwise movdqu.
   (The original text defined sisd_load_unaligned twice, unconditionally;
   the second definition requires SSE3, so guard it.) */
#if defined(__SSE3__)
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#else
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif
/* Broadcast the constant n into every n-bit-wide field.
   All macro arguments are parenthesized (the original simd_const_2 and
   simd_const_1 used bare `n`, an expansion-precedence hazard). */
#define simd_const_32(n) _mm_set1_epi32(n)
#define simd_const_16(n) _mm_set1_epi16(n)
#define simd_const_8(n) _mm_set1_epi8(n)
#define simd_const_4(n) _mm_set1_epi8(((n)<<4)|(n))
#define simd_const_2(n) simd_const_4(((n)<<2)|(n))
/* 1-bit fields: only 0 or 1 are representable, i.e. all-zeros/all-ones. */
#define simd_const_1(n) \
  ((n)==0 ? simd_const_8(0) : simd_const_8(-1))

/* Pack 16-bit fields to 8 bits taking the low (ll) or high (hh) byte
   of each field. */
#define simd_pack_16_ll(a, b) simd_pack_16(a, b)
#define simd_pack_16_hh(a, b) \
  simd_pack_16(simd_srli_16(a, 8), simd_srli_16(b, 8))
143
SIMD_type simd_add_2(SIMD_type a, SIMD_type b)
145
SIMD_type c1 = simd_xor(a,b);
146
SIMD_type borrow = simd_and(a,b);
147
SIMD_type c2 = simd_xor(c1,(sisd_slli(borrow,1)));
148
return simd_if(simd_himask_2,c2,c1);
/* Parallel add on 4-bit fields: perform 8-bit adds separately on the
   masked high and low nibbles, then recombine, so carries never cross
   a nibble boundary. */
#define simd_add_4(a, b) \
  simd_if(simd_himask_8, \
          simd_add_8(simd_and(a, simd_himask_8), simd_and(b, simd_himask_8)), \
          simd_add_8(simd_andc(a, simd_himask_8), simd_andc(b, simd_himask_8)))

/* Shifts within 2-, 4- and 8-bit fields: shift the whole register as
   32-bit fields, then mask away any bits that crossed into a
   neighboring small field. */
#define simd_srli_2(r, sh) \
  simd_and(simd_srli_32(r, sh), simd_const_2(3 >> (sh)))
#define simd_srli_4(r, sh) \
  simd_and(simd_srli_32(r, sh), simd_const_4(15 >> (sh)))
#define simd_srli_8(r, sh) \
  simd_and(simd_srli_32(r, sh), simd_const_8(255 >> (sh)))

#define simd_slli_2(r, sh) \
  simd_and(simd_slli_32(r, sh), simd_const_2((3 << (sh)) & 3))
#define simd_slli_4(r, sh) \
  simd_and(simd_slli_32(r, sh), simd_const_4((15 << (sh)) & 15))
#define simd_slli_8(r, sh) \
  simd_and(simd_slli_32(r, sh), simd_const_8((255 << (sh)) & 255))
/* Interleave ("merge") for sub-byte fields, synthesized from the next
   wider merge: each operand's fields are first repositioned so that
   after the wider merge, fields from a and b alternate correctly. */
#define simd_mergeh_4(a,b) \
  simd_mergeh_8(simd_if(simd_himask_8, a, simd_srli_8(b, 4)), \
                simd_if(simd_himask_8, simd_slli_8(a, 4), b))
#define simd_mergel_4(a,b) \
  simd_mergel_8(simd_if(simd_himask_8, a, simd_srli_8(b, 4)), \
                simd_if(simd_himask_8, simd_slli_8(a, 4), b))
#define simd_mergeh_2(a,b) \
  simd_mergeh_4(simd_if(simd_himask_4, a, simd_srli_4(b, 2)), \
                simd_if(simd_himask_4, simd_slli_4(a, 2), b))
#define simd_mergel_2(a,b) \
  simd_mergel_4(simd_if(simd_himask_4, a, simd_srli_4(b, 2)), \
                simd_if(simd_himask_4, simd_slli_4(a, 2), b))
#define simd_mergeh_1(a,b) \
  simd_mergeh_2(simd_if(simd_himask_2, a, simd_srli_2(b, 1)), \
                simd_if(simd_himask_2, simd_slli_2(a, 1), b))
#define simd_mergel_1(a,b) \
  simd_mergel_2(simd_if(simd_himask_2, a, simd_srli_2(b, 1)), \
                simd_if(simd_himask_2, simd_slli_2(a, 1), b))
192
#define sisd_to_int(x) _mm_cvtsi128_si32(x)
194
#define sisd_from_int(n) _mm_cvtsi32_si128(n)
196
static inline int simd_all_true_8(SIMD_type v) {
197
return _mm_movemask_epi8(v) == 0xFFFF;
200
static inline int simd_any_true_8(SIMD_type v) {
201
return _mm_movemask_epi8(v) != 0;
204
static inline int simd_any_sign_bit_8(SIMD_type v) {
205
return _mm_movemask_epi8(v) != 0;
208
#define simd_all_eq_8(v1, v2) simd_all_true_8(_mm_cmpeq_epi8(v1, v2))
209
#define simd_all_le_8(v1, v2) \
210
simd_all_eq_8(simd_max_8(v1, v2), v2)
212
#define simd_all_signed_gt_8(v1, v2) simd_all_true_8(_mm_cmpgt_epi8(v1, v2))
214
#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
216
static inline int bitblock_has_bit(SIMD_type v) {
217
return !simd_all_true_8(simd_eq_8(v, simd_const_8(0)));
/* Extract bit n of a 128-bit block as 0 or 1, by shifting it to the
   top of the register and back down to bit 0.
   NOTE(review): BLOCKSIZE is not defined in this copy of the file;
   presumably the including code defines it as 128 -- confirm. */
#define bitblock_test_bit(blk, n) \
  sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))

/* Pack to half-width fields, built up recursively on simd_pack_16:
   at each level the retained bit(s) of every other field are shifted
   into position before delegating to the next wider pack. */
#define simd_pack_2(a,b) \
  simd_pack_4(simd_if(simd_himask_2, sisd_srli(a, 1), a), \
              simd_if(simd_himask_2, sisd_srli(b, 1), b))
#define simd_pack_4(a,b) \
  simd_pack_8(simd_if(simd_himask_4, sisd_srli(a, 2), a), \
              simd_if(simd_himask_4, sisd_srli(b, 2), b))
#define simd_pack_8(a,b) \
  simd_pack_16(simd_if(simd_himask_8, sisd_srli(a, 4), a), \
               simd_if(simd_himask_8, sisd_srli(b, 4), b))
/* Half-operand modified adds: simd_add_W_xy(v1, v2) adds W-bit fields
   after applying a modifier to each operand -- x: operand as-is,
   l: low half of each W-bit field (high half masked off),
   h: high half of each W-bit field shifted down.
   When both operands are reduced to half-width values (ll/lh/hl/hh for
   W <= 8), a plain 8-bit add suffices since no field can overflow.
   Every definition is #ifndef-guarded so a target-specific header may
   pre-define a better version. */
#ifndef simd_add_2_xx
#define simd_add_2_xx(v1, v2) simd_add_2(v1, v2)
#endif
#ifndef simd_add_2_xl
#define simd_add_2_xl(v1, v2) simd_add_2(v1, simd_andc(v2, simd_himask_2))
#endif
#ifndef simd_add_2_xh
#define simd_add_2_xh(v1, v2) simd_add_2(v1, simd_srli_2(v2, 1))
#endif
#ifndef simd_add_2_lx
#define simd_add_2_lx(v1, v2) simd_add_2(simd_andc(v1, simd_himask_2), v2)
#endif
#ifndef simd_add_2_ll
#define simd_add_2_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_andc(v2, simd_himask_2))
#endif
#ifndef simd_add_2_lh
#define simd_add_2_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_2), simd_srli_2(v2, 1))
#endif
#ifndef simd_add_2_hx
#define simd_add_2_hx(v1, v2) simd_add_2(simd_srli_2(v1, 1), v2)
#endif
#ifndef simd_add_2_hl
#define simd_add_2_hl(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_andc(v2, simd_himask_2))
#endif
#ifndef simd_add_2_hh
#define simd_add_2_hh(v1, v2) simd_add_8(simd_srli_2(v1, 1), simd_srli_2(v2, 1))
#endif

#ifndef simd_add_4_xx
#define simd_add_4_xx(v1, v2) simd_add_4(v1, v2)
#endif
#ifndef simd_add_4_xl
#define simd_add_4_xl(v1, v2) simd_add_4(v1, simd_andc(v2, simd_himask_4))
#endif
#ifndef simd_add_4_xh
#define simd_add_4_xh(v1, v2) simd_add_4(v1, simd_srli_4(v2, 2))
#endif
#ifndef simd_add_4_lx
#define simd_add_4_lx(v1, v2) simd_add_4(simd_andc(v1, simd_himask_4), v2)
#endif
#ifndef simd_add_4_ll
#define simd_add_4_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_andc(v2, simd_himask_4))
#endif
#ifndef simd_add_4_lh
#define simd_add_4_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_4), simd_srli_4(v2, 2))
#endif
#ifndef simd_add_4_hx
#define simd_add_4_hx(v1, v2) simd_add_4(simd_srli_4(v1, 2), v2)
#endif
#ifndef simd_add_4_hl
#define simd_add_4_hl(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_andc(v2, simd_himask_4))
#endif
#ifndef simd_add_4_hh
#define simd_add_4_hh(v1, v2) simd_add_8(simd_srli_4(v1, 2), simd_srli_4(v2, 2))
#endif

#ifndef simd_add_8_xx
#define simd_add_8_xx(v1, v2) simd_add_8(v1, v2)
#endif
#ifndef simd_add_8_xl
#define simd_add_8_xl(v1, v2) simd_add_8(v1, simd_andc(v2, simd_himask_8))
#endif
#ifndef simd_add_8_xh
#define simd_add_8_xh(v1, v2) simd_add_8(v1, simd_srli_8(v2, 4))
#endif
#ifndef simd_add_8_lx
#define simd_add_8_lx(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), v2)
#endif
#ifndef simd_add_8_ll
#define simd_add_8_ll(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_andc(v2, simd_himask_8))
#endif
#ifndef simd_add_8_lh
#define simd_add_8_lh(v1, v2) simd_add_8(simd_andc(v1, simd_himask_8), simd_srli_8(v2, 4))
#endif
#ifndef simd_add_8_hx
#define simd_add_8_hx(v1, v2) simd_add_8(simd_srli_8(v1, 4), v2)
#endif
#ifndef simd_add_8_hl
#define simd_add_8_hl(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_andc(v2, simd_himask_8))
#endif
#ifndef simd_add_8_hh
#define simd_add_8_hh(v1, v2) simd_add_8(simd_srli_8(v1, 4), simd_srli_8(v2, 4))
#endif

#ifndef simd_add_16_xx
#define simd_add_16_xx(v1, v2) simd_add_16(v1, v2)
#endif
#ifndef simd_add_16_xl
#define simd_add_16_xl(v1, v2) simd_add_16(v1, simd_andc(v2, simd_himask_16))
#endif
#ifndef simd_add_16_xh
#define simd_add_16_xh(v1, v2) simd_add_16(v1, simd_srli_16(v2, 8))
#endif
#ifndef simd_add_16_lx
#define simd_add_16_lx(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), v2)
#endif
#ifndef simd_add_16_ll
#define simd_add_16_ll(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_andc(v2, simd_himask_16))
#endif
#ifndef simd_add_16_lh
#define simd_add_16_lh(v1, v2) simd_add_16(simd_andc(v1, simd_himask_16), simd_srli_16(v2, 8))
#endif
#ifndef simd_add_16_hx
#define simd_add_16_hx(v1, v2) simd_add_16(simd_srli_16(v1, 8), v2)
#endif
#ifndef simd_add_16_hl
#define simd_add_16_hl(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_andc(v2, simd_himask_16))
#endif
#ifndef simd_add_16_hh
#define simd_add_16_hh(v1, v2) simd_add_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif

#ifndef simd_add_32_xx
#define simd_add_32_xx(v1, v2) simd_add_32(v1, v2)
#endif
#ifndef simd_add_32_xl
#define simd_add_32_xl(v1, v2) simd_add_32(v1, simd_andc(v2, simd_himask_32))
#endif
#ifndef simd_add_32_xh
#define simd_add_32_xh(v1, v2) simd_add_32(v1, simd_srli_32(v2, 16))
#endif
#ifndef simd_add_32_lx
#define simd_add_32_lx(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), v2)
#endif
#ifndef simd_add_32_ll
#define simd_add_32_ll(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_andc(v2, simd_himask_32))
#endif
#ifndef simd_add_32_lh
#define simd_add_32_lh(v1, v2) simd_add_32(simd_andc(v1, simd_himask_32), simd_srli_32(v2, 16))
#endif
#ifndef simd_add_32_hx
#define simd_add_32_hx(v1, v2) simd_add_32(simd_srli_32(v1, 16), v2)
#endif
#ifndef simd_add_32_hl
#define simd_add_32_hl(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_andc(v2, simd_himask_32))
#endif
#ifndef simd_add_32_hh
#define simd_add_32_hh(v1, v2) simd_add_32(simd_srli_32(v1, 16), simd_srli_32(v2, 16))
#endif

#ifndef simd_add_64_xx
#define simd_add_64_xx(v1, v2) simd_add_64(v1, v2)
#endif
#ifndef simd_add_64_xl
#define simd_add_64_xl(v1, v2) simd_add_64(v1, simd_andc(v2, simd_himask_64))
#endif
#ifndef simd_add_64_xh
#define simd_add_64_xh(v1, v2) simd_add_64(v1, simd_srli_64(v2, 32))
#endif
#ifndef simd_add_64_lx
#define simd_add_64_lx(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), v2)
#endif
#ifndef simd_add_64_ll
#define simd_add_64_ll(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_andc(v2, simd_himask_64))
#endif
#ifndef simd_add_64_lh
#define simd_add_64_lh(v1, v2) simd_add_64(simd_andc(v1, simd_himask_64), simd_srli_64(v2, 32))
#endif
#ifndef simd_add_64_hx
#define simd_add_64_hx(v1, v2) simd_add_64(simd_srli_64(v1, 32), v2)
#endif
#ifndef simd_add_64_hl
#define simd_add_64_hl(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_andc(v2, simd_himask_64))
#endif
#ifndef simd_add_64_hh
#define simd_add_64_hh(v1, v2) simd_add_64(simd_srli_64(v1, 32), simd_srli_64(v2, 32))
#endif

#ifndef simd_add_128_xx
#define simd_add_128_xx(v1, v2) simd_add_128(v1, v2)
#endif
#ifndef simd_add_128_xl
#define simd_add_128_xl(v1, v2) simd_add_128(v1, simd_andc(v2, simd_himask_128))
#endif
#ifndef simd_add_128_xh
#define simd_add_128_xh(v1, v2) simd_add_128(v1, simd_srli_128(v2, 64))
#endif
#ifndef simd_add_128_lx
#define simd_add_128_lx(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), v2)
#endif
#ifndef simd_add_128_ll
#define simd_add_128_ll(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_andc(v2, simd_himask_128))
#endif
#ifndef simd_add_128_lh
#define simd_add_128_lh(v1, v2) simd_add_128(simd_andc(v1, simd_himask_128), simd_srli_128(v2, 64))
#endif
#ifndef simd_add_128_hx
#define simd_add_128_hx(v1, v2) simd_add_128(simd_srli_128(v1, 64), v2)
#endif
#ifndef simd_add_128_hl
#define simd_add_128_hl(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_andc(v2, simd_himask_128))
#endif
#ifndef simd_add_128_hh
#define simd_add_128_hh(v1, v2) simd_add_128(simd_srli_128(v1, 64), simd_srli_128(v2, 64))
#endif
/* Half-operand modified packs: simd_pack_W_xy(v1, v2) packs W-bit
   fields to half width after applying a modifier to each operand --
   x: as-is, l: low half (a no-op here, since packing keeps the low
   half anyway), h: high half shifted down first.  The shifts use
   simd_srli_16 because the pack chain masks stray bits afterwards.
   #ifndef-guarded so a target-specific header may override. */
#ifndef simd_pack_2_xx
#define simd_pack_2_xx(v1, v2) simd_pack_2(v1, v2)
#endif
#ifndef simd_pack_2_xl
#define simd_pack_2_xl(v1, v2) simd_pack_2(v1, v2)
#endif
#ifndef simd_pack_2_xh
#define simd_pack_2_xh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif
#ifndef simd_pack_2_lx
#define simd_pack_2_lx(v1, v2) simd_pack_2(v1, v2)
#endif
#ifndef simd_pack_2_ll
#define simd_pack_2_ll(v1, v2) simd_pack_2(v1, v2)
#endif
#ifndef simd_pack_2_lh
#define simd_pack_2_lh(v1, v2) simd_pack_2(v1, simd_srli_16(v2, 1))
#endif
#ifndef simd_pack_2_hx
#define simd_pack_2_hx(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif
#ifndef simd_pack_2_hl
#define simd_pack_2_hl(v1, v2) simd_pack_2(simd_srli_16(v1, 1), v2)
#endif
#ifndef simd_pack_2_hh
#define simd_pack_2_hh(v1, v2) simd_pack_2(simd_srli_16(v1, 1), simd_srli_16(v2, 1))
#endif

#ifndef simd_pack_4_xx
#define simd_pack_4_xx(v1, v2) simd_pack_4(v1, v2)
#endif
#ifndef simd_pack_4_xl
#define simd_pack_4_xl(v1, v2) simd_pack_4(v1, v2)
#endif
#ifndef simd_pack_4_xh
#define simd_pack_4_xh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif
#ifndef simd_pack_4_lx
#define simd_pack_4_lx(v1, v2) simd_pack_4(v1, v2)
#endif
#ifndef simd_pack_4_ll
#define simd_pack_4_ll(v1, v2) simd_pack_4(v1, v2)
#endif
#ifndef simd_pack_4_lh
#define simd_pack_4_lh(v1, v2) simd_pack_4(v1, simd_srli_16(v2, 2))
#endif
#ifndef simd_pack_4_hx
#define simd_pack_4_hx(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif
#ifndef simd_pack_4_hl
#define simd_pack_4_hl(v1, v2) simd_pack_4(simd_srli_16(v1, 2), v2)
#endif
#ifndef simd_pack_4_hh
#define simd_pack_4_hh(v1, v2) simd_pack_4(simd_srli_16(v1, 2), simd_srli_16(v2, 2))
#endif

#ifndef simd_pack_8_xx
#define simd_pack_8_xx(v1, v2) simd_pack_8(v1, v2)
#endif
#ifndef simd_pack_8_xl
#define simd_pack_8_xl(v1, v2) simd_pack_8(v1, v2)
#endif
#ifndef simd_pack_8_xh
#define simd_pack_8_xh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif
#ifndef simd_pack_8_lx
#define simd_pack_8_lx(v1, v2) simd_pack_8(v1, v2)
#endif
#ifndef simd_pack_8_ll
#define simd_pack_8_ll(v1, v2) simd_pack_8(v1, v2)
#endif
#ifndef simd_pack_8_lh
#define simd_pack_8_lh(v1, v2) simd_pack_8(v1, simd_srli_16(v2, 4))
#endif
#ifndef simd_pack_8_hx
#define simd_pack_8_hx(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif
#ifndef simd_pack_8_hl
#define simd_pack_8_hl(v1, v2) simd_pack_8(simd_srli_16(v1, 4), v2)
#endif
#ifndef simd_pack_8_hh
#define simd_pack_8_hh(v1, v2) simd_pack_8(simd_srli_16(v1, 4), simd_srli_16(v2, 4))
#endif

#ifndef simd_pack_16_xx
#define simd_pack_16_xx(v1, v2) simd_pack_16(v1, v2)
#endif
#ifndef simd_pack_16_xl
#define simd_pack_16_xl(v1, v2) simd_pack_16(v1, v2)
#endif
#ifndef simd_pack_16_xh
#define simd_pack_16_xh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif
#ifndef simd_pack_16_lx
#define simd_pack_16_lx(v1, v2) simd_pack_16(v1, v2)
#endif
#ifndef simd_pack_16_ll
#define simd_pack_16_ll(v1, v2) simd_pack_16(v1, v2)
#endif
#ifndef simd_pack_16_lh
#define simd_pack_16_lh(v1, v2) simd_pack_16(v1, simd_srli_16(v2, 8))
#endif
#ifndef simd_pack_16_hx
#define simd_pack_16_hx(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif
#ifndef simd_pack_16_hl
#define simd_pack_16_hl(v1, v2) simd_pack_16(simd_srli_16(v1, 8), v2)
#endif
#ifndef simd_pack_16_hh
#define simd_pack_16_hh(v1, v2) simd_pack_16(simd_srli_16(v1, 8), simd_srli_16(v2, 8))
#endif
632
// Splat the first 16-bit int into all positions.
633
static inline SIMD_type simd_splat_16(SIMD_type x) {
634
SIMD_type t = _mm_shufflelo_epi16(x,0);
635
return _mm_shuffle_epi32(t,0);
638
// Splat the first 32-bit int into all positions.
639
static inline SIMD_type simd_splat_32(SIMD_type x) {
640
return _mm_shuffle_epi32(x,0);
644
/* Debug print of a 128-bit block, byte by byte.
   NOTE(review): this copy is incomplete -- the loop body is missing,
   `i` has no visible declaration, and elems[8] covers only half of a
   16-byte SIMD_type while the loop runs to sizeof(SIMD_type); the
   union member should presumably be elems[16].  Verify against the
   full original source before touching the code. */
void print_bit_block(char * var_name, SIMD_type v) {
645
union {SIMD_type vec; unsigned char elems[8];} x;
647
unsigned char c, bit_reversed;
649
printf("%20s = ", var_name);
650
for (i = 0; i < sizeof(SIMD_type); i++) {
657
/* Population count of the 128-bit block v.
   Strategy: pairwise-sum bits into 2-bit counts, then 4-bit, then
   8-bit counts; _mm_sad_epu8 (PSADBW) against zero then sums the
   byte counts of each 64-bit lane; finally the high lane is folded
   into the low lane and extracted.
   NOTE(review): the closing brace of this function lies outside this
   excerpt of the file. */
static inline int bitblock_bit_count(SIMD_type v) {
659
SIMD_type cts_2 = simd_add_2_lh(v, v);
660
SIMD_type cts_4 = simd_add_4_lh(cts_2, cts_2);
661
SIMD_type cts_8 = simd_add_8_lh(cts_4, cts_4);
662
SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd_const_8(0));
663
/* SIMD_type cts_128 = simd_add_128_lh(cts_64, cts_64) */;
664
/* Fold the two 64-bit partial counts together. */
SIMD_type cts_128 = simd_add_64(cts_64, sisd_srli(cts_64,64));
665
return (int) sisd_to_int(cts_128);