/* Idealized SIMD Operations with SSE versions
   Copyright (C) 2006, 2007, 2008, Robert D. Cameron and Dan Lin
   Licensed to the public under the Open Software License 3.0.
   Licensed to International Characters Inc.
   under the Academic Free License version 3.0.
*/
/*------------------------------------------------------------*/
16
/* Endianness and word-size configuration.  Guarded so we do not clash
 * with the system definitions from <endian.h>/<machine/endian.h>. */
#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN 1234
#endif
#ifndef BIG_ENDIAN
#define BIG_ENDIAN 4321
#endif
#ifndef BYTE_ORDER
#define BYTE_ORDER LITTLE_ENDIAN
#endif

/* Number of bits in an unsigned long. */
#define LONG_BIT (8 * sizeof(unsigned long))

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <pmmintrin.h>  /* SSE3 intrinsics (_mm_lddqu_si128) */

/* All idealized SIMD operations work on a single 128-bit XMM register. */
typedef __m128i SIMD_type;
29
/*------------------------------------------------------------*/
/* I. SIMD bitwise logical operations */
/* Bitwise AND of two 128-bit values. */
static inline SIMD_type simd_and(SIMD_type b1, SIMD_type b2) {
	return _mm_and_si128(b1, b2);
}
/* Bitwise AND-complement: b1 & ~b2.
 * Note _mm_andnot_si128 complements its FIRST operand, hence the swap. */
static inline SIMD_type simd_andc(SIMD_type b1, SIMD_type b2) {
	return _mm_andnot_si128(b2, b1);
}
/* Bitwise OR of two 128-bit values. */
static inline SIMD_type simd_or(SIMD_type b1, SIMD_type b2) {
	return _mm_or_si128(b1, b2);
}
/* Bitwise XOR of two 128-bit values. */
static inline SIMD_type simd_xor(SIMD_type b1, SIMD_type b2) {
	return _mm_xor_si128(b1, b2);
}
/* Bitwise complement: XOR against an all-ones register. */
static inline SIMD_type simd_not(SIMD_type b) {
	return simd_xor(b, _mm_set1_epi32(0xFFFFFFFF));
}
/* Bitwise NOR: ~(b1 | b2). */
static inline SIMD_type simd_nor(SIMD_type b1, SIMD_type b2) {
	return simd_not(simd_or(b1,b2));
}
/* Bitwise select: for each bit, take then_val where cond is 1,
 * else_val where cond is 0. */
static inline SIMD_type simd_if(SIMD_type cond, SIMD_type then_val, SIMD_type else_val) {
	return simd_or(simd_and(then_val, cond), simd_andc(else_val, cond));
}
/*------------------------------------------------------------*/
56
/* II. Declarations of field-width based operations. */
58
/* Half-operand modifier specifications use "x", "h" or "l",
59
* "x" - no modification of the corresponding operand value
60
* "h" - each n-bit field is modified by taking the high n/2 bits.
61
* "l" - each n-bit field is modified by taking the low n/2 bits. */
65
/* simd<fw> is a template struct providing all the simd operations
66
* for a given field width. */
69
/* The himask selector in which each field is fw/2 1 bits,
70
* followed by fw/2 0 bits. */
71
static inline SIMD_type himask();
73
/* Splat constant generator with compile-time constant. */
74
template <int v> static inline SIMD_type constant();
75
/* Splat generator using the first field of a register. */
76
static inline SIMD_type splat(SIMD_type r);
78
/* Shift immediate with the shift constant as a template parameter. */
79
template <int shft> static inline SIMD_type srli(SIMD_type r);
80
template <int shft> static inline SIMD_type slli(SIMD_type r);
81
template <int shft> static inline SIMD_type srai(SIMD_type r);
83
/* Shift operations with register-specified shift values. */
84
static inline SIMD_type srl(SIMD_type r, SIMD_type shft);
85
static inline SIMD_type sll(SIMD_type r, SIMD_type shft);
87
/* Binary operations. */
88
static inline SIMD_type add(SIMD_type r1, SIMD_type r2);
89
static inline SIMD_type sub(SIMD_type r1, SIMD_type r2);
90
static inline SIMD_type mult(SIMD_type r1, SIMD_type r2);
91
static inline SIMD_type max(SIMD_type r1, SIMD_type r2);
92
static inline SIMD_type eq(SIMD_type r1, SIMD_type r2);
93
static inline SIMD_type gt(SIMD_type r1, SIMD_type r2);
94
static inline SIMD_type pack(SIMD_type r1, SIMD_type r2);
95
static inline SIMD_type mergeh(SIMD_type r1, SIMD_type r2);
96
static inline SIMD_type mergel(SIMD_type r1, SIMD_type r2);
98
// /* Functions for half-operand modification. */
100
// template <HOM_t m> static inline SIMD_type hom(SIMD_type r);
101
// template <HOM_t m> static inline SIMD_type hx(SIMD_type r);
103
/* Binary operations with half-operand modifiers */
105
template <HOM_t m1, HOM_t m2> static inline SIMD_type add(SIMD_type r1, SIMD_type r2);
106
template <HOM_t m1, HOM_t m2> static inline SIMD_type sub(SIMD_type r1, SIMD_type r2);
107
template <HOM_t m1, HOM_t m2> static inline SIMD_type mult(SIMD_type r1, SIMD_type r2);
108
template <HOM_t m1, HOM_t m2> static inline SIMD_type pack(SIMD_type r1, SIMD_type r2);
109
template <HOM_t m1, HOM_t m2> static inline SIMD_type mergeh(SIMD_type r1, SIMD_type r2);
110
template <HOM_t m1, HOM_t m2> static inline SIMD_type mergel(SIMD_type r1, SIMD_type r2);
113
#define sisd_to_int(x) _mm_cvtsi128_si32(x)
115
#define sisd_from_int(n) _mm_cvtsi32_si128(n)
120
/* III. Implementations of simd<fw> operations. */
122
/* Constant generator functions for various field widths. */
124
template<> inline SIMD_type simd<2>::himask() {return _mm_set1_epi8(0xAA);}
126
template<> inline SIMD_type simd<4>::himask() {return _mm_set1_epi8(0xCC);}
128
template<> inline SIMD_type simd<8>::himask() {return _mm_set1_epi8(0xF0);}
130
template<> inline SIMD_type simd<16>::himask() {return _mm_set1_epi16(0xFF00);}
132
template<> inline SIMD_type simd<32>::himask() {return _mm_set1_epi32(0xFFFF0000);}
134
template<> inline SIMD_type simd<64>::himask() {return _mm_set_epi32(-1,0,-1,0);}
136
template<> inline SIMD_type simd<128>::himask() {return _mm_set_epi32(-1,-1,0,0);}
138
template<> template <int n> inline SIMD_type simd<4>::constant() {return _mm_set1_epi8((n)<<4|(n));}
140
template<> template <int n> inline SIMD_type simd<8>::constant() {return _mm_set1_epi8(n);}
142
template<> template <int n> inline SIMD_type simd<16>::constant() {return _mm_set1_epi16(n);}
144
template<> template <int n> inline SIMD_type simd<32>::constant() {return _mm_set1_epi32(n);}
146
template<> template <> inline SIMD_type simd<1>::constant<0>() {return simd<8>::constant<0>();}
147
template<> template <> inline SIMD_type simd<1>::constant<1>() {return simd<8>::constant<-1>();}
149
template<> template <int n> inline SIMD_type simd<2>::constant() {return simd<4>::constant<(n<<2|n)>();}
151
// Splat the first 16-bit int into all positions.
152
template <> inline SIMD_type simd<16>::splat(SIMD_type x) {
153
SIMD_type t = _mm_shufflelo_epi16(x,0);
154
return _mm_shuffle_epi32(t,0);
157
// Splat the first 32-bit int into all positions.
158
template <> inline SIMD_type simd<32>::splat(SIMD_type x) {
159
return _mm_shuffle_epi32(x,0);
162
/* Shift immediate operations with direct implementation by built-ins. */
164
template<> template<int sh> inline SIMD_type simd<16>::slli(SIMD_type r) {return _mm_slli_epi16(r, sh);}
166
template<> template<int sh> inline SIMD_type simd<32>::slli(SIMD_type r) {return _mm_slli_epi32(r, sh);}
168
template<> template<int sh> inline SIMD_type simd<64>::slli(SIMD_type r) {return _mm_slli_epi64(r, sh);}
170
template<> template<int sh> inline SIMD_type simd<16>::srli(SIMD_type r) {return _mm_srli_epi16(r, sh);}
172
template<> template<int sh> inline SIMD_type simd<32>::srli(SIMD_type r) {return _mm_srli_epi32(r, sh);}
174
template<> template<int sh> inline SIMD_type simd<64>::srli(SIMD_type r) {return _mm_srli_epi64(r, sh);}
178
template<> template<int sh> inline SIMD_type simd<16>::srai(SIMD_type r) {return _mm_srai_epi16(r, sh);}
180
template<> template<int sh> inline SIMD_type simd<32>::srai(SIMD_type r) {return _mm_srai_epi32(r, sh);}
184
/* General rules for slli/srli for field widths 2, 4, 8 in terms of 32-bit shifts. */
188
//template<int fw> template<int sh>
189
//inline SIMD_type simd<fw>::slli(SIMD_type r) {
190
// return simd_and(simd<32>::slli<sh>(r), simd<fw>::constant<6>());
195
template<> template<int sh>
196
inline SIMD_type simd<2>::slli(SIMD_type r) {
197
return simd_and(simd<32>::slli<sh>(r),simd<2>::constant<((3<<sh)&3)>());
200
template<> template<int sh>
201
inline SIMD_type simd<4>::slli(SIMD_type r) {
202
return simd_and(simd<32>::slli<sh>(r),simd<4>::constant<((15<<sh)&15)>());
205
template<> template<int sh>
206
inline SIMD_type simd<8>::slli(SIMD_type r) {
207
return simd_and(simd<32>::slli<sh>(r),simd<8>::constant<((255<<sh)&255)>());
211
//template<int fw> template<int sh>
212
//inline SIMD_type simd<fw>::srli(SIMD_type r) {
213
// return simd_and(simd<32>::srli<sh>(r),simd<fw>::constant<((1<<(fw-sh))-1)>());
218
template<> template<int sh>
219
inline SIMD_type simd<2>::srli(SIMD_type r) {
220
return simd_and(simd<32>::srli<sh>(r),simd<2>::constant<(3>>sh)>());
223
template<> template<int sh>
224
inline SIMD_type simd<4>::srli(SIMD_type r) {
225
return simd_and(simd<32>::srli<sh>(r),simd<4>::constant<(15>>sh)>());
228
template<> template<int sh>
229
inline SIMD_type simd<8>::srli(SIMD_type r) {
230
return simd_and(simd<32>::srli<sh>(r),simd<8>::constant<(255>>sh)>());
236
/* Shift immediate for 128-bit fields */
238
template<> template<int shft>
239
inline SIMD_type simd<128>::slli(SIMD_type r) {
240
return (shft % 8 == 0 ? _mm_slli_si128(r, shft/8) :
241
shft >= 64 ? simd<64>::slli<shft-64>(_mm_slli_si128(r, 8)) :
242
simd_or(simd<64>::slli<shft>(r), _mm_slli_si128(simd<64>::srli<64-shft>(r), 8)));
245
template<> template<int shft>
246
inline SIMD_type simd<128>::srli(SIMD_type r) {
247
return (shft % 8 == 0 ? _mm_srli_si128(r, shft/8) :
248
shft >= 64 ? simd<64>::srli<shft-64>(_mm_srli_si128(r, 8)) :
249
simd_or(simd<64>::srli<shft>(r), _mm_srli_si128(simd<64>::slli<64-shft>(r), 8)));
253
/* Shifts with shift values specified in an operand register. */
256
inline SIMD_type simd<128>::srl(SIMD_type r, SIMD_type shft) {
257
return simd_or(_mm_srl_epi64(r, shft),
258
simd_or(_mm_srli_si128(_mm_srl_epi64(r, _mm_sub_epi32(shft, sisd_from_int(64))), 8),
259
_mm_srli_si128(_mm_sll_epi64(r, _mm_sub_epi32(sisd_from_int(64), shft)), 8)));
263
inline SIMD_type simd<128>::sll(SIMD_type r, SIMD_type shft) {
264
return simd_or(_mm_sll_epi64(r, shft),
265
simd_or(_mm_slli_si128(_mm_sll_epi64(r, _mm_sub_epi32(shft, sisd_from_int(64))), 8),
266
_mm_slli_si128(_mm_srl_epi64(r, _mm_sub_epi32(sisd_from_int(64), shft)), 8)));
270
inline SIMD_type simd<64>::srl(SIMD_type r, SIMD_type shft) {
271
return simd_if(simd<128>::himask(),
272
_mm_srl_epi64(r, _mm_srli_si128(shft, 8)),
273
_mm_srl_epi64(r, simd_andc(shft, simd<128>::himask())));
277
inline SIMD_type simd<64>::sll(SIMD_type r, SIMD_type shft) {
278
return simd_if(simd<128>::himask(),
279
_mm_sll_epi64(r, _mm_srli_si128(shft, 8)),
280
_mm_sll_epi64(r, simd_andc(shft, simd<128>::himask())));
287
Use built-ins for 8, 16, 32, 64, simulations for 2, 4. */
289
template<> inline SIMD_type simd<8>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi8(r1, r2);}
291
template<> inline SIMD_type simd<16>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi16(r1, r2);}
293
template<> inline SIMD_type simd<32>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi32(r1, r2);}
295
template<> inline SIMD_type simd<64>::add(SIMD_type r1, SIMD_type r2) {return _mm_add_epi64(r1, r2);}
298
inline SIMD_type simd<2>::add(SIMD_type r1, SIMD_type r2) {
299
SIMD_type c1 = simd_xor(r1,r2);
300
SIMD_type borrow = simd_and(r1,r2);
301
SIMD_type c2 = simd_xor(c1,(simd<128>::slli<1>(borrow)));
302
return simd_if(simd<2>::himask(),c2,c1);
306
SIMD_type simd<4>::add(SIMD_type r1, SIMD_type r2) {
307
return simd_if(simd<8>::himask(),
308
simd<8>::add(r1,simd_and(r2,simd<8>::himask())),
309
simd<8>::add(r1, r2));
315
Use built-ins for 8, 16, 32, 64, simulations for 2, 4. */
317
template<> inline SIMD_type simd<8>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi8(r1, r2);}
319
template<> inline SIMD_type simd<16>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi16(r1, r2);}
321
template<> inline SIMD_type simd<32>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi32(r1, r2);}
323
template<> inline SIMD_type simd<64>::sub(SIMD_type r1, SIMD_type r2) {return _mm_sub_epi64(r1, r2);}
327
inline SIMD_type simd<2>::sub(SIMD_type r1, SIMD_type r2)
329
SIMD_type c1 = simd_xor(r1,r2);
330
SIMD_type borrow = simd_andc(r2,r1);
331
SIMD_type c2 = simd_xor(c1,(simd<128>::slli<1>(borrow)));
332
return simd_if(simd<2>::himask(),c2,c1);
336
inline SIMD_type simd<4>::sub(SIMD_type r1, SIMD_type r2){
337
return simd_if(simd<8>::himask(),
338
simd<8>::sub(r1, simd_and(r2,simd<8>::himask())),
339
simd<8>::sub(r1, r2));
342
/* simd_mult for 16 bits only. */
344
template<> inline SIMD_type simd<16>::mult(SIMD_type r1, SIMD_type r2) {return _mm_mullo_epi16(r1, r2);}
346
/* simd_max for 8 bits only. */
348
template<> inline SIMD_type simd<8>::max(SIMD_type r1, SIMD_type r2) {return _mm_max_epu8(r1, r2);}
354
template<> inline SIMD_type simd<8>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi8(r1, r2);}
356
template<> inline SIMD_type simd<16>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi16(r1, r2);}
358
template<> inline SIMD_type simd<32>::eq(SIMD_type r1, SIMD_type r2) {return _mm_cmpeq_epi32(r1, r2);}
365
/* Built-in operation for fw = 16. */
367
inline SIMD_type simd<16>::pack(SIMD_type r1, SIMD_type r2) {
368
return _mm_packus_epi16(simd_andc(r2, simd<16>::himask()), simd_andc(r1, simd<16>::himask()));
373
inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
374
return simd<fw*2>::pack(simd_if(simd<fw>::himask(),simd<128>::srli<fw/2>(r1),r1),
375
simd_if(simd<fw>::himask(),simd<128>::srli<fw/2>(r2),r2));
379
* fw: 1,2,4,8,16,32,64*/
381
inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
383
return simd<fw*2>::mergeh(simd_if(simd<fw*2>::himask(),r1,simd<fw*2>::srli<fw>(r2)),
384
simd_if(simd<fw*2>::himask(),simd<fw*2>::slli<fw>(r1),r2));
387
template<> inline SIMD_type simd<8>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi8(r2, r1);}
388
template<> inline SIMD_type simd<16>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi16(r2, r1);}
389
template<> inline SIMD_type simd<32>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi32(r2, r1);}
390
template<> inline SIMD_type simd<64>::mergeh(SIMD_type r1, SIMD_type r2) {return _mm_unpackhi_epi64(r2, r1);}
394
* fw: 1,2,4,8,16,32,64*/
396
inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
398
return simd<fw*2>::mergel(simd_if(simd<fw*2>::himask(),r1,simd<fw*2>::srli<fw>(r2)),
399
simd_if(simd<fw*2>::himask(),simd<fw*2>::slli<fw>(r1),r2));
402
template<> inline SIMD_type simd<8>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi8(r2, r1);}
403
template<> inline SIMD_type simd<16>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi16(r2, r1);}
404
template<> inline SIMD_type simd<32>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi32(r2, r1);}
405
template<> inline SIMD_type simd<64>::mergel(SIMD_type r1, SIMD_type r2) {return _mm_unpacklo_epi64(r2, r1);}
411
/* Byte-wise comparison predicates over the full register.
 * NOTE(review): the originals referenced undefined names simd_max_8
 * and simd_all_true_8; rewritten in terms of the entities actually
 * defined in this file — confirm no other translation unit supplies
 * those names. */
#define simd_all_eq_8(v1, v2) simd_all_true<8>(_mm_cmpeq_epi8(v1, v2))
#define simd_all_le_8(v1, v2) simd_all_eq_8(simd<8>::max(v1, v2), v2)
#define simd_all_signed_gt_8(v1, v2) simd_all_true<8>(_mm_cmpgt_epi8(v1, v2))
#define simd_cmpgt_8(v1,v2) _mm_cmpgt_epi8(v1, v2)
423
static inline int simd_all_true(SIMD_type r);
425
static inline int simd_all_true<8>(SIMD_type r) {
426
return _mm_movemask_epi8(r) == 0xFFFF;
432
static inline int simd_any_true(SIMD_type r);
434
static inline int simd_any_true<8>(SIMD_type r) {
435
return _mm_movemask_epi8(r) != 0;
441
static inline int simd_any_sign_bit(SIMD_type r);
443
static inline int simd_any_sign_bit<8>(SIMD_type r) {
444
return _mm_movemask_epi8(r) != 0;
449
/* IV. Half operand modifiers - implementations. */
450
/* Half operand modifier functions.*/
452
/* Half operand modifier*/
453
/* Half operand modifier*/
454
template <int fw, HOM_t m>
456
static inline SIMD_type hom(SIMD_type r) {}
461
static inline SIMD_type hom(SIMD_type r) {return r;}
462
static inline SIMD_type l2x(SIMD_type r) {return r;}
467
static inline SIMD_type hom(SIMD_type r) {return simd_andc(r, simd<fw>::himask());}
468
static inline SIMD_type l2x(SIMD_type r) {return r;}
472
//struct SIMD<fw, h> {
473
// static inline SIMD_type hom(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
474
// static inline SIMD_type l2x(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
479
static inline SIMD_type hom(SIMD_type r) {return simd<2>::srli<1>(r);}
480
static inline SIMD_type l2x(SIMD_type r) {return simd<2>::srli<1>(r);}
485
static inline SIMD_type hom(SIMD_type r) {return simd<4>::srli<2>(r);}
486
static inline SIMD_type l2x(SIMD_type r) {return simd<4>::srli<2>(r);}
491
static inline SIMD_type hom(SIMD_type r) {return simd<8>::srli<4>(r);}
492
static inline SIMD_type l2x(SIMD_type r) {return simd<8>::srli<4>(r);}
497
static inline SIMD_type hom(SIMD_type r) {return simd<16>::srli<8>(r);}
498
static inline SIMD_type l2x(SIMD_type r) {return simd<16>::srli<8>(r);}
503
static inline SIMD_type hom(SIMD_type r) {return simd<32>::srli<16>(r);}
504
static inline SIMD_type l2x(SIMD_type r) {return simd<32>::srli<16>(r);}
508
/* SIMD operations extended with HOM*/
509
template<int fw> template <HOM_t m1, HOM_t m2>
510
inline SIMD_type simd<fw>::add(SIMD_type r1, SIMD_type r2){
511
return simd<fw>::add(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
514
template<int fw> template <HOM_t m1, HOM_t m2>
515
inline SIMD_type simd<fw>::sub(SIMD_type r1, SIMD_type r2){
516
return simd<fw>::sub(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
519
template<int fw> template <HOM_t m1, HOM_t m2>
520
inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
521
return simd<fw>::pack(SIMD<fw,m1>::l2x(r1),SIMD<fw,m2>::l2x(r2));
524
template<int fw> template <HOM_t m1, HOM_t m2>
525
inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
526
return simd<fw>::mergeh(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
529
template<int fw> template <HOM_t m1, HOM_t m2>
530
inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
531
return simd<fw>::mergel(SIMD<fw,m1>::hom(r1),SIMD<fw,m2>::hom(r2));
538
//template<int fw> SIMD_type hom(SIMD_type r) {return r;}
//template<int fw> SIMD_type l2x(SIMD_type r) {return r;}
//SIMD_type HOM<l>::hom(SIMD_type r) {return simd_andc(r, simd<fw>::himask());}
//SIMD_type HOM<h>::hom(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}
//SIMD_type HOM<h>::l2x(SIMD_type r) {return simd<fw>::srli<fw/2>(r);}

///* SIMD operations extended with Half-Operand Modifiers */
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::add(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::add(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
//}
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::sub(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::sub(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
//}
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::mult(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::mult(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>(r2));
//}
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::pack(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::pack(HOM<m1>::l2x<fw>, HOM<m2>::hom<fw>::hom(r2));
//}
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::mergeh(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::mergeh(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>::hom(r2));
//}
//template<int fw> template <HOM_t m1, HOM_t m2>
//inline SIMD_type simd<fw>::mergel(SIMD_type r1, SIMD_type r2){
//	return simd<fw>::mergel(HOM<m1>::hom<fw>, HOM<m2>::hom<fw>::hom(r2));
//}

/*------------------------------------------------------------*/
/* V. sisd operations on full 128-bit register width. */
// template <int shft> inline SIMD_type slli(SIMD_type r) {return simd<128>::slli<shft>(r);}
// template <int shft> inline SIMD_type srli(SIMD_type r) {return simd<128>::srli<shft>(r);}
// inline SIMD_type sll(SIMD_type r, SIMD_type shft) {return simd<128>::sll<shft>(r, shft);}
// inline SIMD_type srl(SIMD_type r, SIMD_type shft) {return simd<128>::srl<shft>(r, shft);}
597
/* Full-register load/store; addr must be a SIMD_type pointer. */
#define sisd_store_aligned(r, addr) _mm_store_si128(addr, r)
#define sisd_store_unaligned(r, addr) _mm_storeu_si128(addr, r)
#define sisd_load_aligned(addr) _mm_load_si128(addr)
#ifdef __SSE3__
/* SSE3 lddqu is faster for loads crossing a cache-line boundary. */
#define sisd_load_unaligned(addr) _mm_lddqu_si128(addr)
#else
#define sisd_load_unaligned(addr) _mm_loadu_si128(addr)
#endif

/* Extract bit n of a 128-bit block as 0 or 1.
 * NOTE(review): relies on sisd_srli/sisd_slli and BLOCKSIZE being
 * defined elsewhere — confirm against the including translation unit. */
#define bitblock_test_bit(blk, n) \
	sisd_to_int(sisd_srli(sisd_slli(blk, ((BLOCKSIZE-1)-(n))), BLOCKSIZE-1))
612
#if (BYTE_ORDER == BIG_ENDIAN)
613
void print_bit_block(char * var_name, SIMD_type v) {
614
union {SIMD_type vec; unsigned char elems[8];} x;
616
unsigned char c, bit_reversed;
618
printf("%20s = ", var_name);
619
for (i = 0; i < sizeof(SIMD_type); i++) {
627
#if (BYTE_ORDER == LITTLE_ENDIAN)
628
void print_bit_block(char * var_name, SIMD_type v) {
629
union {SIMD_type vec; unsigned char elems[8];} x;
631
unsigned char c, bit_reversed;
633
printf("%20s = ", var_name);
634
for (i = sizeof(SIMD_type)-1; i >= 0; i--) {
643
/* True iff any bit in the 128-bit block is set. */
static inline int bitblock_has_bit(SIMD_type v) {
	return !simd_all_true<8>(simd<8>::eq(v, simd<8>::constant<0>()));
}

/* Population count of a 128-bit block: field-doubling additions build
 * 8-bit per-byte counts, then SAD against zero sums them into two
 * 64-bit totals which are finally added together. */
static inline int bitblock_bit_count(SIMD_type v) {
	SIMD_type cts_2 = simd<2>::add<l,h>(v, v);
	SIMD_type cts_4 = simd<4>::add<l,h>(cts_2, cts_2);
	SIMD_type cts_8 = simd<8>::add<l,h>(cts_4, cts_4);
	SIMD_type cts_64 = _mm_sad_epu8(cts_8, simd<8>::constant<0>());
	/* SIMD_type cts_128 = simd<128>::add<l,h>(cts_64, cts_64) */;
	SIMD_type cts_128 = simd<64>::add(cts_64, simd<128>::srli<64>(cts_64));
	return (int) sisd_to_int(cts_128);
}