/* lib_simd.h: SIMD Library including idealized SIMD operations

   Copyright (C) 2008, Robert D. Cameron
   Licensed to the public under the Open Software License 3.0.
   Licensed to International Characters Inc.
   under the Academic Free License version 3.0.

   This file contains generic architecture-independent definitions,
   importing architecture-specific implementations from appropriate
   architecture-specific files.  */
/*------------------------------------------------------------*/
15
#include <sys/types.h>
18
#if (defined(__i386) || defined(__x86_64))
19
#ifdef TEMPLATED_SIMD_LIB
20
#include "sse_simd_t.h"
22
#ifndef TEMPLATED_SIMD_LIB
27
#include "altivec_simd.h"
30
/* Useful definitions from Linux kernel*/
33
#define likely(x) __builtin_expect((x),1)
34
#define unlikely(x) __builtin_expect((x),0)
36
/* Branch-prediction hint: tells the compiler x is expected to be nonzero.
   Function form (unlike the macro variant above) evaluates its argument
   exactly once and is type-checked.  Returns x unchanged. */
static inline long likely(long x) {
  return __builtin_expect(x, 1);
}
39
/* Branch-prediction hint: tells the compiler x is expected to be zero.
   Function form evaluates its argument exactly once and is type-checked.
   Returns x unchanged. */
static inline long unlikely(long x) {
  return __builtin_expect(x, 0);
}
46
#define unlikely(x) (x)
49
/* Shift forward and back operations, based on endianness */
50
#if BYTE_ORDER == BIG_ENDIAN
51
#define sisd_sfl(blk, n) sisd_srl(blk, n)
52
#define sisd_sbl(blk, n) sisd_sll(blk, n)
53
#define sisd_sfli(blk, n) sisd_srli(blk, n)
54
#define sisd_sbli(blk, n) sisd_slli(blk, n)
55
#define sb_op(x, n) ((x)<<(n))
56
#define sf_op(x, n) ((x)>>(n))
57
#define cfzl __builtin_clzl
59
#if BYTE_ORDER == LITTLE_ENDIAN
60
#ifdef TEMPLATED_SIMD_LIB
61
static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
62
return simd<128>::sll(blk, n);
65
static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
66
return simd<128>::srl(blk, n);
68
#define sisd_sfli(blk, n) simd<128>::slli<n>(blk)
69
#define sisd_sbli(blk, n) simd<128>::srli<n>(blk)
71
#ifndef TEMPLATED_SIMD_LIB
72
static inline SIMD_type sisd_sfl(SIMD_type blk, SIMD_type n) {
73
return sisd_sll(blk, n);
75
static inline SIMD_type sisd_sbl(SIMD_type blk, SIMD_type n) {
76
return sisd_srl(blk, n);
78
#define sisd_sfli(blk, n) sisd_slli(blk, n)
79
#define sisd_sbli(blk, n) sisd_srli(blk, n)
81
#define sb_op(x, n) ((x)>>(n))
82
#define sf_op(x, n) ((x)<<(n))
84
#define cfzl __builtin_ctzl
88
#pragma intrinsic(_BitScanForward)
89
// precondition: x > 0
90
/* Count forward (trailing) zero bits using the MSVC intrinsic.
   precondition: x > 0 (the _BitScanForward result is undefined for x == 0). */
static inline unsigned long cfzl(unsigned long x) {
  unsigned long zeroes;
  _BitScanForward(&zeroes, x);
  return zeroes;
}
99
/* Returns the position of the first 1 bit of a SIMD register in forward
   (stream) order, or 8*sizeof(SIMD_type) if the register is all zero.
   The register is examined as an array of unsigned longs through a union
   (avoids the strict-aliasing hazard of a pointer cast). */
static inline int count_forward_zeroes(SIMD_type bits) {
  union {SIMD_type vec; unsigned long elems[sizeof(SIMD_type)/sizeof(long)];} v;
  v.vec = bits;
  if (v.elems[0] != 0) return cfzl(v.elems[0]);
  else if (v.elems[1] != 0) return LONG_BIT + cfzl(v.elems[1]);
#if LONG_BIT < 64
  /* With 32-bit longs a 128-bit register has four elements;
     NOTE(review): guard reconstructed from garbled source - confirm
     against the original file. */
  else if (v.elems[2] != 0) return 2*LONG_BIT + cfzl(v.elems[2]);
  else if (v.elems[3] != 0) return 3*LONG_BIT + cfzl(v.elems[3]);
#endif
  else return 8*sizeof(SIMD_type);
}
117
/* Returns an unsigned-long-sized slice of the bit stream beginning at
   bit_posn: the pointer advances by whole bytes, and the residual
   0-7 bits are shifted out with the endianness-aware sb_op macro.
   NOTE(review): the byte-offset pointer may be unaligned for unsigned
   long loads; x86 tolerates this - confirm for other targets. */
static inline unsigned long bitstream_segment_from(SIMD_type * stream, int bit_posn) {
  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
  return sb_op(*bitstream_ptr, bit_posn % 8);
}
122
/* Scans for a 1 as long as it takes. Use a sentinel to fence.
123
Works for either endianness. */
124
/* Returns the position (relative to stream) of the first 1 bit at or
   after bit_posn.  Scans for a 1 as long as it takes; the caller must
   place a sentinel 1 past the region of interest to fence the loop.
   Works for either endianness (sb_op/cfzl are endianness-selected). */
static inline int bitstream_scan(SIMD_type * stream, int bit_posn) {
  unsigned long * bitstream_ptr = (unsigned long *) (((intptr_t) stream) + bit_posn/8);
  unsigned long bitstream_slice = sb_op(*bitstream_ptr, bit_posn % 8);
  if (bitstream_slice != 0) return bit_posn + cfzl(bitstream_slice);
  /* Current word exhausted: advance a word at a time to the sentinel. */
  do {
    bitstream_ptr++;
    bitstream_slice = *bitstream_ptr;
  } while (bitstream_slice == 0);
  int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
  return base_posn + cfzl(bitstream_slice);
}
138
/* Returns the position of the first 1 bit from the start of the stream.
   Scans word by word until a nonzero word is found; the caller must
   guarantee a sentinel 1 exists to terminate the loop. */
static inline int bitstream_scan0(SIMD_type * stream) {
  unsigned long * bitstream_ptr = (unsigned long *) stream;
  unsigned long bitstream_slice = *bitstream_ptr;
  while (bitstream_slice == 0) {
    bitstream_ptr++;
    bitstream_slice = *bitstream_ptr;
  }
  int base_posn = 8*((intptr_t) bitstream_ptr - (intptr_t) stream);
  return base_posn + cfzl(bitstream_slice);
}
151
/* Allocator for arrays of aligned SIMD data values.
   Ideally the new operator could be used to allocate arrays
   of vector data aligned on the required boundaries
   (16-byte for SSE or Altivec).  But since this alignment
   is not guaranteed except on Mac OS X, the following routine
   is provided to allocate suitably aligned storage.  */
158
/* Allocates an array of SIMD_packs SIMD_type values aligned on a
   sizeof(SIMD_type) boundary.  On allocation failure prints a
   diagnostic and exits.  Release the result with simd_delete. */
static inline SIMD_type * simd_new(size_t SIMD_packs) {
#ifdef __APPLE__
  /* Mac OS X guarantees sufficient alignment from operator new. */
  return new SIMD_type [SIMD_packs];
#endif
#ifdef _MSC_VER
  SIMD_type * v = (SIMD_type*)_aligned_malloc(sizeof(SIMD_type) * SIMD_packs, sizeof(SIMD_type));
  if (v != 0) return v;
  /* %zu matches the size_t argument; the previous %i was undefined behavior. */
  printf("Failed to allocate new array of %zu SIMD packs.\n", SIMD_packs);
  exit(-1);
#endif
#if !defined(__APPLE__) && !defined(_MSC_VER)
  SIMD_type * v;
  int rslt = posix_memalign((void **) &v,
                            sizeof(SIMD_type),
                            sizeof(SIMD_type) * SIMD_packs);
  if (rslt == 0) return v;
  printf("Failed to allocate new array of %zu SIMD packs.\n", SIMD_packs);
  exit(-1);
#endif
}
183
/* Releases storage obtained from simd_new, using the deallocator that
   matches the allocator chosen for each platform.
   NOTE(review): only the free() branch is visible in this copy; the
   __APPLE__ and _MSC_VER branches are reconstructed to mirror simd_new -
   confirm against the original source. */
static inline void simd_delete(SIMD_type * blk_ptr) {
#ifdef __APPLE__
  delete [] blk_ptr;
#endif
#ifdef _MSC_VER
  _aligned_free((void *) blk_ptr);
#endif
#if !defined(__APPLE__) && !defined(_MSC_VER)
  free((void *) blk_ptr);
#endif
}