/perf/kseta

To get this branch, use:
bzr branch http://darksoft.org/webbzr/perf/kseta

« back to all changes in this revision

Viewing changes to tutorials/4_pi/random123/array.h

  • Committer: Suren A. Chilingaryan
  • Date: 2013-10-08 23:53:50 UTC
  • Revision ID: csa@dside.dyndns.org-20131008235350-hsu8oukzkh05gtcm
Add tutorials

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
Copyright 2010-2011, D. E. Shaw Research.
 
3
All rights reserved.
 
4
 
 
5
Redistribution and use in source and binary forms, with or without
 
6
modification, are permitted provided that the following conditions are
 
7
met:
 
8
 
 
9
* Redistributions of source code must retain the above copyright
 
10
  notice, this list of conditions, and the following disclaimer.
 
11
 
 
12
* Redistributions in binary form must reproduce the above copyright
 
13
  notice, this list of conditions, and the following disclaimer in the
 
14
  documentation and/or other materials provided with the distribution.
 
15
 
 
16
* Neither the name of D. E. Shaw Research nor the names of its
 
17
  contributors may be used to endorse or promote products derived from
 
18
  this software without specific prior written permission.
 
19
 
 
20
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 
21
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 
22
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 
23
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 
24
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 
25
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 
26
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 
27
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 
28
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 
29
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 
30
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
31
*/
 
32
#ifndef _r123array_dot_h__
 
33
#define _r123array_dot_h__
 
34
#include "features/compilerfeatures.h"
 
35
#include "features/sse.h"
 
36
 
 
37
#ifndef __cplusplus
 
38
#define CXXMETHODS(_N, W, T)
 
39
#define CXXOVERLOADS(_N, W, T)
 
40
#else
 
41
 
 
42
#include <stddef.h>
 
43
#include <algorithm>
 
44
#include <stdexcept>
 
45
#include <iterator>
 
46
#include <limits>
 
47
#include <iostream>
 
48
 
 
49
/** @defgroup arrayNxW The r123arrayNxW classes 
 
50
 
 
51
    Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers.
 
52
    It is functionally equivalent to the C++0x std::array<uintW_t, N>,
 
53
    but does not require C++0x features or libraries.
 
54
 
 
55
    In addition to meeting most of the requirements of a Container,
 
56
    it also has a member function, incr(), which increments the zero-th
 
57
    element and carries overflows into higher indexed elements.  Thus,
 
58
    by using incr(), sequences of up to 2^(N*W) distinct values
 
59
    can be produced. 
 
60
 
 
61
    If SSE is supported by the compiler, then the class
 
62
    r123array1xm128i is also defined, in which the data member is an
 
63
    array of one r123128i object.
 
64
 
 
65
    @cond HIDDEN_FROM_DOXYGEN
 
66
*/
 
67
 
 
68
template <typename value_type>
 
69
inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){
 
70
    value_type v=0;
 
71
    for(size_t i=0; i<(3+sizeof(value_type))/4; ++i)
 
72
        v |= ((value_type)(*p32++)) << (32*i);
 
73
    return v;
 
74
}
 
75
 
 
76
// Work-alike methods and typedefs modeled on std::array:
 
77
/* CXXMETHODS(_N, W, T) is pasted into the body of struct r123array<_N>x<W>,
   after the data member `T v[_N]`.  It supplies the std::array-like
   typedefs, element access, iterators, comparison, fill and swap, plus
   three extras: incr(n), which adds n to the array treated as one
   multi-word counter with v[0] least significant; the static factory
   seed(ss), which fills a new array from ss.generate(); and the protected
   helper incr_carefully(n). */
#define CXXMETHODS(_N, W, T)                                            \
    /* Container typedefs, following std::array */                      \
    typedef T value_type;                                               \
    typedef value_type& reference;                                      \
    typedef const value_type& const_reference;                          \
    typedef T* pointer;                                                 \
    typedef const T* const_pointer;                                     \
    typedef T* iterator;                                                \
    typedef const T* const_iterator;                                    \
    typedef std::reverse_iterator<iterator> reverse_iterator;           \
    typedef std::reverse_iterator<const_iterator> const_reverse_iterator; \
    typedef size_t size_type;                                           \
    typedef ptrdiff_t difference_type;                                  \
    /* Boost.array has static_size.  C++11 specializes tuple_size */    \
    enum {static_size = _N};                                            \
    /* Unchecked element access */                                      \
    R123_CUDA_DEVICE reference operator[](size_type i){ return v[i]; }  \
    R123_CUDA_DEVICE const_reference operator[](size_type i) const { return v[i]; } \
    /* Checked element access: R123_THROW(std::out_of_range) if i is not below _N */ \
    R123_CUDA_DEVICE reference at(size_type i){                         \
        if(!(i < _N))                                                   \
            R123_THROW(std::out_of_range("array index out of range"));  \
        return v[i];                                                    \
    }                                                                   \
    R123_CUDA_DEVICE const_reference at(size_type i) const {            \
        if(!(i < _N))                                                   \
            R123_THROW(std::out_of_range("array index out of range"));  \
        return v[i];                                                    \
    }                                                                   \
    /* Capacity: the size is fixed at _N */                             \
    R123_CUDA_DEVICE size_type size() const { return _N; }              \
    R123_CUDA_DEVICE size_type max_size() const { return _N; }          \
    R123_CUDA_DEVICE bool empty() const { return 0 == _N; }             \
    /* Iterators are plain pointers into v */                           \
    R123_CUDA_DEVICE iterator begin() { return v; }                     \
    R123_CUDA_DEVICE iterator end() { return v + _N; }                  \
    R123_CUDA_DEVICE const_iterator begin() const { return v; }         \
    R123_CUDA_DEVICE const_iterator end() const { return v + _N; }      \
    R123_CUDA_DEVICE const_iterator cbegin() const { return v; }        \
    R123_CUDA_DEVICE const_iterator cend() const { return v + _N; }     \
    R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(v + _N); } \
    R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(v + _N); } \
    R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(v); } \
    R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(v); } \
    R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(v + _N); } \
    R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(v); } \
    R123_CUDA_DEVICE pointer data(){ return v; }                        \
    R123_CUDA_DEVICE const_pointer data() const{ return v; }            \
    R123_CUDA_DEVICE reference front(){ return *v; }                    \
    R123_CUDA_DEVICE const_reference front() const{ return *v; }        \
    R123_CUDA_DEVICE reference back(){ return *(v + (_N-1)); }          \
    R123_CUDA_DEVICE const_reference back() const{ return *(v + (_N-1)); } \
    /* Element-wise comparison, written out by hand because CUDA3 does  \
       not have std::equal */                                           \
    R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \
        for(const T *mine = v, *theirs = rhs.v; mine != v + _N; ++mine, ++theirs) \
            if(*mine != *theirs) return false;                          \
        return true;                                                    \
    }                                                                   \
    R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !operator==(rhs); } \
    /* Assign val to every element (CUDA3 does not have std::fill_n) */ \
    R123_CUDA_DEVICE void fill(const value_type& val){                  \
        for(T *slot = v; slot != v + _N; ++slot)                        \
            *slot = val;                                                \
    }                                                                   \
    /* Exchange contents with rhs element by element (CUDA3 does not    \
       have std::swap_ranges) */                                        \
    R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){               \
        for(size_t k = 0; k != _N; ++k){                                \
            T held = rhs.v[k];                                          \
            rhs.v[k] = v[k];                                            \
            v[k] = held;                                                \
        }                                                               \
    }                                                                   \
    /* Add n to the array viewed as a multi-word counter whose least    \
       significant word is v[0]; returns *this. */                      \
    R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){    \
        /* If n does not fit in a single T, take the general path.  The \
           test is written this way to avoid spurious complaints about  \
           illegal shifts while still being compile-time evaluated. */  \
        if(sizeof(T)<sizeof(n) && n>>((sizeof(T)<sizeof(n))?8*sizeof(T):0) ) \
            return incr_carefully(n);                                   \
        if(n!=1){                                                       \
            v[0] += n;                                                  \
            /* no wrap-around iff the sum is still at least n */        \
            if(_N==1 || R123_BUILTIN_EXPECT(n<=v[0], 1)) return *this;  \
        }else{                                                          \
            ++v[0];                                                     \
            /* no wrap-around iff the result is nonzero */              \
            if(_N==1 || R123_BUILTIN_EXPECT(!!v[0], 1)) return *this;   \
        }                                                               \
        /* v[0] wrapped: ripple a carry of one upward.  The _N==?? tests \
           are expected to be constant-folded away by the compiler, so  \
           only the overflow tests (!!v[k]) remain at runtime.  For     \
           small _N an unconditional sequence of adc might be better;   \
           that experiment is left for another day.                     \
           N.B.  Subscripts of the form v[_N>3?3:0] are there to        \
           silence a spurious error from icpc. */                       \
        ++v[_N>1?1:0];                                                  \
        if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \
        ++v[_N>2?2:0];                                                  \
        if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \
        ++v[_N>3?3:0];                                                  \
        for(size_t k=3; k+1<_N; ++k){                                   \
            if( R123_BUILTIN_EXPECT(!!v[k], 1) ) return *this;          \
            ++v[k+1];                                                   \
        }                                                               \
        return *this;                                                   \
    }                                                                   \
    /* seed(SeedSeq) would be a constructor if having a constructor */  \
    /* didn't cause headaches with defaults.  It asks ss.generate()  */ \
    /* for enough 32-bit words to fill every element, then assembles */ \
    /* each element from its share of those words.                   */ \
    template <typename SeedSeq>                                         \
    R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){      \
        const size_t words_per_elem = (3+sizeof(value_type))/4;         \
        const size_t nwords = _N*words_per_elem;                        \
        uint32_t words[nwords];                                         \
        ss.generate(words, words + nwords);                             \
        r123array##_N##x##W result;                                     \
        for(size_t k=0; k<_N; ++k)                                      \
            result.v[k] = assemble_from_u32<value_type>(words + k*words_per_elem); \
        return result;                                                  \
    }                                                                   \
protected:                                                              \
    /* General-case increment: n may be greater than the maximum value  \
       of a single value_type.  The low part of n is added to each     \
       word in turn; what remains of n (plus a carry when the previous \
       word wrapped) moves on to the next word. */                     \
    R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \
        value_type low_part;                                            \
        low_part = n;                                                   \
        v[0] += n;                                                      \
        /* bits of n consumed per word; 0 when value_type is at least   \
           as wide as n */                                              \
        const unsigned shift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \
        for(size_t k=1; k<_N; ++k){                                     \
            n = shift ? (n >> shift) : 0;                               \
            /* the previous word wrapped iff it is now below its addend */ \
            if( v[k-1] < low_part )                                     \
                ++n;                                                    \
            if( !n ) break;                                             \
            low_part = n;                                               \
            v[k] += n;                                                  \
        }                                                               \
        return *this;                                                   \
    }
 
202
    
 
203
                                                                        
 
204
// There are several tricky considerations for the insertion and extraction
 
205
// operators:
 
206
// - we would like to be able to print r123array16x8 as a sequence of 16 integers,
 
207
//   not as 16 bytes.
 
208
// - we would like to be able to print r123array1xm128i.
 
209
// - we do not want an int conversion operator in r123m128i because it causes
 
210
//   lots of ambiguity problems with automatic promotions.
 
211
// Solution: r123arrayinsertable and r123arrayextractable
 
212
 
 
213
// r123arrayinsertable<T> holds a const reference to a T.  Inserting the
// wrapper into a std::ostream forwards to operator<< of the wrapped value.
template<typename T>
struct r123arrayinsertable{
    const T& v;
    r123arrayinsertable(const T& wrapped) : v(wrapped) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<T>& item){
        return out << item.v;
    }
};

// uint8_t specialization: the value is converted to int before insertion,
// so it is written as an integer instead of going through the character
// overload of operator<<.
template<>
struct r123arrayinsertable<uint8_t>{
    const uint8_t& v;
    r123arrayinsertable(const uint8_t& wrapped) : v(wrapped) {}
    friend std::ostream& operator<<(std::ostream& out, const r123arrayinsertable<uint8_t>& item){
        const int as_number = static_cast<int>(item.v);
        return out << as_number;
    }
};
 
230
 
 
231
// r123arrayextractable<T> holds a mutable reference to a T.  Extracting into
// the wrapper forwards to operator>> of the wrapped value.
template<typename T>
struct r123arrayextractable{
    T& v;
    r123arrayextractable(T& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<T>& t){
        return is >> t.v;
    }
};

// uint8_t specialization: the text is parsed as an int (so "65" yields the
// number 65 rather than the character '6') and then narrowed.
//
// The wrapped byte is written only when an integer was actually parsed and
// it lies in 0..255.  If the stream is already in a failed state, or the
// parse fails, the byte is left untouched; a parsed value outside 0..255
// sets failbit and also leaves the byte untouched.
template<>
struct r123arrayextractable<uint8_t>{
    uint8_t& v;
    r123arrayextractable(uint8_t& t_) : v(t_) {}
    friend std::istream& operator>>(std::istream& is, r123arrayextractable<uint8_t>& t){
        // Initialized so that no indeterminate value can ever be read, even
        // when the extraction below does nothing because `is` has failed.
        int i = 0;
        if(is >> i){
            if(i < 0 || i > 0xFF)
                is.setstate(std::ios_base::failbit); // does not fit in 8 bits
            else
                t.v = static_cast<uint8_t>(i);
        }
        return is;
    }
};
 
251
 
 
252
/* CXXOVERLOADS(_N, W, T) defines, for struct r123array<_N>x<W>:
   - operator<<, which writes the _N elements separated by single spaces,
     each through r123arrayinsertable<T>;
   - operator>>, which reads the _N elements in order, each through
     r123arrayextractable<T>;
   - the typedef r123::Array<_N>x<W>. */
#define CXXOVERLOADS(_N, W, T)                                          \
                                                                        \
inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \
    for(size_t k=0; k<_N; ++k){                                         \
        if(k!=0)                                                        \
            os << " ";                                                  \
        os << r123arrayinsertable<T>(a.v[k]);                           \
    }                                                                   \
    return os;                                                          \
}                                                                       \
                                                                        \
inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \
    size_t k = 0;                                                       \
    while(k < _N){                                                      \
        r123arrayextractable<T> slot(a.v[k++]);                         \
        is >> slot;                                                     \
    }                                                                   \
    return is;                                                          \
}                                                                       \
                                                                        \
namespace r123{                                                         \
    typedef r123array##_N##x##W Array##_N##x##W;                        \
}
 
272
                                                                        
 
273
#endif /* __cplusplus */
 
274
 
 
275
/* _r123array_tpl expands to a declaration of struct r123arrayNxW.  
 
276
 
 
277
   In C, it's nothing more than a struct containing an array of N
 
278
   objects of type T.
 
279
 
 
280
   In C++ it's the same, but endowed with an assortment of member
 
281
   functions, typedefs and friends.  In C++, r123arrayNxW looks a lot
 
282
   like std::array<T,N>, has most of the capabilities of a container,
 
283
   and satisfies the requirements outlined in compat/Engine.hpp for
 
284
   counter and key types.  ArrayNxW, in the r123 namespace is
 
285
   a typedef equivalent to r123arrayNxW.
 
286
*/
 
287
 
 
288
#define _r123array_tpl(_N, W, T)                                        \
/** @ingroup arrayNxW */                                                \
/** @see arrayNxW */                                                    \
struct r123array##_N##x##W{                                             \
    T v[_N]; /* element storage */                                      \
    CXXMETHODS(_N, W, T) /* expands to nothing when compiled as C */    \
};                                                                      \
                                                                        \
CXXOVERLOADS(_N, W, T) /* likewise empty in C */
 
297
 
 
298
/** @endcond */
 
299
 
 
300
/* Concrete array types.  Each line expands to struct r123array<N>x<W>
   holding N elements of the given type. */
_r123array_tpl(1, 32, uint32_t)  /* r123array1x32 */
_r123array_tpl(2, 32, uint32_t)  /* r123array2x32 */
_r123array_tpl(4, 32, uint32_t)  /* r123array4x32 */
_r123array_tpl(8, 32, uint32_t)  /* r123array8x32 */

_r123array_tpl(1, 64, uint64_t)  /* r123array1x64 */
_r123array_tpl(2, 64, uint64_t)  /* r123array2x64 */
_r123array_tpl(4, 64, uint64_t)  /* r123array4x64 */

_r123array_tpl(16, 8, uint8_t)  /* r123array16x8 for ARSsw, AESsw */

#if R123_USE_SSE
_r123array_tpl(1, m128i, r123m128i) /* r123array1xm128i for ARSni, AESni */
#endif
 
314
 
 
315
/* In C++, it's natural to use sizeof(a::value_type), but in C it's
 
316
   pretty convoluted to figure out the width of the value_type of an
 
317
   r123arrayNxW:
 
318
*/
 
319
/* R123_W(a): 8 times the size in bytes of one element of the `v` member of
   array type a.  The null pointer is only named inside sizeof, so it is
   never dereferenced. */
#define R123_W(a)   (sizeof(((a *)0)->v[0])*8)
 
320
 
 
321
/** @namespace r123
 
322
  Most of the Random123 C++ API is contained in the r123 namespace. 
 
323
*/
 
324
 
 
325
#endif
 
326