/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090211/src/byteplex.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-09-23 17:13:04 UTC
  • Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd
Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*  byteplex.c - Parallel byte stream module.
2
 
    Copyright (c) 2008, Robert D. Cameron.
3
 
    Licensed to the public under the Open Software License 3.0.
4
 
    Licensed to International Characters, Inc., under the Academic
5
 
    Free License 3.0.
6
 
 
7
 
*/
8
 
#include <stdio.h>
9
 
#include <stdlib.h>
10
 
#include <string.h>
11
 
#include <errno.h>
12
 
#include <sys/types.h>
13
 
#include <sys/stat.h>
14
 
#include "byteplex.h"
15
 
#include "xml_error.h"
16
 
#include "multiliteral.h"
17
 
#include "bytelex.h"
18
 
 
19
 
 
20
 
 
21
 
/*  Space for sentinels in bytescans of the pseudo-ASCII stream. */
22
 
/* Number of extra SIMD packs added to each pseudo-ASCII (x8data)
   allocation, beyond BYTEPLEX_SIZE/PACKSIZE. */
const int SENTINEL_PACKS = 1;
23
 
 
24
 
/* Closes the attached input file, if there is one. */
Byteplex::~Byteplex() {
        if (infile == NULL) return;
        fclose(infile);
}
27
 
 
28
 
 
29
 
template <CodeUnit_Base C>
30
 
X8_Buffer<C>::X8_Buffer() : Byteplex() {
31
 
        /* For 8-bit code units, the input buffer is a also used directly
32
 
           as the pseudo-ASCII buffer; make sure that there is room for
33
 
           sentinels. */
34
 
        src_buffer = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);
35
 
        x8data = src_buffer;
36
 
        // Set the sentinel for ScanToQuote,ScanWS in reading XML/text decls.
37
 
        ((unsigned char *) x8data)[BYTEPLEX_SIZE] = Ord<C, '"'>::value;
38
 
}
39
 
 
40
 
 
41
 
template <CodeUnit_Base C>
42
 
X8_Buffer<C>::~X8_Buffer() {
43
 
  simd_delete((SIMD_type *) src_buffer);
44
 
}
45
 
 
46
 
/* Allocates the raw input buffer (twice the per-stream pack count), the
   high/low byte streams and the pseudo-ASCII stream with room for
   sentinels. */
U16_Buffer::U16_Buffer()
        : Byteplex() {
        const int stream_packs = BYTEPLEX_SIZE/PACKSIZE;

        src_buffer = simd_new(stream_packs*2);
        x16hi = simd_new(stream_packs);
        x16lo = simd_new(stream_packs);
        x8data = simd_new(stream_packs+SENTINEL_PACKS);
        // Set the sentinel for ScanToQuote,ScanWS in reading XML/text decls.
        unsigned char * x8bytes = (unsigned char *) x8data;
        x8bytes[BYTEPLEX_SIZE] = '"';
}
56
 
 
57
 
/* Releases every buffer allocated by the constructor, in the same order
   as before: x16hi, x16lo, x8data, src_buffer. */
U16_Buffer::~U16_Buffer() {
        SIMD_type * const owned[] = {
                (SIMD_type *) x16hi,
                (SIMD_type *) x16lo,
                (SIMD_type *) x8data,
                (SIMD_type *) src_buffer
        };
        const int owned_count = sizeof(owned)/sizeof(owned[0]);
        for (int k = 0; k < owned_count; k++) {
                simd_delete(owned[k]);
        }
}
63
 
 
64
 
/* Little-endian UTF-16 buffer: all setup is done by U16_Buffer. */
U16LE_Buffer::U16LE_Buffer() : U16_Buffer() {}
67
 
 
68
 
/* Big-endian UTF-16 buffer: all setup is done by U16_Buffer. */
U16BE_Buffer::U16BE_Buffer() : U16_Buffer() {}
71
 
 
72
 
U32_Buffer::U32_Buffer()
73
 
        : Byteplex() {
74
 
 
75
 
        src_buffer = simd_new((BYTEPLEX_SIZE/PACKSIZE)*4);
76
 
        x32hh = simd_new(BYTEPLEX_SIZE/PACKSIZE);
77
 
        x32hl = simd_new(BYTEPLEX_SIZE/PACKSIZE);
78
 
        x32lh = simd_new(BYTEPLEX_SIZE/PACKSIZE);
79
 
        x32ll = simd_new(BYTEPLEX_SIZE/PACKSIZE);
80
 
        x8data = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);
81
 
        x8data = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);
82
 
        // Set the sentinel for ScanToQuote,ScanWS in reading XML/text decls.
83
 
        ((unsigned char *) x8data)[BYTEPLEX_SIZE] = '"';
84
 
}
85
 
 
86
 
/* Releases every buffer allocated by the constructor, in the same order
   as before: x32hh, x32hl, x32lh, x32ll, x8data, src_buffer. */
U32_Buffer::~U32_Buffer() {
        SIMD_type * const owned[] = {
                (SIMD_type *) x32hh,
                (SIMD_type *) x32hl,
                (SIMD_type *) x32lh,
                (SIMD_type *) x32ll,
                (SIMD_type *) x8data,
                (SIMD_type *) src_buffer
        };
        const int owned_count = sizeof(owned)/sizeof(owned[0]);
        for (int k = 0; k < owned_count; k++) {
                simd_delete(owned[k]);
        }
}
94
 
 
95
 
/* Little-endian UTF-32 buffer: all setup is done by U32_Buffer. */
U32LE_Buffer::U32LE_Buffer() : U32_Buffer() {}
98
 
 
99
 
/* Big-endian UTF-32 buffer: all setup is done by U32_Buffer. */
U32BE_Buffer::U32BE_Buffer() : U32_Buffer() {}
102
 
 
103
 
/* UTF-32 buffer with the unusual 2143 byte order: all setup is done by
   U32_Buffer. */
U32_2143_Buffer::U32_2143_Buffer() : U32_Buffer() {}
106
 
 
107
 
/* UTF-32 buffer with the unusual 3412 byte order: all setup is done by
   U32_Buffer. */
U32_3412_Buffer::U32_3412_Buffer() : U32_Buffer() {}
110
 
 
111
 
 
112
 
/* Byteplex methods.
113
 
 
114
 
   No byteplexing is required for 8-bit code units; byteplex methods are no-ops.
115
 
 
116
 
*/
117
 
 
118
 
template <CodeUnit_Base C>
119
 
void X8_Buffer<C>::DoByteplex() {
120
 
        x8data = src_buffer;
121
 
}
122
 
 
123
 
 
124
 
void DoDuplex(BytePack * src_data, int packs_in_buffer,
125
 
                                         BytePack * p0, BytePack * p1) {
126
 
 
127
 
        for (int pk = 0; pk < packs_in_buffer; pk++) {
128
 
                BytePack s0 = src_data[2*pk];
129
 
                BytePack s1 = src_data[2*pk+1];
130
 
#if (BYTE_ORDER == LITTLE_ENDIAN)
131
 
#ifdef TEMPLATED_SIMD_LIB
132
 
                p0[pk] = simd<16>::pack<l,l>(s1, s0);
133
 
                p1[pk] = simd<16>::pack<h,h>(s1, s0);
134
 
#endif
135
 
#ifndef TEMPLATED_SIMD_LIB
136
 
                p0[pk] = simd_pack_16_ll(s1, s0);
137
 
                p1[pk] = simd_pack_16_hh(s1, s0);
138
 
#endif
139
 
#endif
140
 
#if (BYTE_ORDER == BIG_ENDIAN)
141
 
#ifdef TEMPLATED_SIMD_LIB
142
 
                p0[pk] = simd<16>::pack<l,l>(s0, s1);
143
 
                p1[pk] = simd<16>::pack<h,h>(s0, s1);
144
 
#endif
145
 
#ifndef TEMPLATED_SIMD_LIB
146
 
                p0[pk] = simd_pack_16_ll(s0, s1);
147
 
                p1[pk] = simd_pack_16_hh(s0, s1);
148
 
#endif
149
 
#endif
150
 
        }
151
 
}
152
 
                                         
153
 
void U16LE_Buffer::DoByteplex() {
154
 
        DoDuplex(src_buffer, packs_in_buffer, x16lo, x16hi);
155
 
}
156
 
 
157
 
void U16BE_Buffer::DoByteplex() {
158
 
        DoDuplex(src_buffer, packs_in_buffer, x16hi, x16lo);
159
 
}
160
 
 
161
 
void DoQuadplex(BytePack * src_data, int packs_in_buffer,
162
 
                                BytePack * p0, BytePack * p1, BytePack * p2, BytePack * p3) {
163
 
 
164
 
        for (int pk = 0; pk < packs_in_buffer; pk++) {
165
 
                BytePack s0 = src_data[4*pk];
166
 
                BytePack s1 = src_data[4*pk+1];
167
 
                BytePack s2 = src_data[4*pk+2];
168
 
                BytePack s3 = src_data[4*pk+3];
169
 
#if (BYTE_ORDER == LITTLE_ENDIAN)
170
 
#ifdef TEMPLATED_SIMD_LIB
171
 
                BytePack p02_0 = simd<16>::pack<l,l>(s1, s0);
172
 
                BytePack p13_0 = simd<16>::pack<h,h>(s1, s0);
173
 
                BytePack p02_1 = simd<16>::pack<l,l>(s3, s2);
174
 
                BytePack p13_1 = simd<16>::pack<h,h>(s3, s2);
175
 
                p0[pk] = simd<16>::pack<l,l>(p02_1, p02_0);
176
 
                p1[pk] = simd<16>::pack<l,l>(p13_1, p13_0);
177
 
                p2[pk] = simd<16>::pack<h,h>(p02_1, p02_0);
178
 
                p3[pk] = simd<16>::pack<h,h>(p13_1, p13_0);
179
 
#endif
180
 
#ifndef TEMPLATED_SIMD_LIB
181
 
                BytePack p02_0 = simd_pack_16_ll(s1, s0);
182
 
                BytePack p13_0 = simd_pack_16_hh(s1, s0);
183
 
                BytePack p02_1 = simd_pack_16_ll(s3, s2);
184
 
                BytePack p13_1 = simd_pack_16_hh(s3, s2);
185
 
                p0[pk] = simd_pack_16_ll(p02_1, p02_0);
186
 
                p1[pk] = simd_pack_16_ll(p13_1, p13_0);
187
 
                p2[pk] = simd_pack_16_hh(p02_1, p02_0);
188
 
                p3[pk] = simd_pack_16_hh(p13_1, p13_0);
189
 
#endif
190
 
#endif
191
 
#if (BYTE_ORDER == BIG_ENDIAN)
192
 
#ifdef TEMPLATED_SIMD_LIB
193
 
                BytePack p02_0 = simd<16>::pack<h,h>(s0, s1);
194
 
                BytePack p13_0 = simd<16>::pack<l,l>(s0, s1);
195
 
                BytePack p02_1 = simd<16>::pack<h,h>(s2, s3);
196
 
                BytePack p13_1 = simd<16>::pack<l,l>(s2, s3);
197
 
                p0[pk] = simd<16>::pack<h,h>(p02_0, p02_1);
198
 
                p1[pk] = simd<16>::pack<h,h>(p13_0, p13_1);
199
 
                p2[pk] = simd<16>::pack<l,l>(p02_0, p02_1);
200
 
                p3[pk] = simd<16>::pack<l,l>(p13_0, p13_1);
201
 
#endif
202
 
#ifndef TEMPLATED_SIMD_LIB
203
 
                BytePack p02_0 = simd_pack_16_hh(s0, s1);
204
 
                BytePack p13_0 = simd_pack_16_ll(s0, s1);
205
 
                BytePack p02_1 = simd_pack_16_hh(s2, s3);
206
 
                BytePack p13_1 = simd_pack_16_ll(s2, s3);
207
 
                p0[pk] = simd_pack_16_hh(p02_0, p02_1);
208
 
                p1[pk] = simd_pack_16_hh(p13_0, p13_1);
209
 
                p2[pk] = simd_pack_16_ll(p02_0, p02_1);
210
 
                p3[pk] = simd_pack_16_ll(p13_0, p13_1);
211
 
#endif
212
 
#endif
213
 
        }
214
 
}
215
 
 
216
 
void U32LE_Buffer::DoByteplex() {
217
 
        DoQuadplex(src_buffer, packs_in_buffer, x32ll, x32lh, x32hl, x32hh);
218
 
}
219
 
 
220
 
void U32BE_Buffer::DoByteplex() {
221
 
        DoQuadplex(src_buffer, packs_in_buffer, x32hh, x32hl, x32lh, x32ll);
222
 
}
223
 
 
224
 
void U32_2143_Buffer::DoByteplex() {
225
 
        DoQuadplex(src_buffer, packs_in_buffer, x32hl, x32hh, x32ll, x32lh);
226
 
}
227
 
 
228
 
void U32_3412_Buffer::DoByteplex() {
229
 
        DoQuadplex(src_buffer, packs_in_buffer, x32lh, x32ll, x32hh, x32hl);
230
 
}
231
 
 
232
 
 
233
 
/* Pseudo-ASCII stream methods */
234
 
 
235
 
template <CodeUnit_Base C>
236
 
void X8_Buffer<C>::PreparePseudoASCII_Stream() {
237
 
        x8data = src_buffer;
238
 
}
239
 
 
240
 
void U16_Buffer::PreparePseudoASCII_Stream() {
241
 
        for (int pk = 0; pk < packs_in_buffer; pk++) {
242
 
#ifdef TEMPLATED_SIMD_LIB
243
 
                x8data[pk] = simd_or(x16lo[pk], simd_andc(simd<8>::constant<(0x80)>(), 
244
 
                                               simd<8>::eq(x16hi[pk], simd<8>::constant<0>())));
245
 
#endif
246
 
#ifndef TEMPLATED_SIMD_LIB
247
 
                x8data[pk] = simd_or(x16lo[pk], simd_andc(simd_const_8(0x80), 
248
 
                                               simd_eq_8(x16hi[pk], simd_const_8(0))));
249
 
#endif
250
 
        }
251
 
}
252
 
 
253
 
void U32_Buffer::PreparePseudoASCII_Stream() {
254
 
        for (int pk = 0; pk < packs_in_buffer; pk++) {
255
 
                BytePack hi = simd_or(simd_or(x32hh[pk], x32hl[pk]), x32lh[pk]);
256
 
#ifdef TEMPLATED_SIMD_LIB
257
 
                x8data[pk] = simd_or(x32ll[pk], simd_andc(simd<8>::constant<(0x80)>(), 
258
 
                                               simd<8>::eq(hi, simd<8>::constant<0>())));
259
 
#endif
260
 
#ifndef TEMPLATED_SIMD_LIB
261
 
                x8data[pk] = simd_or(x32ll[pk], simd_andc(simd_const_8(0x80), 
262
 
                                               simd_eq_8(hi, simd_const_8(0))));
263
 
#endif
264
 
        }
265
 
}
266
 
 
267
 
 
268
 
int Byteplex::CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read) {
269
 
        memcpy(src_buffer, bytes_to_copy, lgth);
270
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == FILE_READING)
271
 
        code_clocker->cc_start_interval();
272
 
#endif
273
 
        unsigned char * end_ptr = &((unsigned char *)src_buffer)[lgth];
274
 
        int bytes_read = fread(end_ptr, 1, bytes_to_read, infile);
275
 
        if (bytes_read < bytes_to_read) end_ptr[bytes_read] = '\0'; /* sentinel */
276
 
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == FILE_READING)
277
 
        code_clocker->cc_end_interval(bytes_read);
278
 
#endif
279
 
        return bytes_read;
280
 
}
281
 
 
282
 
void Byteplex::Set_limits(int units) {
283
 
        units_in_buffer = units;
284
 
        packs_in_buffer = (units_in_buffer + PACKSIZE -1)/PACKSIZE;
285
 
        //buffer_limit_pos = min(units_in_buffer, BUFFER_SIZE);
286
 
}
287
 
 
288
 
template <CodeUnit_Base C>
289
 
void X8_Buffer<C>::InitializeBuffer(unsigned char * src, int lgth){     
290
 
        int byte_advance = BYTEPLEX_SIZE - lgth;        
291
 
        int bytes_read = CopyAndFill(src, lgth, byte_advance);
292
 
        Set_limits(bytes_read + lgth);
293
 
}
294
 
 
295
 
void U16_Buffer::InitializeBuffer(unsigned char * src, int lgth){       
296
 
        int byte_advance = BYTEPLEX_SIZE * 2 - lgth;    
297
 
        int bytes_read = CopyAndFill(src, lgth, byte_advance);
298
 
        if (bytes_read % 2 != 0) {
299
 
                IncompleteCodeUnitError();
300
 
        }
301
 
        Set_limits((bytes_read + lgth)/2);
302
 
}
303
 
 
304
 
void U32_Buffer::InitializeBuffer(unsigned char * src, int lgth){       
305
 
        int byte_advance = BYTEPLEX_SIZE * 4 - lgth;    
306
 
        int bytes_read = CopyAndFill(src, lgth, byte_advance);
307
 
        if (bytes_read % 4 != 0) {
308
 
                IncompleteCodeUnitError();
309
 
        }
310
 
        Set_limits((bytes_read + lgth)/4);
311
 
}
312
 
 
313
 
 
314
 
 
315
 
template <CodeUnit_Base C>
316
 
void X8_Buffer<C>::AdvanceInputBuffer(int advance_amt){ 
317
 
        int bytes_to_keep = units_in_buffer - advance_amt;      
318
 
        int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt],
319
 
                                        bytes_to_keep, advance_amt);
320
 
        Set_limits(bytes_read + bytes_to_keep);
321
 
}
322
 
 
323
 
void U16_Buffer::AdvanceInputBuffer(int advance_amt){   
324
 
        int bytes_to_keep = (units_in_buffer - advance_amt)*2;  
325
 
        int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt*2],
326
 
                                        bytes_to_keep, advance_amt*2);
327
 
        if (bytes_read % 2 != 0) {
328
 
                IncompleteCodeUnitError();
329
 
        }
330
 
        Set_limits((bytes_read + bytes_to_keep)/2);
331
 
}
332
 
 
333
 
void U32_Buffer::AdvanceInputBuffer(int advance_amt){   
334
 
        int bytes_to_keep = (units_in_buffer - advance_amt)*4;  
335
 
        int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt*4],
336
 
                                        bytes_to_keep, advance_amt*4);
337
 
        if (bytes_read % 4 != 0) {
338
 
                IncompleteCodeUnitError();
339
 
        }
340
 
        Set_limits((bytes_read + bytes_to_keep)/4);
341
 
}
342
 
 
343
 
void U16_Buffer::Validate_UTF16() {
344
 
        BytePack surrogate_select;
345
 
        BytePack hi_surrogate;
346
 
        BytePack lo_surrogate;
347
 
#ifdef TEMPLATED_SIMD_LIB
348
 
        BytePack hi_surrogate_pending = simd<8>::constant<0>();
349
 
#endif
350
 
#ifndef TEMPLATED_SIMD_LIB
351
 
        BytePack hi_surrogate_pending = simd_const_8(0);
352
 
#endif
353
 
        BytePack surrogate_scope;
354
 
        BytePack u16_surrogate_error;
355
 
//      BytePack u16_surrogate_accum = simd<8>::constant<0>();
356
 
//      BytePack u16_FFFE_FFFF_accum = simd<8>::constant<0>();
357
 
        BytePack u16_FFFE_FFFF;
358
 
        for (int pk = 0; pk < packs_in_buffer; pk++) {
359
 
                /* UTF-16 code units in the range D800-DBFF and DC00-DFFF are
360
 
                   reserved for the first and second elements, respectively
361
 
                   of surrogate pairs.  Validation requires that these values
362
 
                   only occur in well-formed pairs. */
363
 
#ifdef TEMPLATED_SIMD_LIB
364
 
                surrogate_select = simd_and(x16hi[pk], simd<8>::constant<0xDC>());
365
 
                hi_surrogate = simd<8>::eq(surrogate_select, simd<8>::constant<0xD8>());
366
 
                lo_surrogate = simd<8>::eq(surrogate_select, simd<8>::constant<0xDC>());
367
 
                surrogate_scope = simd_or(hi_surrogate_pending,
368
 
                                          sisd_sfli(hi_surrogate, 8));
369
 
                                          
370
 
                u16_surrogate_error = simd_xor(surrogate_scope, lo_surrogate);
371
 
                hi_surrogate_pending = sisd_sbli(hi_surrogate, 8 * (PACKSIZE-1));
372
 
                /* The values FFFE and FFFF are excluded. */
373
 
                u16_FFFE_FFFF = simd<8>::eq(simd_and(x16hi[pk],
374
 
                                                   simd_or(x16lo[pk], simd<8>::constant<1>())),
375
 
                                          simd<8>::constant<0xFF>());
376
 
#endif
377
 
#ifndef TEMPLATED_SIMD_LIB
378
 
                surrogate_select = simd_and(x16hi[pk], simd_const_8(0xDC));
379
 
                hi_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xD8));
380
 
                lo_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xDC));
381
 
                surrogate_scope = simd_or(hi_surrogate_pending,
382
 
                                          sisd_sfli(hi_surrogate, 8));
383
 
                                          
384
 
                u16_surrogate_error = simd_xor(surrogate_scope, lo_surrogate);
385
 
                hi_surrogate_pending = sisd_sbli(hi_surrogate, 8 * (PACKSIZE-1));
386
 
                /* The values FFFE and FFFF are excluded. */
387
 
                u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
388
 
                                                   simd_or(x16lo[pk], simd_const_8(1))),
389
 
                                          simd_const_8(0xFF));
390
 
#endif
391
 
//              u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
392
 
                u16_surrogate_error = simd_or(u16_surrogate_error, u16_FFFE_FFFF);
393
 
        
394
 
                if (bitblock_has_bit(u16_surrogate_error)) {
395
 
                        CharSetValidationError("UTF-16 (relative position reported)",
396
 
                                                pk * PACKSIZE + count_forward_zeroes(u16_surrogate_error)/8);
397
 
                }
398
 
        }
399
 
};
400
 
 
401
 
 
402
 
void U16_Buffer::Validate_UCS2() {
403
 
#ifdef X16HILO_ACCESS
404
 
        int packs = (buffer_units - 1)/PACKSIZE + 1;
405
 
#ifdef TEMPLATED_SIMD_LIB
406
 
        BytePack u16_surrogate_accum = simd<8>::constant<0>();
407
 
        BytePack u16_FFFE_FFFF_accum = simd<8>::constant<0>();
408
 
#endif
409
 
#ifndef TEMPLATED_SIMD_LIB
410
 
        BytePack u16_surrogate_accum = simd_const_8(0);
411
 
        BytePack u16_FFFE_FFFF_accum = simd_const_8(0);
412
 
#endif
413
 
        BytePack u16_FFFE_FFFF;
414
 
        for (int pk = 0; pk < packs; pk++) {
415
 
                /* The high byte of UCS-2 code units cannot be in the range D8-DF.
416
 
                   This corresponds to the D800-DFFF range of illegal codepoints
417
 
                   reserved for UTF-16 surrogate pairs. Accumulate the results. 
418
 
                   To check, 0x20 is added to each such octet, mapping the D8-DF
419
 
                   range to F8-FF and wrapping E0-FF values around.  The max value
420
 
                   is then accumulated.  */  
421
 
#ifdef TEMPLATED_SIMD_LIB 
422
 
                u16_surrogate_accum =
423
 
                        simd_max_8(u16_surrogate_accum, simd<8>::add(x16hi[pk], simd<8>::constant<0x20>()));
424
 
                /* The values FFFE and FFFF are excluded. */
425
 
                u16_FFFE_FFFF = simd<8>::eq(simd_and(x16hi[pk],
426
 
                                                   simd_or(x16lo[pk], simd<8>::constant<1>())), simd<8>::constant<0xFF>());
427
 
                u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
428
 
#endif
429
 
#ifndef TEMPLATED_SIMD_LIB
430
 
                u16_surrogate_accum =
431
 
                        simd_max_8(u16_surrogate_accum, simd_add_8(x16hi[pk], simd_const_8(0x20)));
432
 
                /* The values FFFE and FFFF are excluded. */
433
 
                u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
434
 
                                                   simd_or(x16lo[pk], simd_const_8(1))), simd_const_8(0xFF));
435
 
                u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
436
 
#endif
437
 
        }
438
 
#ifdef TEMPLATED_SIMD_LIB 
439
 
        u16_surrogate_accum = simd<8>::eq(simd_or(u16_surrogate_accum, simd<8>::constant<0x07>()),
440
 
                                        simd<8>::constant<0xFF>());
441
 
#endif
442
 
#ifndef TEMPLATED_SIMD_LIB
443
 
        u16_surrogate_accum = simd_eq_8(simd_or(u16_surrogate_accum, simd_const_8(0x07)),
444
 
                                        simd_const_8(0xFF));
445
 
#endif
446
 
 
447
 
        if (bitblock_has_bit(simd_or(u16_surrogate_accum, u16_FFFE_FFFF_accum)))
448
 
                CharSetValidationError("UCS-2");
449
 
        }
450
 
#endif
451
 
#ifndef X16HILO_ACCESS
452
 
        printf("UCS_2_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
453
 
#endif
454
 
};
455
 
 
456
 
 
457
 
void U32_Buffer::Validate_UTF32() {
458
 
#ifdef X32BYTEPLEX_ACCESS
459
 
        int packs = (buffer_units - 1)/PACKSIZE + 1;
460
 
#ifdef TEMPLATED_SIMD_LIB
461
 
        BytePack u32hh_accum = simd<8>::constant<0>();
462
 
        BytePack u32hl_accum = simd<8>::constant<0>();
463
 
        BytePack u32_surrogate_accum = simd<8>::constant<0>();
464
 
        BytePack u32_FFFE_FFFF_accum = simd<8>::constant<0>();
465
 
#endif
466
 
#ifndef TEMPLATED_SIMD_LIB
467
 
        BytePack u32hh_accum = simd_const_8(0);
468
 
        BytePack u32hl_accum = simd_const_8(0);
469
 
        BytePack u32_surrogate_accum = simd_const_8(0);
470
 
        BytePack u32_FFFE_FFFF_accum = simd_const_8(0);
471
 
#endif
472
 
        BytePack u32_BMP_select;
473
 
        BytePack u32l_FFFE_FFFF;
474
 
        for (int pk = 0; pk < packs; pk++) {
475
 
                /* There can be no bits set in the high octet; "or" together
476
 
                   all octet values to check for any bit set. */
477
 
                u32hh_accum = simd_or(u32hh_accum, x32hh[pk]);
478
 
                /* The second octet has a max value of 0x10, corresponding to the
479
 
                   maximum Unicode code point value of 0x10FFFF.  Accumulate the
480
 
                   maximum of all u32hl values observed. */ 
481
 
                u32hl_accum = simd_max_8(u32hl_accum, x32hl[pk]);
482
 
                /* The third octet cannot be in the range D8-DF if the second octet
483
 
                   is 0.  This corresponds to the D800-DFFF range of illegal codepoints
484
 
                   reserved for UTF-16 surrogate pairs. Accumulate the results. 
485
 
                   To check, 0x20 is added to each such octet, mapping the D8-DF
486
 
                   range to F8-FF and wrapping E0-FF values around.  The max value
487
 
                   is then accumulated.  */
488
 
#ifdef TEMPLATED_SIMD_LIB
489
 
                u32_BMP_select = simd<8>::eq(x32hl[pk], simd<8>::constant<0>());
490
 
                u32_surrogate_accum = simd_max_8(u32_surrogate_accum, 
491
 
                                                                 simd_and(u32_BMP_select, simd<8>::add(x32lh[pk], simd<8>::constant<0x20>())));
492
 
                /* The low two octets cannot have the value FFFE or FFFF if
493
 
                   we're in the BMP (second octet is 0). */
494
 
                u32l_FFFE_FFFF = simd<8>::eq(simd_and(x32lh[pk],
495
 
                                                    simd_or(x32ll[pk], simd<8>::constant<1>())),simd<8>::constant<0xFF>());
496
 
                u32_FFFE_FFFF_accum = simd_or(u32_FFFE_FFFF_accum,
497
 
                                              simd_and(u32_BMP_select, u32l_FFFE_FFFF));
498
 
#endif
499
 
#ifndef TEMPLATED_SIMD_LIB
500
 
                u32_BMP_select = simd_eq_8(x32hl[pk], simd_const_8(0));
501
 
                u32_surrogate_accum = simd_max_8(u32_surrogate_accum, 
502
 
                                                                 simd_and(u32_BMP_select, simd<8>::add(x32lh[pk], simd_const_8(0x20))));
503
 
                /* The low two octets cannot have the value FFFE or FFFF if
504
 
                   we're in the BMP (second octet is 0). */
505
 
                u32l_FFFE_FFFF = simd_eq_8(simd_and(x32lh[pk],
506
 
                                                    simd_or(x32ll[pk], simd_const_8(1))),simd_const_8(0xFF));
507
 
                u32_FFFE_FFFF_accum = simd_or(u32_FFFE_FFFF_accum,
508
 
                                              simd_and(u32_BMP_select, u32l_FFFE_FFFF));
509
 
#endif
510
 
        }
511
 
#ifdef TEMPLATED_SIMD_LIB
512
 
        u32hl_accum = simd_gt_8(u32hl_accum, simd<8>::constant<0x10>());
513
 
        u32_surrogate_accum = simd<8>::eq(simd_or(u32_surrogate_accum, simd<8>::constant<0x07>()),
514
 
                                        simd<8>::constant<0xFF>());
515
 
#endif
516
 
#ifndef TEMPLATED_SIMD_LIB
517
 
        u32hl_accum = simd_gt_8(u32hl_accum, simd_const_8(0x10));
518
 
        u32_surrogate_accum = simd_eq_8(simd_or(u32_surrogate_accum, simd_const_8(0x07)),
519
 
                                        simd_const_8(0xFF));
520
 
#endif
521
 
        if (bitblock_has_bit(simd_or(simd_or(u32hh_accum, u32hl_accum),
522
 
                                         simd_or(u32_surrogate_accum, u32_FFFE_FFFF_accum)))) {
523
 
                CharSetValidationError("UTF-32");
524
 
        }
525
 
#endif
526
 
#ifndef X32BYTEPLEX_ACCESS
527
 
        printf("UTF_32_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
528
 
#endif
529
 
};
530
 
 
531
 
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e) {
532
 
        Byteplex * b;
533
 
        if (likely(e->code_unit_size == SingleByte)) {
534
 
                if (likely(e->code_unit_base == ASCII)) 
535
 
                        b = new X8_Buffer<ASCII>();
536
 
                else b = new X8_Buffer<EBCDIC>();
537
 
        }
538
 
        else if (likely(e->code_unit_size == DoubleByte)) {
539
 
                if (likely(e->byte_order == BigEndian))
540
 
                        b = new U16BE_Buffer();
541
 
                else b = new U16LE_Buffer();
542
 
        }
543
 
        else switch (e->byte_order) {
544
 
                case BigEndian: b = new U32BE_Buffer(); break;
545
 
                case LittleEndian: b = new U32LE_Buffer(); break;
546
 
                case Unusual_2143: b = new U32_2143_Buffer(); break;
547
 
                case Unusual_3412: b = new U32_3412_Buffer(); break;
548
 
        }       
549
 
        return b;
550
 
}
551
 
 
552
 
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e, FILE * inputfile) {
553
 
        Byteplex * b = ByteplexFactory(e);
554
 
        b->infile = inputfile;
555
 
        return b;
556
 
}
557
 
        
558
 
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size) {
559
 
        Byteplex * b = ByteplexFactory(e);
560
 
        b->infile = NULL;
561
 
        memcpy(b->src_buffer, buffer_bytes, buffer_size);
562
 
//printf("buffer_bytes = %s\n", buffer_bytes);
563
 
        b->units_in_buffer = buffer_size / e->code_unit_size;
564
 
        b->packs_in_buffer = (b->units_in_buffer + PACKSIZE -1)/PACKSIZE;
565
 
        return b;
566
 
}
567
 
 
568
 
template <>
569
 
int X8_Buffer<EBCDIC>::UTF8_Length(int name_pos, int lgth){
570
 
        int u8_lgth = 0;
571
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
572
 
                u8_lgth += /*TEMPORARY - NEED TO USE A TABLE FOR LOOKUP*/ 2;
573
 
        }
574
 
        return u8_lgth;
575
 
}
576
 
 
577
 
template <>
578
 
int X8_Buffer<ASCII>::UTF8_Length(int name_pos, int lgth){
579
 
        int u8_lgth = 0;
580
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
581
 
                if (((unsigned char *)x8data)[i] < 0x80) u8_lgth += 1;
582
 
                else u8_lgth += /*TEMPORARY - NEED TO USE A TABLE FOR LOOKUP*/ 1;
583
 
        }
584
 
        return u8_lgth;
585
 
}
586
 
 
587
 
int UTF8_Buffer::UTF8_Length(int name_pos, int lgth){
588
 
        return lgth;
589
 
}
590
 
 
591
 
int U16_Buffer::UTF8_Length(int name_pos, int lgth){
592
 
        int u8_lgth = 0;
593
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
594
 
                if (((unsigned char *)x8data)[i] < 0x80) u8_lgth += 1;
595
 
                else if(((unsigned char *)x16hi)[i]<=0x7 || (((unsigned char *)x16hi)[i] >= 0xD8 &&((unsigned char *)x16hi)[i]<= 0xDF))
596
 
                        u8_lgth += 2;
597
 
                else
598
 
                        u8_lgth += 3;
599
 
        }
600
 
        return u8_lgth;
601
 
}
602
 
 
603
 
int U32_Buffer::UTF8_Length(int name_pos, int lgth){
604
 
        int u8_lgth = 0;
605
 
        unsigned char * u32hl = (unsigned char *) x32hl;
606
 
        unsigned char * u32lh = (unsigned char *) x32lh;
607
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
608
 
                if (((unsigned char *)x8data)[i] < 0x80)  {
609
 
                        u8_lgth += 1;
610
 
                }
611
 
                else if(u32hl[i] > 0)
612
 
                        u8_lgth += 4;
613
 
                else if(u32lh[i]<=0x7)
614
 
                        u8_lgth += 2;
615
 
                else
616
 
                        u8_lgth += 3;
617
 
        }
618
 
        return u8_lgth;
619
 
}
620
 
 
621
 
template <>
622
 
void X8_Buffer<ASCII>::to_UTF8(int name_pos, int lgth, char * u8_ptr){
623
 
        memcpy(u8_ptr, &((char *)x8data)[name_pos], lgth);
624
 
        u8_ptr[lgth] = '\0';
625
 
//      u8_ptr = copy_name(&((char *)x8data)[name_pos], lgth);
626
 
}
627
 
 
628
 
template <>
629
 
void X8_Buffer<EBCDIC>::to_UTF8(int name_pos, int lgth, char * u8_ptr){
630
 
        
631
 
}
632
 
void U16_Buffer::to_UTF8(int name_pos, int lgth, char * u8_ptr){
633
 
        int u8_lgth = 0;
634
 
        unsigned char * u16h = (unsigned char *) x16hi;
635
 
        unsigned char * u16l = (unsigned char *) x16lo;
636
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
637
 
                if (((unsigned char *)x8data)[i] < 0x80) {
638
 
                        u8_ptr[u8_lgth] = ((unsigned char *)x8data)[i];
639
 
                        u8_lgth += 1;
640
 
                }
641
 
                else if (u16h[i]<=0x7) {
642
 
                        u8_ptr[u8_lgth] = 0xC0 + (u16h[i] << 2) + (u16l[i] >> 6);
643
 
                        u8_ptr[u8_lgth+1] = 0x80 + (u16l[i] & 0x3F);
644
 
                        u8_lgth += 2;
645
 
                }
646
 
                else if ((u16h[i] >= 0xD8) && (u16h[i]<= 0xDB)){
647
 
                        char temp =  ((u16h[i] & 0x03) << 2) + (u16l[i] >> 6) + 1;
648
 
                        u8_ptr[u8_lgth] = 0xF0 + (temp >> 2);
649
 
                        u8_ptr[u8_lgth+1] = 0x80 + ((temp & 0x03) << 4) + ((u16l[i] & 0x3F) >> 2);
650
 
                        u8_ptr[u8_lgth+2] = 0x80 + ((u16l[i] & 0x03) << 4) + ((u16h[i+1] & 0x03) << 2) + (u16l[i+1] >> 6);
651
 
                        u8_ptr[u8_lgth+3] = 0x80 + (u16l[i+1] & 0x3F);
652
 
                        i++;
653
 
                        u8_lgth += 4;
654
 
                }
655
 
                else{
656
 
                        u8_ptr[u8_lgth] = 0xE0 + (u16h[i] >> 4);
657
 
                        u8_ptr[u8_lgth+1] = 0x80 + ((u16h[i] & 0x0F) << 2) + (u16l[i] >> 6);
658
 
                        u8_ptr[u8_lgth+2] = 0x80 + (u16l[i] & 0x3F);
659
 
                        u8_lgth += 3;
660
 
                }
661
 
        }
662
 
        u8_ptr[u8_lgth] = '\0';
663
 
}
664
 
 
665
 
 
666
 
 
667
 
void U32_Buffer::to_UTF8(int name_pos, int lgth, char * u8_ptr){
668
 
        int u8_lgth = 0;
669
 
        unsigned char * u32hl = (unsigned char *) x32hl;
670
 
        unsigned char * u32lh = (unsigned char *) x32lh;
671
 
        unsigned char * u32ll = (unsigned char *) x32ll;
672
 
        for (int i = name_pos; i < name_pos+lgth; i++) {
673
 
                if (((unsigned char *)x8data)[i] < 0x80)  {
674
 
                        u8_ptr[u8_lgth] = ((unsigned char *)x8data)[i];
675
 
                        u8_lgth += 1;
676
 
                }
677
 
                else if(u32hl[i] > 0) {
678
 
                        u8_ptr[u8_lgth] = 0xF0 + (u32hl[i] >> 2);
679
 
                        u8_ptr[u8_lgth+1] = 0x80 + ((u32hl[i] & 0x03) << 4) + (u32lh[i] >> 4);
680
 
                        u8_ptr[u8_lgth+2] = 0x80 + ((u32lh[i] & 0x0F) << 2) + (u32ll[i] >> 6);
681
 
                        u8_ptr[u8_lgth+3] = 0x80 + (u32ll[i] & 0x3F);           
682
 
                        u8_lgth += 4;
683
 
                }
684
 
                else if(u32lh[i]<=0x7) {
685
 
                        u8_ptr[u8_lgth] = 0xC0 + (u32lh[i] << 2) + (u32ll[i] >> 6);
686
 
                        u8_ptr[u8_lgth+1] = 0x80 + (u32ll[i] & 0x3F);
687
 
                        u8_lgth += 2;
688
 
                }
689
 
                else {
690
 
                        u8_ptr[u8_lgth] = 0xE0 + (u32lh[i] >> 4);
691
 
                        u8_ptr[u8_lgth+1] = 0x80 + ((u32lh[i] & 0x0F) << 2) + (u32ll[i] >> 6);
692
 
                        u8_ptr[u8_lgth+2] = 0x80 + (u32ll[i] & 0x3F);
693
 
                        u8_lgth += 3;
694
 
                }
695
 
        }
696
 
        u8_ptr[u8_lgth] = '\0';
697
 
}
698
 
 
699