1
/* byteplex.c - Parallel byte stream module.
2
Copyright (c) 2008, Robert D. Cameron.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
12
#include <sys/types.h>
15
#include "xml_error.h"
16
#include "multiliteral.h"
21
/* Space for sentinels in bytescans of the pseudo-ASCII stream. */
22
const int SENTINEL_PACKS = 1;
24
/* Destructor: close the attached input file, if there is one. */
Byteplex::~Byteplex() {
	if (infile == NULL) return;   /* nothing was attached */
	fclose(infile);
}
29
template <CodeUnit_Base C>
30
X8_Buffer<C>::X8_Buffer() : Byteplex() {
31
/* For 8-bit code units, the input buffer is also used directly
32
as the pseudo-ASCII buffer; make sure that there is room for
34
src_buffer = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);
36
// Set the sentinel for ScanToQuote,ScanWS in reading XML/text decls.
37
((unsigned char *) x8data)[BYTEPLEX_SIZE] = Ord<C, '"'>::value;
41
template <CodeUnit_Base C>
42
X8_Buffer<C>::~X8_Buffer() {
43
simd_delete((SIMD_type *) src_buffer);
46
/* Constructor: allocate the raw source buffer (two bytes per code unit),
   the high/low byte streams, and the pseudo-ASCII stream, which carries
   SENTINEL_PACKS extra packs of room for the scan sentinel. */
U16_Buffer::U16_Buffer()
	: Byteplex() {
	src_buffer = simd_new((BYTEPLEX_SIZE/PACKSIZE)*2);
	x16hi = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	x16lo = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	x8data = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);

	/* Sentinel that stops ScanToQuote/ScanWS when reading XML/text
	   declarations: a quote just past the last data byte. */
	unsigned char * ascii_bytes = (unsigned char *) x8data;
	ascii_bytes[BYTEPLEX_SIZE] = '"';
}
57
/* Destructor: free every stream the constructor allocated. */
U16_Buffer::~U16_Buffer() {
	/* byte-plane streams */
	simd_delete((SIMD_type *) x16hi);
	simd_delete((SIMD_type *) x16lo);
	/* pseudo-ASCII stream */
	simd_delete((SIMD_type *) x8data);
	/* raw input */
	simd_delete((SIMD_type *) src_buffer);
}
64
U16LE_Buffer::U16LE_Buffer()
68
U16BE_Buffer::U16BE_Buffer()
72
/* Constructor: allocate the raw source buffer (four bytes per code unit),
   the four byte-plane streams, and the pseudo-ASCII stream, which carries
   SENTINEL_PACKS extra packs of room for the scan sentinel. */
U32_Buffer::U32_Buffer()
	: Byteplex() {
	src_buffer = simd_new((BYTEPLEX_SIZE/PACKSIZE)*4);
	x32hh = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	x32hl = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	x32lh = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	x32ll = simd_new(BYTEPLEX_SIZE/PACKSIZE);
	/* Allocate x8data exactly once: assigning a second simd_new() result
	   to it would orphan the first block, which the destructor could
	   then never free. */
	x8data = simd_new(BYTEPLEX_SIZE/PACKSIZE+SENTINEL_PACKS);
	// Set the sentinel for ScanToQuote,ScanWS in reading XML/text decls.
	((unsigned char *) x8data)[BYTEPLEX_SIZE] = '"';
}
86
/* Destructor: free every stream the constructor allocated. */
U32_Buffer::~U32_Buffer() {
	/* byte-plane streams, most significant first */
	simd_delete((SIMD_type *) x32hh);
	simd_delete((SIMD_type *) x32hl);
	simd_delete((SIMD_type *) x32lh);
	simd_delete((SIMD_type *) x32ll);
	/* pseudo-ASCII stream */
	simd_delete((SIMD_type *) x8data);
	/* raw input */
	simd_delete((SIMD_type *) src_buffer);
}
95
U32LE_Buffer::U32LE_Buffer()
99
U32BE_Buffer::U32BE_Buffer()
103
U32_2143_Buffer::U32_2143_Buffer()
107
U32_3412_Buffer::U32_3412_Buffer()
114
No byteplexing is required for 8-bit code units; byteplex methods are no-ops.
118
template <CodeUnit_Base C>
119
void X8_Buffer<C>::DoByteplex() {
124
void DoDuplex(BytePack * src_data, int packs_in_buffer,
125
BytePack * p0, BytePack * p1) {
127
for (int pk = 0; pk < packs_in_buffer; pk++) {
128
BytePack s0 = src_data[2*pk];
129
BytePack s1 = src_data[2*pk+1];
130
#if (BYTE_ORDER == LITTLE_ENDIAN)
131
#ifdef TEMPLATED_SIMD_LIB
132
p0[pk] = simd<16>::pack<l,l>(s1, s0);
133
p1[pk] = simd<16>::pack<h,h>(s1, s0);
135
#ifndef TEMPLATED_SIMD_LIB
136
p0[pk] = simd_pack_16_ll(s1, s0);
137
p1[pk] = simd_pack_16_hh(s1, s0);
140
#if (BYTE_ORDER == BIG_ENDIAN)
141
#ifdef TEMPLATED_SIMD_LIB
142
p0[pk] = simd<16>::pack<l,l>(s0, s1);
143
p1[pk] = simd<16>::pack<h,h>(s0, s1);
145
#ifndef TEMPLATED_SIMD_LIB
146
p0[pk] = simd_pack_16_ll(s0, s1);
147
p1[pk] = simd_pack_16_hh(s0, s1);
153
void U16LE_Buffer::DoByteplex() {
154
DoDuplex(src_buffer, packs_in_buffer, x16lo, x16hi);
157
void U16BE_Buffer::DoByteplex() {
158
DoDuplex(src_buffer, packs_in_buffer, x16hi, x16lo);
161
void DoQuadplex(BytePack * src_data, int packs_in_buffer,
162
BytePack * p0, BytePack * p1, BytePack * p2, BytePack * p3) {
164
for (int pk = 0; pk < packs_in_buffer; pk++) {
165
BytePack s0 = src_data[4*pk];
166
BytePack s1 = src_data[4*pk+1];
167
BytePack s2 = src_data[4*pk+2];
168
BytePack s3 = src_data[4*pk+3];
169
#if (BYTE_ORDER == LITTLE_ENDIAN)
170
#ifdef TEMPLATED_SIMD_LIB
171
BytePack p02_0 = simd<16>::pack<l,l>(s1, s0);
172
BytePack p13_0 = simd<16>::pack<h,h>(s1, s0);
173
BytePack p02_1 = simd<16>::pack<l,l>(s3, s2);
174
BytePack p13_1 = simd<16>::pack<h,h>(s3, s2);
175
p0[pk] = simd<16>::pack<l,l>(p02_1, p02_0);
176
p1[pk] = simd<16>::pack<l,l>(p13_1, p13_0);
177
p2[pk] = simd<16>::pack<h,h>(p02_1, p02_0);
178
p3[pk] = simd<16>::pack<h,h>(p13_1, p13_0);
180
#ifndef TEMPLATED_SIMD_LIB
181
BytePack p02_0 = simd_pack_16_ll(s1, s0);
182
BytePack p13_0 = simd_pack_16_hh(s1, s0);
183
BytePack p02_1 = simd_pack_16_ll(s3, s2);
184
BytePack p13_1 = simd_pack_16_hh(s3, s2);
185
p0[pk] = simd_pack_16_ll(p02_1, p02_0);
186
p1[pk] = simd_pack_16_ll(p13_1, p13_0);
187
p2[pk] = simd_pack_16_hh(p02_1, p02_0);
188
p3[pk] = simd_pack_16_hh(p13_1, p13_0);
191
#if (BYTE_ORDER == BIG_ENDIAN)
192
#ifdef TEMPLATED_SIMD_LIB
193
BytePack p02_0 = simd<16>::pack<h,h>(s0, s1);
194
BytePack p13_0 = simd<16>::pack<l,l>(s0, s1);
195
BytePack p02_1 = simd<16>::pack<h,h>(s2, s3);
196
BytePack p13_1 = simd<16>::pack<l,l>(s2, s3);
197
p0[pk] = simd<16>::pack<h,h>(p02_0, p02_1);
198
p1[pk] = simd<16>::pack<h,h>(p13_0, p13_1);
199
p2[pk] = simd<16>::pack<l,l>(p02_0, p02_1);
200
p3[pk] = simd<16>::pack<l,l>(p13_0, p13_1);
202
#ifndef TEMPLATED_SIMD_LIB
203
BytePack p02_0 = simd_pack_16_hh(s0, s1);
204
BytePack p13_0 = simd_pack_16_ll(s0, s1);
205
BytePack p02_1 = simd_pack_16_hh(s2, s3);
206
BytePack p13_1 = simd_pack_16_ll(s2, s3);
207
p0[pk] = simd_pack_16_hh(p02_0, p02_1);
208
p1[pk] = simd_pack_16_hh(p13_0, p13_1);
209
p2[pk] = simd_pack_16_ll(p02_0, p02_1);
210
p3[pk] = simd_pack_16_ll(p13_0, p13_1);
216
void U32LE_Buffer::DoByteplex() {
217
DoQuadplex(src_buffer, packs_in_buffer, x32ll, x32lh, x32hl, x32hh);
220
void U32BE_Buffer::DoByteplex() {
221
DoQuadplex(src_buffer, packs_in_buffer, x32hh, x32hl, x32lh, x32ll);
224
void U32_2143_Buffer::DoByteplex() {
225
DoQuadplex(src_buffer, packs_in_buffer, x32hl, x32hh, x32ll, x32lh);
228
void U32_3412_Buffer::DoByteplex() {
229
DoQuadplex(src_buffer, packs_in_buffer, x32lh, x32ll, x32hh, x32hl);
233
/* Pseudo-ASCII stream methods */
235
template <CodeUnit_Base C>
236
void X8_Buffer<C>::PreparePseudoASCII_Stream() {
240
void U16_Buffer::PreparePseudoASCII_Stream() {
241
for (int pk = 0; pk < packs_in_buffer; pk++) {
242
#ifdef TEMPLATED_SIMD_LIB
243
x8data[pk] = simd_or(x16lo[pk], simd_andc(simd<8>::constant<(0x80)>(),
244
simd<8>::eq(x16hi[pk], simd<8>::constant<0>())));
246
#ifndef TEMPLATED_SIMD_LIB
247
x8data[pk] = simd_or(x16lo[pk], simd_andc(simd_const_8(0x80),
248
simd_eq_8(x16hi[pk], simd_const_8(0))));
253
void U32_Buffer::PreparePseudoASCII_Stream() {
254
for (int pk = 0; pk < packs_in_buffer; pk++) {
255
BytePack hi = simd_or(simd_or(x32hh[pk], x32hl[pk]), x32lh[pk]);
256
#ifdef TEMPLATED_SIMD_LIB
257
x8data[pk] = simd_or(x32ll[pk], simd_andc(simd<8>::constant<(0x80)>(),
258
simd<8>::eq(hi, simd<8>::constant<0>())));
260
#ifndef TEMPLATED_SIMD_LIB
261
x8data[pk] = simd_or(x32ll[pk], simd_andc(simd_const_8(0x80),
262
simd_eq_8(hi, simd_const_8(0))));
268
int Byteplex::CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read) {
269
memcpy(src_buffer, bytes_to_copy, lgth);
270
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == FILE_READING)
271
code_clocker->cc_start_interval();
273
unsigned char * end_ptr = &((unsigned char *)src_buffer)[lgth];
274
int bytes_read = fread(end_ptr, 1, bytes_to_read, infile);
275
if (bytes_read < bytes_to_read) end_ptr[bytes_read] = '\0'; /* sentinel */
276
#if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == FILE_READING)
277
code_clocker->cc_end_interval(bytes_read);
282
void Byteplex::Set_limits(int units) {
283
units_in_buffer = units;
284
packs_in_buffer = (units_in_buffer + PACKSIZE -1)/PACKSIZE;
285
//buffer_limit_pos = min(units_in_buffer, BUFFER_SIZE);
288
template <CodeUnit_Base C>
289
void X8_Buffer<C>::InitializeBuffer(unsigned char * src, int lgth){
290
int byte_advance = BYTEPLEX_SIZE - lgth;
291
int bytes_read = CopyAndFill(src, lgth, byte_advance);
292
Set_limits(bytes_read + lgth);
295
void U16_Buffer::InitializeBuffer(unsigned char * src, int lgth){
296
int byte_advance = BYTEPLEX_SIZE * 2 - lgth;
297
int bytes_read = CopyAndFill(src, lgth, byte_advance);
298
if (bytes_read % 2 != 0) {
299
IncompleteCodeUnitError();
301
Set_limits((bytes_read + lgth)/2);
304
void U32_Buffer::InitializeBuffer(unsigned char * src, int lgth){
305
int byte_advance = BYTEPLEX_SIZE * 4 - lgth;
306
int bytes_read = CopyAndFill(src, lgth, byte_advance);
307
if (bytes_read % 4 != 0) {
308
IncompleteCodeUnitError();
310
Set_limits((bytes_read + lgth)/4);
315
template <CodeUnit_Base C>
316
void X8_Buffer<C>::AdvanceInputBuffer(int advance_amt){
317
int bytes_to_keep = units_in_buffer - advance_amt;
318
int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt],
319
bytes_to_keep, advance_amt);
320
Set_limits(bytes_read + bytes_to_keep);
323
void U16_Buffer::AdvanceInputBuffer(int advance_amt){
324
int bytes_to_keep = (units_in_buffer - advance_amt)*2;
325
int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt*2],
326
bytes_to_keep, advance_amt*2);
327
if (bytes_read % 2 != 0) {
328
IncompleteCodeUnitError();
330
Set_limits((bytes_read + bytes_to_keep)/2);
333
void U32_Buffer::AdvanceInputBuffer(int advance_amt){
334
int bytes_to_keep = (units_in_buffer - advance_amt)*4;
335
int bytes_read = CopyAndFill(&((unsigned char *)src_buffer)[advance_amt*4],
336
bytes_to_keep, advance_amt*4);
337
if (bytes_read % 4 != 0) {
338
IncompleteCodeUnitError();
340
Set_limits((bytes_read + bytes_to_keep)/4);
343
void U16_Buffer::Validate_UTF16() {
344
BytePack surrogate_select;
345
BytePack hi_surrogate;
346
BytePack lo_surrogate;
347
#ifdef TEMPLATED_SIMD_LIB
348
BytePack hi_surrogate_pending = simd<8>::constant<0>();
350
#ifndef TEMPLATED_SIMD_LIB
351
BytePack hi_surrogate_pending = simd_const_8(0);
353
BytePack surrogate_scope;
354
BytePack u16_surrogate_error;
355
// BytePack u16_surrogate_accum = simd<8>::constant<0>();
356
// BytePack u16_FFFE_FFFF_accum = simd<8>::constant<0>();
357
BytePack u16_FFFE_FFFF;
358
for (int pk = 0; pk < packs_in_buffer; pk++) {
359
/* UTF-16 code units in the range D800-DBFF and DC00-DFFF are
360
reserved for the first and second elements, respectively
361
of surrogate pairs. Validation requires that these values
362
only occur in well-formed pairs. */
363
#ifdef TEMPLATED_SIMD_LIB
364
surrogate_select = simd_and(x16hi[pk], simd<8>::constant<0xDC>());
365
hi_surrogate = simd<8>::eq(surrogate_select, simd<8>::constant<0xD8>());
366
lo_surrogate = simd<8>::eq(surrogate_select, simd<8>::constant<0xDC>());
367
surrogate_scope = simd_or(hi_surrogate_pending,
368
sisd_sfli(hi_surrogate, 8));
370
u16_surrogate_error = simd_xor(surrogate_scope, lo_surrogate);
371
hi_surrogate_pending = sisd_sbli(hi_surrogate, 8 * (PACKSIZE-1));
372
/* The values FFFE and FFFF are excluded. */
373
u16_FFFE_FFFF = simd<8>::eq(simd_and(x16hi[pk],
374
simd_or(x16lo[pk], simd<8>::constant<1>())),
375
simd<8>::constant<0xFF>());
377
#ifndef TEMPLATED_SIMD_LIB
378
surrogate_select = simd_and(x16hi[pk], simd_const_8(0xDC));
379
hi_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xD8));
380
lo_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xDC));
381
surrogate_scope = simd_or(hi_surrogate_pending,
382
sisd_sfli(hi_surrogate, 8));
384
u16_surrogate_error = simd_xor(surrogate_scope, lo_surrogate);
385
hi_surrogate_pending = sisd_sbli(hi_surrogate, 8 * (PACKSIZE-1));
386
/* The values FFFE and FFFF are excluded. */
387
u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
388
simd_or(x16lo[pk], simd_const_8(1))),
391
// u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
392
u16_surrogate_error = simd_or(u16_surrogate_error, u16_FFFE_FFFF);
394
if (bitblock_has_bit(u16_surrogate_error)) {
395
CharSetValidationError("UTF-16 (relative position reported)",
396
pk * PACKSIZE + count_forward_zeroes(u16_surrogate_error)/8);
402
/* Validate the buffered data as UCS-2.
   When X16HILO_ACCESS is defined, every pack is checked for high bytes in
   the range D8-DF (the surrogate area) and for the code units FFFE/FFFF;
   results are accumulated over the whole buffer and a single
   CharSetValidationError("UCS-2") is raised if anything was found.
   Without X16HILO_ACCESS the check is not performed and a notice is
   printed instead. */
void U16_Buffer::Validate_UCS2() {
#ifdef X16HILO_ACCESS
	int packs = (buffer_units - 1)/PACKSIZE + 1;
#ifdef TEMPLATED_SIMD_LIB
	BytePack u16_surrogate_accum = simd<8>::constant<0>();
	BytePack u16_FFFE_FFFF_accum = simd<8>::constant<0>();
#else
	BytePack u16_surrogate_accum = simd_const_8(0);
	BytePack u16_FFFE_FFFF_accum = simd_const_8(0);
#endif
	BytePack u16_FFFE_FFFF;
	for (int pk = 0; pk < packs; pk++) {
		/* Adding 0x20 to a high byte maps D8-DF onto F8-FF and wraps
		   E0-FF around to small values, so tracking the running
		   maximum is enough to detect any byte in the surrogate
		   range. */
#ifdef TEMPLATED_SIMD_LIB
		u16_surrogate_accum =
			simd_max_8(u16_surrogate_accum, simd<8>::add(x16hi[pk], simd<8>::constant<0x20>()));
		/* The values FFFE and FFFF are excluded. */
		u16_FFFE_FFFF = simd<8>::eq(simd_and(x16hi[pk],
			simd_or(x16lo[pk], simd<8>::constant<1>())), simd<8>::constant<0xFF>());
		u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
#else
		u16_surrogate_accum =
			simd_max_8(u16_surrogate_accum, simd_add_8(x16hi[pk], simd_const_8(0x20)));
		/* The values FFFE and FFFF are excluded. */
		u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
			simd_or(x16lo[pk], simd_const_8(1))), simd_const_8(0xFF));
		u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
#endif
	}
	/* A maximum in F8-FF becomes FF once the low three bits are set. */
#ifdef TEMPLATED_SIMD_LIB
	u16_surrogate_accum = simd<8>::eq(simd_or(u16_surrogate_accum, simd<8>::constant<0x07>()),
					  simd<8>::constant<0xFF>());
#else
	u16_surrogate_accum = simd_eq_8(simd_or(u16_surrogate_accum, simd_const_8(0x07)),
					simd_const_8(0xFF));
#endif
	if (bitblock_has_bit(simd_or(u16_surrogate_accum, u16_FFFE_FFFF_accum)))
		CharSetValidationError("UCS-2");
#else
	printf("UCS_2_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
#endif
}
457
/* Validate the buffered data as UTF-32.
   When X32BYTEPLEX_ACCESS is defined, four conditions are accumulated
   over all packs and a single CharSetValidationError("UTF-32") is raised
   if any of them occurred:
     - a nonzero highest octet;
     - a second octet greater than 0x10 (code point above 0x10FFFF);
     - a third octet in D8-DF while the second octet is 0 (surrogate range);
     - low two octets equal to FFFE or FFFF while the second octet is 0.
   Without X32BYTEPLEX_ACCESS the check is not performed and a notice is
   printed instead.

   The non-templated branch uses simd_add_8, matching every other call in
   that branch; simd<8>::add belongs to the templated library only. */
void U32_Buffer::Validate_UTF32() {
#ifdef X32BYTEPLEX_ACCESS
	int packs = (buffer_units - 1)/PACKSIZE + 1;
#ifdef TEMPLATED_SIMD_LIB
	BytePack u32hh_accum = simd<8>::constant<0>();
	BytePack u32hl_accum = simd<8>::constant<0>();
	BytePack u32_surrogate_accum = simd<8>::constant<0>();
	BytePack u32_FFFE_FFFF_accum = simd<8>::constant<0>();
#else
	BytePack u32hh_accum = simd_const_8(0);
	BytePack u32hl_accum = simd_const_8(0);
	BytePack u32_surrogate_accum = simd_const_8(0);
	BytePack u32_FFFE_FFFF_accum = simd_const_8(0);
#endif
	BytePack u32_BMP_select;
	BytePack u32l_FFFE_FFFF;
	for (int pk = 0; pk < packs; pk++) {
		/* There can be no bits set in the high octet; "or" together
		   all octet values to check for any bit set. */
		u32hh_accum = simd_or(u32hh_accum, x32hh[pk]);
		/* The second octet has a max value of 0x10, corresponding to the
		   maximum Unicode code point value of 0x10FFFF.  Accumulate the
		   maximum of all u32hl values observed. */
		u32hl_accum = simd_max_8(u32hl_accum, x32hl[pk]);
		/* The third octet cannot be in the range D8-DF if the second octet
		   is 0.  Adding 0x20 maps D8-DF onto F8-FF and wraps E0-FF around,
		   so the running maximum reveals any such octet. */
#ifdef TEMPLATED_SIMD_LIB
		u32_BMP_select = simd<8>::eq(x32hl[pk], simd<8>::constant<0>());
		u32_surrogate_accum = simd_max_8(u32_surrogate_accum,
			simd_and(u32_BMP_select, simd<8>::add(x32lh[pk], simd<8>::constant<0x20>())));
		/* The low two octets cannot have the value FFFE or FFFF if
		   we're in the BMP (second octet is 0). */
		u32l_FFFE_FFFF = simd<8>::eq(simd_and(x32lh[pk],
			simd_or(x32ll[pk], simd<8>::constant<1>())),simd<8>::constant<0xFF>());
		u32_FFFE_FFFF_accum = simd_or(u32_FFFE_FFFF_accum,
			simd_and(u32_BMP_select, u32l_FFFE_FFFF));
#else
		u32_BMP_select = simd_eq_8(x32hl[pk], simd_const_8(0));
		u32_surrogate_accum = simd_max_8(u32_surrogate_accum,
			simd_and(u32_BMP_select, simd_add_8(x32lh[pk], simd_const_8(0x20))));
		/* The low two octets cannot have the value FFFE or FFFF if
		   we're in the BMP (second octet is 0). */
		u32l_FFFE_FFFF = simd_eq_8(simd_and(x32lh[pk],
			simd_or(x32ll[pk], simd_const_8(1))),simd_const_8(0xFF));
		u32_FFFE_FFFF_accum = simd_or(u32_FFFE_FFFF_accum,
			simd_and(u32_BMP_select, u32l_FFFE_FFFF));
#endif
	}
	/* Turn the accumulated maxima into error masks. */
#ifdef TEMPLATED_SIMD_LIB
	u32hl_accum = simd_gt_8(u32hl_accum, simd<8>::constant<0x10>());
	u32_surrogate_accum = simd<8>::eq(simd_or(u32_surrogate_accum, simd<8>::constant<0x07>()),
					  simd<8>::constant<0xFF>());
#else
	u32hl_accum = simd_gt_8(u32hl_accum, simd_const_8(0x10));
	u32_surrogate_accum = simd_eq_8(simd_or(u32_surrogate_accum, simd_const_8(0x07)),
					simd_const_8(0xFF));
#endif
	if (bitblock_has_bit(simd_or(simd_or(u32hh_accum, u32hl_accum),
				     simd_or(u32_surrogate_accum, u32_FFFE_FFFF_accum)))) {
		CharSetValidationError("UTF-32");
	}
#else
	printf("UTF_32_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
#endif
}
531
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e) {
533
if (likely(e->code_unit_size == SingleByte)) {
534
if (likely(e->code_unit_base == ASCII))
535
b = new X8_Buffer<ASCII>();
536
else b = new X8_Buffer<EBCDIC>();
538
else if (likely(e->code_unit_size == DoubleByte)) {
539
if (likely(e->byte_order == BigEndian))
540
b = new U16BE_Buffer();
541
else b = new U16LE_Buffer();
543
else switch (e->byte_order) {
544
case BigEndian: b = new U32BE_Buffer(); break;
545
case LittleEndian: b = new U32LE_Buffer(); break;
546
case Unusual_2143: b = new U32_2143_Buffer(); break;
547
case Unusual_3412: b = new U32_3412_Buffer(); break;
552
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e, FILE * inputfile) {
553
Byteplex * b = ByteplexFactory(e);
554
b->infile = inputfile;
558
Byteplex * Byteplex::ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size) {
559
Byteplex * b = ByteplexFactory(e);
561
memcpy(b->src_buffer, buffer_bytes, buffer_size);
562
//printf("buffer_bytes = %s\n", buffer_bytes);
563
b->units_in_buffer = buffer_size / e->code_unit_size;
564
b->packs_in_buffer = (b->units_in_buffer + PACKSIZE -1)/PACKSIZE;
569
int X8_Buffer<EBCDIC>::UTF8_Length(int name_pos, int lgth){
571
for (int i = name_pos; i < name_pos+lgth; i++) {
572
u8_lgth += /*TEMPORARY - NEED TO USE A TABLE FOR LOOKUP*/ 2;
578
int X8_Buffer<ASCII>::UTF8_Length(int name_pos, int lgth){
580
for (int i = name_pos; i < name_pos+lgth; i++) {
581
if (((unsigned char *)x8data)[i] < 0x80) u8_lgth += 1;
582
else u8_lgth += /*TEMPORARY - NEED TO USE A TABLE FOR LOOKUP*/ 1;
587
int UTF8_Buffer::UTF8_Length(int name_pos, int lgth){
591
int U16_Buffer::UTF8_Length(int name_pos, int lgth){
593
for (int i = name_pos; i < name_pos+lgth; i++) {
594
if (((unsigned char *)x8data)[i] < 0x80) u8_lgth += 1;
595
else if(((unsigned char *)x16hi)[i]<=0x7 || (((unsigned char *)x16hi)[i] >= 0xD8 &&((unsigned char *)x16hi)[i]<= 0xDF))
603
int U32_Buffer::UTF8_Length(int name_pos, int lgth){
605
unsigned char * u32hl = (unsigned char *) x32hl;
606
unsigned char * u32lh = (unsigned char *) x32lh;
607
for (int i = name_pos; i < name_pos+lgth; i++) {
608
if (((unsigned char *)x8data)[i] < 0x80) {
611
else if(u32hl[i] > 0)
613
else if(u32lh[i]<=0x7)
622
void X8_Buffer<ASCII>::to_UTF8(int name_pos, int lgth, char * u8_ptr){
623
memcpy(u8_ptr, &((char *)x8data)[name_pos], lgth);
625
// u8_ptr = copy_name(&((char *)x8data)[name_pos], lgth);
629
void X8_Buffer<EBCDIC>::to_UTF8(int name_pos, int lgth, char * u8_ptr){
632
void U16_Buffer::to_UTF8(int name_pos, int lgth, char * u8_ptr){
634
unsigned char * u16h = (unsigned char *) x16hi;
635
unsigned char * u16l = (unsigned char *) x16lo;
636
for (int i = name_pos; i < name_pos+lgth; i++) {
637
if (((unsigned char *)x8data)[i] < 0x80) {
638
u8_ptr[u8_lgth] = ((unsigned char *)x8data)[i];
641
else if (u16h[i]<=0x7) {
642
u8_ptr[u8_lgth] = 0xC0 + (u16h[i] << 2) + (u16l[i] >> 6);
643
u8_ptr[u8_lgth+1] = 0x80 + (u16l[i] & 0x3F);
646
else if ((u16h[i] >= 0xD8) && (u16h[i]<= 0xDB)){
647
char temp = ((u16h[i] & 0x03) << 2) + (u16l[i] >> 6) + 1;
648
u8_ptr[u8_lgth] = 0xF0 + (temp >> 2);
649
u8_ptr[u8_lgth+1] = 0x80 + ((temp & 0x03) << 4) + ((u16l[i] & 0x3F) >> 2);
650
u8_ptr[u8_lgth+2] = 0x80 + ((u16l[i] & 0x03) << 4) + ((u16h[i+1] & 0x03) << 2) + (u16l[i+1] >> 6);
651
u8_ptr[u8_lgth+3] = 0x80 + (u16l[i+1] & 0x3F);
656
u8_ptr[u8_lgth] = 0xE0 + (u16h[i] >> 4);
657
u8_ptr[u8_lgth+1] = 0x80 + ((u16h[i] & 0x0F) << 2) + (u16l[i] >> 6);
658
u8_ptr[u8_lgth+2] = 0x80 + (u16l[i] & 0x3F);
662
u8_ptr[u8_lgth] = '\0';
667
void U32_Buffer::to_UTF8(int name_pos, int lgth, char * u8_ptr){
669
unsigned char * u32hl = (unsigned char *) x32hl;
670
unsigned char * u32lh = (unsigned char *) x32lh;
671
unsigned char * u32ll = (unsigned char *) x32ll;
672
for (int i = name_pos; i < name_pos+lgth; i++) {
673
if (((unsigned char *)x8data)[i] < 0x80) {
674
u8_ptr[u8_lgth] = ((unsigned char *)x8data)[i];
677
else if(u32hl[i] > 0) {
678
u8_ptr[u8_lgth] = 0xF0 + (u32hl[i] >> 2);
679
u8_ptr[u8_lgth+1] = 0x80 + ((u32hl[i] & 0x03) << 4) + (u32lh[i] >> 4);
680
u8_ptr[u8_lgth+2] = 0x80 + ((u32lh[i] & 0x0F) << 2) + (u32ll[i] >> 6);
681
u8_ptr[u8_lgth+3] = 0x80 + (u32ll[i] & 0x3F);
684
else if(u32lh[i]<=0x7) {
685
u8_ptr[u8_lgth] = 0xC0 + (u32lh[i] << 2) + (u32ll[i] >> 6);
686
u8_ptr[u8_lgth+1] = 0x80 + (u32ll[i] & 0x3F);
690
u8_ptr[u8_lgth] = 0xE0 + (u32lh[i] >> 4);
691
u8_ptr[u8_lgth+1] = 0x80 + ((u32lh[i] & 0x0F) << 2) + (u32ll[i] >> 6);
692
u8_ptr[u8_lgth+2] = 0x80 + (u32ll[i] & 0x3F);
696
u8_ptr[u8_lgth] = '\0';