1
/* byteplex.h - Parallel byte stream module.
2
Copyright (c) 2008, Robert D. Cameron.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
7
This module has as its goal the buffering of XML byte data and
8
transformation of 16-bit and 32-bit code unit data so that the
9
parsing engine is provided a uniform representation based on
10
the concept of an 8-bit pseudo-ASCII representation (x8data).
12
A Byteplex object provides buffers for one to six parallel data
13
streams for an XML input entity, depending on the size of
the entity's code units:
15
1. In the case of 8-bit code units, a single byte stream
16
consisting of unmodified input data is maintained.
18
2. In the case of 16-bit code units (UTF-16 and UCS-2 families),
19
(a) the original code unit stream is maintained unmodified,
20
(b) the x16hi byte stream is established for the high byte
of each code unit,
22
(c) the x16lo byte stream is established for the low byte
23
of each code unit, and
24
(d) x8data is established as the pseudo-ASCII byte stream,
25
with ASCII code units having their proper 8-bit values,
26
and all others having bit 0 set to 1.
27
3. In the case of 32-bit code units (UTF-32 family),
28
(a) the original code unit stream is maintained unmodified,
29
(b) the x32hh byte stream has high bytes of each code unit
30
(c) the x32hl byte stream has second bytes of each code unit
31
(d) the x32lh byte stream has third bytes of each code unit
32
(e) the x32ll byte stream has low bytes of each code unit, and
33
(f) x8data is established as the pseudo-ASCII byte stream,
34
with ASCII code units having their proper 8-bit values,
35
and all others having bit 0 set to 1.
37
The pseudo-ASCII representation is defined for both ASCII-based
38
and EBCDIC-based character sets such that all characters in
39
the ASCII repertoire (i.e., having Unicode code points from 0x00
40
to 0x7F), are represented as themselves and no non-ASCII character
41
is represented as a character in the ASCII repertoire.
*/
49
#include "../lib/lib_simd.h"
51
/* The BytePack and the BitBlock are the two fundamental
52
types used by the parabix program for data held in
53
SIMD registers, representing, respectively, the byte-oriented
54
and bit-oriented views of character data.*/
56
/* Both names alias the same SIMD register type; they differ only in the
   view of the data (bytes versus bits) that the name communicates. */
typedef SIMD_type BytePack;
57
typedef SIMD_type BitBlock;
58
/* Size of one SIMD register in bytes. */
const int PACKSIZE = sizeof(SIMD_type);
59
/* Size of one SIMD register in bits (sizeof in bytes times 8). */
const int BLOCKSIZE = sizeof(SIMD_type) * 8;
61
/* Define the size of buffer used for lexical analysis/parsing. */
62
/* Number of blocks making up one buffer. */
const int BUFFER_BLOCKS = 781;
63
/* Buffer size: BUFFER_BLOCKS blocks of BLOCKSIZE units each.
   NOTE(review): presumably counted in code unit positions (one bit of a
   BitBlock per position) - confirm against the users of this constant. */
const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
65
/* When working near the end of a buffer, a bytespace test may involve
66
a multibyte literal. The bytespace buffer must always make available
67
a number of lookahead bytes at least equal to the maximum length of any
such literal. */
70
/* Number of lookahead positions kept available past the buffer proper. */
const int LOOKAHEAD_POSITIONS = 16;
71
/* Size of a byteplex buffer including its lookahead region. */
const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;
76
/* Factory overloads that return a Byteplex for the entity described by e:
   from the entity information alone, together with an input FILE, or
   together with a caller-supplied byte buffer and its size.
   NOTE(review): how the concrete buffer class is selected is not visible
   in this header; confirm against the implementation. */
static Byteplex * ByteplexFactory(Entity_Info * e);
77
static Byteplex * ByteplexFactory(Entity_Info * e, FILE * inputfile);
78
static Byteplex * ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size);
79
/* Pure virtual operations: every concrete buffer class must define them. */
virtual void DoByteplex() = 0;
80
virtual void PreparePseudoASCII_Stream() = 0;
81
virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
82
virtual void AdvanceInputBuffer(int advance_amt) = 0;
83
/* NOTE(review): presumably the pair below reports the UTF-8 length of, and
   writes the UTF-8 form of, lgth code units starting at name_pos; confirm
   against the implementations. */
virtual int UTF8_Length(int name_pos, int lgth)=0;
84
virtual void to_UTF8(int name_pos, int lgth, char * u8_ptr)=0;
85
/* Source code unit buffer. */
BytePack * src_buffer;
89
/* Pseudo-ASCII stream. */
95
/* Non-virtual helpers.
   NOTE(review): their behaviour is defined in the implementation file and
   cannot be determined from these declarations alone. */
int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
96
void Set_limits(int units_in_buffer);
101
/* The X8_Buffer template class is used for either ASCII- or EBCDIC-
102
based 8-bit code units.
103
The X8_Buffer<ASCII> class includes 7-bit ASCII
104
(with high-order bit 0), the ISO-8859 character sets and UTF-8.
106
The family of 8-bit EBCDIC based character sets are processed using
107
the X8_Buffer<EBCDIC> class. */
110
template <CodeUnit_Base C>
111
class X8_Buffer : public Byteplex {
113
/* Compile-time traits: the code unit base is the template argument C and
   the code unit size is a single byte. */
static const CodeUnit_Base Base = C;
114
static const CodeUnit_Size Size = SingleByte;
116
virtual ~X8_Buffer();
119
/* Member functions with the same names and signatures as the pure virtual
   declarations earlier in this header, declared here without "= 0". */
void PreparePseudoASCII_Stream();
120
void AdvanceInputBuffer(int advance_amt);
121
void InitializeBuffer(unsigned char * src, int lgth);
122
int UTF8_Length(int name_pos, int lgth);
123
void to_UTF8(int name_pos, int lgth, char * u8_ptr);
126
class UTF8_Buffer : public Byteplex {
128
/* Compile-time traits: ASCII-based, single-byte code units. */
static const CodeUnit_Base Base = ASCII;
129
static const CodeUnit_Size Size = SingleByte;
131
virtual ~UTF8_Buffer();
134
/* Member functions with the same names and signatures as the pure virtual
   declarations earlier in this header, declared here without "= 0". */
void PreparePseudoASCII_Stream();
135
void AdvanceInputBuffer(int advance_amt);
136
void InitializeBuffer(unsigned char * src, int lgth);
137
int UTF8_Length(int name_pos, int lgth);
138
void to_UTF8(int name_pos, int lgth, char * u8_ptr);
142
/* UTF-16 and UCS-2 character set families in BE and LE byte orders.
143
The U16LE and U16BE subclasses each provide a distinct byteplexer to
144
produce 2 parallel byte streams for the high and low bytes of each
145
16-bit code unit. Once byteplexing is complete, a generic pseudoASCII
146
conversion routine can be applied at the U16_Buffer level. */
148
class U16_Buffer : public Byteplex {
150
/* Compile-time traits: ASCII-based, two-byte code units. */
static const CodeUnit_Base Base = ASCII;
151
static const CodeUnit_Size Size = DoubleByte;
153
virtual ~U16_Buffer();
/* Still pure virtual at this level, so a derived class has to supply the
   byteplexer. */
154
virtual void DoByteplex() = 0;
155
void PreparePseudoASCII_Stream();
156
void AdvanceInputBuffer(int advance_amt);
/* NOTE(review): validation entry points for the two 16-bit families; what
   they check and how failures are reported is not visible here. */
157
void Validate_UTF16();
158
void Validate_UCS2();
159
void InitializeBuffer(unsigned char * src, int lgth);
160
int UTF8_Length(int name_pos, int lgth);
161
void to_UTF8(int name_pos, int lgth, char * u8_ptr);
167
class U16LE_Buffer : public U16_Buffer {
173
class U16BE_Buffer : public U16_Buffer {
180
/* UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
181
Each subclass of U32_Buffer provides a distinct byteplexer to
182
produce the 4 parallel byte streams of Unicode data. Once
183
byteplexing is complete, a generic pseudoASCII routine can
be applied at the U32_Buffer level. */
185
class U32_Buffer : public Byteplex {
187
/* Compile-time traits: ASCII-based, four-byte code units. */
static const CodeUnit_Base Base = ASCII;
188
static const CodeUnit_Size Size = QuadByte;
190
virtual ~U32_Buffer();
/* Still pure virtual at this level, so a derived class has to supply the
   byteplexer. */
191
virtual void DoByteplex() = 0;
192
void PreparePseudoASCII_Stream();
193
void AdvanceInputBuffer(int advance_amt);
/* NOTE(review): validation entry point for UTF-32 data; what it checks and
   how failures are reported is not visible here. */
194
void Validate_UTF32();
195
void InitializeBuffer(unsigned char * src, int lgth);
196
int UTF8_Length(int name_pos, int lgth);
197
void to_UTF8(int name_pos, int lgth, char * u8_ptr);
205
class U32LE_Buffer : public U32_Buffer {
211
class U32BE_Buffer : public U32_Buffer {
217
class U32_2143_Buffer : public U32_Buffer {
223
class U32_3412_Buffer : public U32_Buffer {
230
inline char * copy_name (char * s, int lgth){
231
char * d = new char[lgth+1];