/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
/*  byteplex.h - Parallel byte stream module.
    Copyright (c) 2008,  Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

    This module has as its goal the buffering of XML byte data and
    transformation of 16-bit and 32-bit code unit data so that the
    parsing engine is provided a uniform representation based on
    the concept of an 8-bit pseudo-ASCII representation (x8data).

    A Byteplex object provides buffers for one to six parallel data
    streams for an XML input entity, depending on the size of
    character code units.
       1.  In the case of 8-bit code units, a single byte stream
           consisting of unmodified input data is maintained.
           x8data = src_buffer
       2.  In the case of 16-bit code units (UTF-16 and UCS-2 families),
             (a) the original code unit stream is maintained unmodified,
             (b) the x16hi byte stream is established for the high byte
                 of each code unit,
             (c) the x16lo byte stream is established for the low byte
                 of each code unit, and
             (d) x8data is established as the pseudo-ASCII byte stream,
                 with ASCII code units having their proper 8-bit values,
                 and all others having bit 0 set to 1.
       3.  In the case of 32-bit code units (UTF-32 family),
             (a) the original code unit stream is maintained unmodified,
             (b) the x32hh byte stream has high bytes of each code unit
             (c) the x32hl byte stream has second bytes of each code unit
             (d) the x32lh byte stream has third bytes of each code unit
             (e) the x32ll byte stream has low bytes of each code unit, and
             (f) x8data is established as the pseudo-ASCII byte stream,
                 with ASCII code units having their proper 8-bit values,
                 and all others having bit 0 set to 1.

    The pseudo-ASCII representation is defined for both ASCII-based
    and EBCDIC-based character sets such that all characters in
    the ASCII repertoire (i.e., having Unicode code points from 0x00
    to 0x7F), are represented as themselves and no non-ASCII character
    is represented as a character in the ASCII repertoire.

*/

#ifndef BYTEPLEX_H
#define BYTEPLEX_H

#include "xmldecl.h"
#include "../lib/lib_simd.h"

/* The BytePack and the BitBlock are the two fundamental
   types used by the parabix program for data held in 
   SIMD registers, representing, respectively, the byte-oriented
   and bit-oriented views of character data.  Both are aliases of
   the same underlying SIMD_type; the two names only document which
   view of the register a given variable is meant to hold. */

typedef SIMD_type BytePack;
typedef SIMD_type BitBlock;
/* Number of bytes held by one SIMD register (one BytePack). */
const int PACKSIZE = sizeof(SIMD_type);
/* Number of bits held by one SIMD register (one BitBlock). */
const int BLOCKSIZE = sizeof(SIMD_type) * 8;

/* Define the size of buffer used for lexical analysis/parsing.
   BUFFER_BLOCKS is the number of BitBlocks per buffer and BUFFER_SIZE
   is the resulting number of positions (BUFFER_BLOCKS * BLOCKSIZE).
   NOTE(review): the rationale for the specific value 781 is not
   visible in this header -- confirm before changing it. */
const int BUFFER_BLOCKS = 781;
const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;

/* When working near the end of a buffer, a bytespace test may involve
   a multibyte literal.  The bytespace buffer must always make available
   a number of lookahead bytes at least equal to the maximum length of any
   such literal. */

/* Number of extra lookahead positions kept past the end of the buffer. */
const int LOOKAHEAD_POSITIONS = 16;
/* Total positions per byteplex buffer: the parsing buffer plus lookahead. */
const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;

/*  Byteplex is the abstract base class of all input buffer classes.
    It declares the operations that every code-unit-size-specific
    buffer must implement and holds the two buffers that are common to
    all of them: the unmodified source code units (src_buffer) and the
    pseudo-ASCII stream (x8data).

    Instances are obtained through the ByteplexFactory overloads, which
    return a pointer to the base class.  Because concrete buffers are
    therefore handled (and destroyed) through Byteplex *, the destructor
    is virtual: deleting a derived object through a base pointer whose
    destructor is not virtual is undefined behaviour and would skip the
    derived class destructors. */
class Byteplex {
public:
	virtual ~Byteplex();

	/* Factory functions returning a buffer object for the entity
	   described by e.  The overloads additionally take either an
	   input file or an in-memory byte buffer with its size.
	   NOTE(review): how the concrete subclass is chosen from e, and
	   who owns inputfile / buffer_bytes afterwards, is defined in the
	   implementation file -- confirm there. */
	static Byteplex * ByteplexFactory(Entity_Info * e);
	static Byteplex * ByteplexFactory(Entity_Info * e, FILE * inputfile);
	static Byteplex * ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size);

	/* Operations every concrete buffer class must provide. */
	virtual void DoByteplex() = 0;
	virtual void PreparePseudoASCII_Stream() = 0;
	virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
	virtual void AdvanceInputBuffer(int advance_amt) = 0;
	/* NOTE(review): name_pos/lgth presumably identify a run of code
	   units in the buffer, and u8_ptr the destination for its UTF-8
	   form -- confirm against the implementations. */
	virtual int UTF8_Length(int name_pos, int lgth) = 0;
	virtual void to_UTF8(int name_pos, int lgth, char * u8_ptr) = 0;

	/* Source code unit buffer. */
	BytePack * src_buffer;
	int units_in_buffer;

	/* Pseudo-ASCII stream. */
	BytePack * x8data;

protected:
	/* Input file handle and pack count shared with subclasses. */
	FILE * infile;
	int packs_in_buffer;

	/* Helpers for subclasses; their behaviour is defined in the
	   implementation file. */
	int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
	void Set_limits(int units_in_buffer);

};


/*  The X8_Buffer template class is used for either ASCII- or EBCDIC-
    based 8-bit code units.
    The X8_Buffer<ASCII> class includes 7-bit ASCII 
    (with high-order bit 0), the ISO-8859 character sets and UTF-8.

    The family of 8-bit EBCDIC-based character sets is processed using
    the X8_Buffer<EBCDIC> class.

    The template parameter C is republished as the static constant
    Base; Size is fixed to SingleByte.
    NOTE(review): a separate UTF8_Buffer class is also declared in this
    header -- confirm which of the two the factory selects for UTF-8.
*/

template <CodeUnit_Base C>
class X8_Buffer : public Byteplex {
public:
	/* Compile-time description of the code units handled by this class. */
	static const CodeUnit_Base Base = C;
	static const CodeUnit_Size Size = SingleByte;
	X8_Buffer();
	~X8_Buffer();
	
	/* Implementations of the pure virtual operations declared by Byteplex. */
	void DoByteplex();
	void PreparePseudoASCII_Stream();
	void AdvanceInputBuffer(int advance_amt);
	void InitializeBuffer(unsigned char * src, int lgth);
	int UTF8_Length(int name_pos, int lgth);
	void to_UTF8(int name_pos, int lgth, char * u8_ptr);
};

/*  Buffer class for UTF-8 input: an ASCII-based encoding with
    single-byte code units, as recorded by the Base and Size constants.
    It implements every pure virtual operation declared by Byteplex. */
class UTF8_Buffer : public Byteplex {
public:
	/* Compile-time description of the code units handled by this class. */
	static const CodeUnit_Base Base = ASCII;
	static const CodeUnit_Size Size = SingleByte;
	UTF8_Buffer();
	~UTF8_Buffer();
	
	/* Implementations of the pure virtual operations declared by Byteplex. */
	void DoByteplex();
	void PreparePseudoASCII_Stream();
	void AdvanceInputBuffer(int advance_amt);
	void InitializeBuffer(unsigned char * src, int lgth);
	int UTF8_Length(int name_pos, int lgth);
	void to_UTF8(int name_pos, int lgth, char * u8_ptr);
};


/*  UTF-16 and UCS-2 character set families in BE and LE byte orders. 
    The U16LE and U16BE subclasses each provide a distinct byteplexer to 
    produce 2 parallel byte streams for the high and low bytes of each
    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII 
    conversion routine can be applied at the U16_Buffer level.

    U16_Buffer itself stays abstract: DoByteplex() is left pure virtual
    for the byte-order-specific subclasses, while the remaining Byteplex
    operations are implemented here. */

class U16_Buffer : public Byteplex {
public:
	/* Compile-time description of the code units handled by this class. */
	static const CodeUnit_Base Base = ASCII;
	static const CodeUnit_Size Size = DoubleByte;
	U16_Buffer();
	~U16_Buffer();
	/* Byte-order specific; supplied by U16LE_Buffer / U16BE_Buffer. */
	virtual void DoByteplex() = 0;
	void PreparePseudoASCII_Stream();
	void AdvanceInputBuffer(int advance_amt);
	/* NOTE(review): validation routines for the two 16-bit families;
	   how errors are reported is defined in the implementation file. */
	void Validate_UTF16();
	void Validate_UCS2();
	void InitializeBuffer(unsigned char * src, int lgth);
	int UTF8_Length(int name_pos, int lgth);
	void to_UTF8(int name_pos, int lgth, char * u8_ptr);
protected:
	/* Parallel byte streams: high byte and low byte of each code unit. */
	BytePack * x16hi;
	BytePack * x16lo;
};

/*  16-bit buffer for little-endian input; supplies the byteplexer
    that U16_Buffer leaves pure virtual. */
class U16LE_Buffer : public U16_Buffer {
public:
	U16LE_Buffer();
	void DoByteplex();
};

/*  16-bit buffer for big-endian input; supplies the byteplexer
    that U16_Buffer leaves pure virtual. */
class U16BE_Buffer : public U16_Buffer {
public:
	U16BE_Buffer();
	void DoByteplex();
};


/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders. 
    Each subclass of U32_Buffer provides a distinct byteplexer to 
    produce the 4 parallel byte streams of Unicode data.  Once
    byteplexing is complete, a generic pseudoASCII routine can
    be applied.

    U32_Buffer itself stays abstract: DoByteplex() is left pure virtual
    for the byte-order-specific subclasses, while the remaining Byteplex
    operations are implemented here. */
class U32_Buffer : public Byteplex {
public:
	/* Compile-time description of the code units handled by this class. */
	static const CodeUnit_Base Base = ASCII;
	static const CodeUnit_Size Size = QuadByte;
	U32_Buffer();
	~U32_Buffer();
	/* Byte-order specific; supplied by the U32*_Buffer subclasses. */
	virtual void DoByteplex() = 0;
	void PreparePseudoASCII_Stream();
	void AdvanceInputBuffer(int advance_amt);
	/* NOTE(review): validation routine for UTF-32 data; how errors are
	   reported is defined in the implementation file. */
	void Validate_UTF32();
	void InitializeBuffer(unsigned char * src, int lgth);
	int UTF8_Length(int name_pos, int lgth);
	void to_UTF8(int name_pos, int lgth, char * u8_ptr);
protected:
	/* Parallel byte streams, from the high byte (x32hh) down to the
	   low byte (x32ll) of each 32-bit code unit. */
	BytePack * x32hh;
	BytePack * x32hl;
	BytePack * x32lh;
	BytePack * x32ll;
};

/*  32-bit buffer for little-endian input; supplies the byteplexer
    that U32_Buffer leaves pure virtual. */
class U32LE_Buffer : public U32_Buffer {
public:
	U32LE_Buffer();
	void DoByteplex();
};

/*  32-bit buffer for big-endian input; supplies the byteplexer
    that U32_Buffer leaves pure virtual. */
class U32BE_Buffer : public U32_Buffer {
public:
	U32BE_Buffer();
	void DoByteplex();
};

/*  32-bit buffer for input in the unusual 2143 byte order; supplies
    the byteplexer that U32_Buffer leaves pure virtual. */
class U32_2143_Buffer : public U32_Buffer {
public:
	U32_2143_Buffer();
	void DoByteplex();
};

/*  32-bit buffer for input in the unusual 3412 byte order; supplies
    the byteplexer that U32_Buffer leaves pure virtual. */
class U32_3412_Buffer : public U32_Buffer {
public:
	U32_3412_Buffer();
	void DoByteplex();
};


/*  Return a freshly allocated, NUL-terminated copy of the lgth bytes
    starting at s.

    s     - start of the bytes to copy; it is only read, so it is
            const-qualified, which lets const buffers and string
            literals be passed without a cast.  Existing callers that
            pass a plain char * are unaffected.
    lgth  - number of bytes to copy; must be non-negative.

    The result is allocated with new[]; the caller owns it and must
    release it with delete []. */
inline char * copy_name (const char * s, int lgth){
	char * d = new char[lgth+1];	/* one extra byte for the terminator */
	memcpy(d, s, lgth);
	d[lgth] = '\0';
	return d;
}

#endif