/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090922/src/xmldecl.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-09-23 17:13:04 UTC
  • Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd
Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*  xmldecl.c - Parsing XML and Text Declarations.
 
2
    Copyright (c) 2008, Robert D. Cameron.
 
3
    Licensed to the public under the Open Software License 3.0.
 
4
    Licensed to International Characters, Inc., under the Academic
 
5
    Free License 3.0.
 
6
 
 
7
*/
 
8
 
 
9
#include "byteplex.h"
 
10
#include "xmldecl.h"
 
11
#include "xml_error.h"
 
12
#include "multiliteral.h"
 
13
#include "bytelex.h"
 
14
 
 
15
/* A freshly constructed Entity_Info has no encoding name attached;
   only the encoding pointer is initialized here. */
Entity_Info::Entity_Info()
        : encoding(NULL) {
}

/* Frees the encoding name array, if any.  delete [] on a NULL pointer
   is a no-op, so an entity that never received a name is fine too. */
Entity_Info::~Entity_Info() {
        delete [] encoding;
}
 
21
 
 
22
/* Signature-based character set family detection in accord with
 
23
   Appendix F of the XML 1.0 and 1.1 specifications. */
 
24
 
 
25
/* These definitions use b2int16 to determine appropriate doublebyte
 
26
   values based on endianness of the underlying architecture. */
 
27
static const int x0000 = b2int16<0x00, 0x00>::value;
 
28
static const int xFEFF = b2int16<0xFE, 0xFF>::value;
 
29
static const int xFFFE = b2int16<0xFF, 0xFE>::value;
 
30
static const int x003C = b2int16<0x00, 0x3C>::value;
 
31
static const int x3C00 = b2int16<0x3C, 0x00>::value;
 
32
static const int x4C6F = b2int16<0x4C, 0x6F>::value;
 
33
static const int xA794 = b2int16<0xA7, 0x94>::value;
 
34
static const int xEFBE = b2int16<0xEF, 0xBE>::value;
 
35
 
 
36
void Entity_Info::AnalyzeSignature(unsigned char * signature) {
 
37
        uint16_t * XML_dbl_byte = (uint16_t *) signature;
 
38
        switch (XML_dbl_byte[0]) {
 
39
                case x0000:
 
40
                        switch (XML_dbl_byte[1]) {
 
41
                                case xFEFF: set_charset_family(ASCII, QuadByte, BigEndian, 1);break;
 
42
                                case xFFFE: set_charset_family(ASCII, QuadByte, Unusual_2143, 1);break;
 
43
                                case x3C00: set_charset_family(ASCII, QuadByte, Unusual_2143, 0);break;
 
44
                                default: set_charset_family(ASCII, QuadByte, BigEndian, 0);
 
45
                        }
 
46
                        break;
 
47
                case xFEFF:
 
48
                        if (XML_dbl_byte[1] == x0000)
 
49
                                set_charset_family(ASCII, QuadByte, Unusual_3412, 1);
 
50
                        else set_charset_family(ASCII, DoubleByte, BigEndian, 1);
 
51
                        break;
 
52
                case xFFFE:
 
53
                        if (XML_dbl_byte[1] == x0000)
 
54
                                set_charset_family(ASCII, QuadByte, LittleEndian, 1);
 
55
                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 1);
 
56
                        break;
 
57
                case x003C:
 
58
                        if (XML_dbl_byte[1] == x0000)
 
59
                                set_charset_family(ASCII, QuadByte, Unusual_3412, 0);
 
60
                        else set_charset_family(ASCII, DoubleByte, BigEndian, 0);
 
61
                        break;
 
62
                case x3C00:
 
63
                        if (XML_dbl_byte[1] == x0000)
 
64
                                set_charset_family(ASCII, QuadByte, LittleEndian, 0);
 
65
                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 0);
 
66
                        break;
 
67
                case x4C6F:
 
68
                        if (XML_dbl_byte[1] == xA794)
 
69
                                set_charset_family(EBCDIC, SingleByte, BigEndian, 0);
 
70
                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
 
71
                        break;
 
72
                case xEFBE:
 
73
                        if (signature[2] == 0xBF)
 
74
                                set_charset_family(ASCII, SingleByte, BigEndian, 3);
 
75
                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
 
76
                        break;
 
77
                default:
 
78
                        set_charset_family(ASCII, SingleByte, BigEndian, 0);
 
79
        }
 
80
}
 
81
void Entity_Info::set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B){
 
82
                code_unit_base = C;
 
83
                code_unit_size = S;
 
84
                byte_order = O;
 
85
                BOM_units = B;
 
86
 }
 
87
 
 
88
 
 
89
template <CodeUnit_Base C>
 
90
XML_Decl_Parser<C>::XML_Decl_Parser(Byteplex * b){
 
91
        byteplex = b;
 
92
        buffer_base_pos = 0;
 
93
        x8data = (unsigned char *) byteplex->x8data;
 
94
}
 
95
 
 
96
template <CodeUnit_Base C>
 
97
XML_Decl_Parser<C>::~XML_Decl_Parser(){
 
98
}
 
99
 
 
100
template <CodeUnit_Base C>
 
101
inline void XML_Decl_Parser<C>::DeclError() {
 
102
        DeclarationError(AbsPos());
 
103
}
 
104
 
 
105
template <CodeUnit_Base C>
 
106
inline int XML_Decl_Parser<C>::AbsPos() const {
 
107
        return  buffer_base_pos + buffer_rel_pos;
 
108
}
 
109
 
 
110
template <CodeUnit_Base C>
 
111
inline unsigned char * XML_Decl_Parser<C>::cur() const {
 
112
        return &x8data[buffer_rel_pos];
 
113
}
 
114
 
 
115
template <CodeUnit_Base C>
 
116
inline void XML_Decl_Parser<C>::Advance(int n) {
 
117
        buffer_rel_pos += n;
 
118
        if (buffer_rel_pos >= BYTEPLEX_SIZE) {
 
119
                 byteplex->AdvanceInputBuffer(BYTEPLEX_SIZE);
 
120
        }
 
121
}
 
122
 
 
123
template <CodeUnit_Base C>
 
124
inline void XML_Decl_Parser<C>::Scan_WS() {
 
125
        while (at_WhiteSpace_10<C>(cur())) Advance(1);
 
126
}
 
127
 
 
128
template <CodeUnit_Base C>
 
129
inline void XML_Decl_Parser<C>::ScanToQuote() {
 
130
        int quote_start_pos = buffer_rel_pos;   
 
131
        while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
 
132
        if (buffer_rel_pos >= BYTEPLEX_SIZE) {
 
133
                byteplex->AdvanceInputBuffer(quote_start_pos);
 
134
                buffer_rel_pos -= quote_start_pos;
 
135
                buffer_base_pos += quote_start_pos;
 
136
                while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
 
137
                if (buffer_rel_pos >= BYTEPLEX_SIZE) {
 
138
                        ImplementationLimitError("Encoding name exceeds BYTEPLEX_SIZE");
 
139
                }
 
140
        }
 
141
}
 
142
 
 
143
template <CodeUnit_Base C>
 
144
inline void XML_Decl_Parser<C>::ParseVersion(Entity_Info & e) {
 
145
        /* Skip "version" */
 
146
        Advance(7);
 
147
        Scan_WS();
 
148
        if (!AtChar<C,'='>(cur())) DeclError();
 
149
        Advance(1);
 
150
        Scan_WS();
 
151
        if (at_1_0<C>(cur())) e.version = XML_1_0;
 
152
        else if (at_1_1<C>(cur())) e.version = XML_1_1;
 
153
        else DeclError();
 
154
        Advance(5);
 
155
}
 
156
 
 
157
template <CodeUnit_Base C>
 
158
inline void XML_Decl_Parser<C>::ParseEncoding(Entity_Info & e) {
 
159
        /* Skip "encoding" */
 
160
        Advance(8);
 
161
        e.has_encoding_decl = true;
 
162
        Scan_WS();
 
163
        if (!AtChar<C,'='>(cur())) DeclError();
 
164
        Advance(1);
 
165
        Scan_WS();
 
166
        if (AtQuote<C>(cur())) {
 
167
                unsigned char quoteCh = cur()[0];
 
168
                Advance(1);
 
169
                int start_pos = AbsPos();
 
170
                ScanToQuote();
 
171
                if (cur()[0] != quoteCh) DeclError();
 
172
                int lgth = AbsPos() - start_pos;
 
173
                e.encoding = new unsigned char[lgth + 1];
 
174
                memcpy(e.encoding, &x8data[start_pos-buffer_base_pos], lgth);
 
175
                e.encoding[lgth] = '\0';
 
176
        }
 
177
        else DeclError();
 
178
        Advance(1);
 
179
}
 
180
 
 
181
template <CodeUnit_Base C>
 
182
inline void XML_Decl_Parser<C>::ParseStandalone(Entity_Info & e) {
 
183
        /* Skip "standalone" */
 
184
        Advance(10);
 
185
        Scan_WS();
 
186
        if (!AtChar<C,'='>(cur())) DeclError();
 
187
        Advance(1);
 
188
        Scan_WS();
 
189
        if (at_yes<C>(cur())) {Advance(5); e.standalone = Standalone_yes;}
 
190
        else if (at_no<C>(cur())) {Advance(4); e.standalone = Standalone_no;}
 
191
        else DeclError();
 
192
}
 
193
 
 
194
template <CodeUnit_Base C>
 
195
void XML_Decl_Parser<C>::ReadXMLInfo(Entity_Info & e) {
 
196
        e.version = no_XML_version_value;
 
197
        e.has_encoding_decl = false;
 
198
        e.standalone = Standalone_no_value;
 
199
        buffer_rel_pos = e.BOM_units;
 
200
        // It is possible that there is no XML declaration.
 
201
        if (!at_XmlDecl_start<C>(cur())) {
 
202
                e.content_start = AbsPos();
 
203
                return;
 
204
        }
 
205
        // Otherwise, the XML declaration exists and must have
 
206
        // at least version information.
 
207
        Advance(6);
 
208
        Scan_WS();
 
209
        if (!at_version<C>(cur())) DeclError();
 
210
        ParseVersion(e);
 
211
        if (at_PI_End<C>(cur())) {
 
212
                e.content_start = AbsPos()+2;
 
213
                return;
 
214
        }
 
215
        if (!at_WhiteSpace_10<C>(cur())) DeclError();
 
216
        Scan_WS();
 
217
        if (at_encoding<C>(cur())) {
 
218
                ParseEncoding(e);
 
219
                if (at_PI_End<C>(cur())) {
 
220
                        e.content_start = AbsPos()+2;
 
221
                        return;
 
222
                }
 
223
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
 
224
                Scan_WS();
 
225
        }
 
226
        if (at_standalone<C>(cur())) {
 
227
                ParseStandalone(e);
 
228
                Scan_WS();
 
229
        }
 
230
        if (!at_PI_End<C>(cur())) DeclError();
 
231
        e.content_start = AbsPos()+2;
 
232
}
 
233
 
 
234
// Similar to reading the XML_declaration of the document entity,
 
235
// ReadTextDeclaration reads the text declaration of an external
 
236
// parsed entity.
 
237
template <CodeUnit_Base C>
 
238
void XML_Decl_Parser<C>::ReadTextDeclaration(Entity_Info & e) {
 
239
        e.version = no_XML_version_value;
 
240
        e.has_encoding_decl = false;
 
241
        e.standalone = Standalone_no_value;
 
242
        buffer_rel_pos = e.BOM_units;
 
243
        // It is possible that there is no text declaration.
 
244
        if (!at_XmlDecl_start<C>(cur())) {
 
245
                e.content_start = AbsPos();
 
246
                return;
 
247
        }
 
248
        // Otherwise, the text declaration exists and may have
 
249
        // version information.
 
250
        Advance(6);
 
251
        Scan_WS();
 
252
        if (at_version<C>(cur())) {
 
253
                ParseVersion(e);
 
254
                // Must have whitespace character before encoding declaration.
 
255
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
 
256
                Scan_WS();
 
257
        }
 
258
        if (!at_encoding<C>(cur())) DeclError();
 
259
        ParseEncoding(e);
 
260
        Scan_WS();
 
261
        if (!at_PI_End<C>(cur())) DeclError();
 
262
        e.content_start = AbsPos()+2;
 
263
}
 
264
 
 
265
template <CodeUnit_Base C>
 
266
void XML_Decl_Parser<C>::ReadXMLorTextDecl(Entity_Info & e) {
 
267
        e.version = no_XML_version_value;
 
268
        e.has_encoding_decl = false;
 
269
        e.standalone = Standalone_no_value;
 
270
        buffer_rel_pos = e.BOM_units;
 
271
        // It is possible that there is no XML or text declaration.
 
272
        if (!at_XmlDecl_start<C>(cur())) {
 
273
                e.content_start = AbsPos();
 
274
                return;
 
275
        }
 
276
        // Otherwise, the XML or text declaration exists and may have
 
277
        // version information.
 
278
        Advance(6);
 
279
        Scan_WS();
 
280
        if (at_version<C>(cur())) {
 
281
                ParseVersion(e);
 
282
                if (at_PI_End<C>(cur())) {
 
283
                        e.content_start = AbsPos()+2;
 
284
                        return;
 
285
                }
 
286
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
 
287
                Scan_WS();
 
288
                if (at_encoding<C>(cur())) {
 
289
                        ParseEncoding(e);
 
290
                        if (at_PI_End<C>(cur())) {
 
291
                                e.content_start = AbsPos()+2;
 
292
                                return;
 
293
                        }
 
294
                        if (!at_WhiteSpace_10<C>(cur())) DeclError();
 
295
                        Scan_WS();
 
296
                }
 
297
                if (at_standalone<C>(cur())) {
 
298
                        ParseStandalone(e);
 
299
                        Scan_WS();
 
300
                }
 
301
        }
 
302
        else {  // Without version, we can only have a text declaration,
 
303
                // in which case an encoding spec is required.
 
304
                if (!at_encoding<C>(cur())) DeclError();
 
305
                ParseEncoding();
 
306
                Scan_WS();
 
307
                // No standalone spec is allowed in a text declaration.
 
308
        }
 
309
        if (!at_PI_End<C>(cur())) DeclError();  
 
310
        e.content_start = AbsPos()+2;
 
311
}