/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk

« back to all changes in this revision

Viewing changes to parse/parabix.20090211/src/xmldecl.c

  • Committer: Suren A. Chilingaryan
  • Date: 2009-09-23 17:13:04 UTC
  • Revision ID: csa@dside.dyndns.org-20090923171304-osvtr4zqb29h11kd
Intel, Tango, Phobos, and RapidXML parsers; Memory benchmark scripts

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*  xmldecl.c - Parsing XML and Text Declarations.
2
 
    Copyright (c) 2008, Robert D. Cameron.
3
 
    Licensed to the public under the Open Software License 3.0.
4
 
    Licensed to International Characters, Inc., under the Academic
5
 
    Free License 3.0.
6
 
 
7
 
*/
8
 
 
9
 
#include "byteplex.h"
10
 
#include "xmldecl.h"
11
 
#include "xml_error.h"
12
 
#include "multiliteral.h"
13
 
#include "bytelex.h"
14
 
 
15
 
/* Construct an Entity_Info that does not yet hold an encoding name. */
Entity_Info::Entity_Info() {
        // Null marks "no encoding buffer allocated"; the destructor relies on it.
        this->encoding = NULL;
}
18
 
/* Release the encoding name buffer.  delete[] on a null pointer is a
   no-op, so an object that never received an encoding name is fine. */
Entity_Info::~Entity_Info() {
        delete[] this->encoding;
}
21
 
 
22
 
/* Signature-based character set family detection in accord with
23
 
   Appendix F of the XML 1.0 and 1.1 specifications. */
24
 
 
25
 
/* These definitions use b2int16 to determine appropriate doublebyte
26
 
   values based on endianness of the underlying architecture. */
27
 
static const int x0000 = b2int16<0x00, 0x00>::value;
28
 
static const int xFEFF = b2int16<0xFE, 0xFF>::value;
29
 
static const int xFFFE = b2int16<0xFF, 0xFE>::value;
30
 
static const int x003C = b2int16<0x00, 0x3C>::value;
31
 
static const int x3C00 = b2int16<0x3C, 0x00>::value;
32
 
static const int x4C6F = b2int16<0x4C, 0x6F>::value;
33
 
static const int xA794 = b2int16<0xA7, 0x94>::value;
34
 
static const int xEFBE = b2int16<0xEF, 0xBE>::value;
35
 
 
36
 
void Entity_Info::AnalyzeSignature(unsigned char * signature) {
37
 
        uint16_t * XML_dbl_byte = (uint16_t *) signature;
38
 
        switch (XML_dbl_byte[0]) {
39
 
                case x0000:
40
 
                        switch (XML_dbl_byte[1]) {
41
 
                                case xFEFF: set_charset_family(ASCII, QuadByte, BigEndian, 1);break;
42
 
                                case xFFFE: set_charset_family(ASCII, QuadByte, Unusual_2143, 1);break;
43
 
                                case x3C00: set_charset_family(ASCII, QuadByte, Unusual_2143, 0);break;
44
 
                                default: set_charset_family(ASCII, QuadByte, BigEndian, 0);
45
 
                        }
46
 
                        break;
47
 
                case xFEFF:
48
 
                        if (XML_dbl_byte[1] == x0000)
49
 
                                set_charset_family(ASCII, QuadByte, Unusual_3412, 1);
50
 
                        else set_charset_family(ASCII, DoubleByte, BigEndian, 1);
51
 
                        break;
52
 
                case xFFFE:
53
 
                        if (XML_dbl_byte[1] == x0000)
54
 
                                set_charset_family(ASCII, QuadByte, LittleEndian, 1);
55
 
                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 1);
56
 
                        break;
57
 
                case x003C:
58
 
                        if (XML_dbl_byte[1] == x0000)
59
 
                                set_charset_family(ASCII, QuadByte, Unusual_3412, 0);
60
 
                        else set_charset_family(ASCII, DoubleByte, BigEndian, 0);
61
 
                        break;
62
 
                case x3C00:
63
 
                        if (XML_dbl_byte[1] == x0000)
64
 
                                set_charset_family(ASCII, QuadByte, LittleEndian, 0);
65
 
                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 0);
66
 
                        break;
67
 
                case x4C6F:
68
 
                        if (XML_dbl_byte[1] == xA794)
69
 
                                set_charset_family(EBCDIC, SingleByte, BigEndian, 0);
70
 
                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
71
 
                        break;
72
 
                case xEFBE:
73
 
                        if (signature[2] == 0xBF)
74
 
                                set_charset_family(ASCII, SingleByte, BigEndian, 3);
75
 
                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
76
 
                        break;
77
 
                default:
78
 
                        set_charset_family(ASCII, SingleByte, BigEndian, 0);
79
 
        }
80
 
}
81
 
void Entity_Info::set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B){
82
 
                code_unit_base = C;
83
 
                code_unit_size = S;
84
 
                byte_order = O;
85
 
                BOM_units = B;
86
 
 }
87
 
 
88
 
 
89
 
template <CodeUnit_Base C>
90
 
XML_Decl_Parser<C>::XML_Decl_Parser(Byteplex * b){
91
 
        byteplex = b;
92
 
        buffer_base_pos = 0;
93
 
        x8data = (unsigned char *) byteplex->x8data;
94
 
}
95
 
 
96
 
template <CodeUnit_Base C>
97
 
XML_Decl_Parser<C>::~XML_Decl_Parser(){
98
 
}
99
 
 
100
 
template <CodeUnit_Base C>
101
 
inline void XML_Decl_Parser<C>::DeclError() {
102
 
        DeclarationError(AbsPos());
103
 
}
104
 
 
105
 
template <CodeUnit_Base C>
106
 
inline int XML_Decl_Parser<C>::AbsPos() const {
107
 
        return  buffer_base_pos + buffer_rel_pos;
108
 
}
109
 
 
110
 
template <CodeUnit_Base C>
111
 
inline unsigned char * XML_Decl_Parser<C>::cur() const {
112
 
        return &x8data[buffer_rel_pos];
113
 
}
114
 
 
115
 
template <CodeUnit_Base C>
116
 
inline void XML_Decl_Parser<C>::Advance(int n) {
117
 
        buffer_rel_pos += n;
118
 
        if (buffer_rel_pos >= BYTEPLEX_SIZE) {
119
 
                 byteplex->AdvanceInputBuffer(BYTEPLEX_SIZE);
120
 
        }
121
 
}
122
 
 
123
 
template <CodeUnit_Base C>
124
 
inline void XML_Decl_Parser<C>::Scan_WS() {
125
 
        while (at_WhiteSpace_10<C>(cur())) Advance(1);
126
 
}
127
 
 
128
 
template <CodeUnit_Base C>
129
 
inline void XML_Decl_Parser<C>::ScanToQuote() {
130
 
        int quote_start_pos = buffer_rel_pos;   
131
 
        while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
132
 
        if (buffer_rel_pos >= BYTEPLEX_SIZE) {
133
 
                byteplex->AdvanceInputBuffer(quote_start_pos);
134
 
                buffer_rel_pos -= quote_start_pos;
135
 
                buffer_base_pos += quote_start_pos;
136
 
                while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
137
 
                if (buffer_rel_pos >= BYTEPLEX_SIZE) {
138
 
                        ImplementationLimitError("Encoding name exceeds BYTEPLEX_SIZE");
139
 
                }
140
 
        }
141
 
}
142
 
 
143
 
template <CodeUnit_Base C>
144
 
inline void XML_Decl_Parser<C>::ParseVersion(Entity_Info & e) {
145
 
        /* Skip "version" */
146
 
        Advance(7);
147
 
        Scan_WS();
148
 
        if (!AtChar<C,'='>(cur())) DeclError();
149
 
        Advance(1);
150
 
        Scan_WS();
151
 
        if (at_1_0<C>(cur())) e.version = XML_1_0;
152
 
        else if (at_1_1<C>(cur())) e.version = XML_1_1;
153
 
        else DeclError();
154
 
        Advance(5);
155
 
}
156
 
 
157
 
template <CodeUnit_Base C>
158
 
inline void XML_Decl_Parser<C>::ParseEncoding(Entity_Info & e) {
159
 
        /* Skip "encoding" */
160
 
        Advance(8);
161
 
        e.has_encoding_decl = true;
162
 
        Scan_WS();
163
 
        if (!AtChar<C,'='>(cur())) DeclError();
164
 
        Advance(1);
165
 
        Scan_WS();
166
 
        if (AtQuote<C>(cur())) {
167
 
                unsigned char quoteCh = cur()[0];
168
 
                Advance(1);
169
 
                int start_pos = AbsPos();
170
 
                ScanToQuote();
171
 
                if (cur()[0] != quoteCh) DeclError();
172
 
                int lgth = AbsPos() - start_pos;
173
 
                e.encoding = new unsigned char[lgth + 1];
174
 
                memcpy(e.encoding, &x8data[start_pos-buffer_base_pos], lgth);
175
 
                e.encoding[lgth] = '\0';
176
 
        }
177
 
        else DeclError();
178
 
        Advance(1);
179
 
}
180
 
 
181
 
template <CodeUnit_Base C>
182
 
inline void XML_Decl_Parser<C>::ParseStandalone(Entity_Info & e) {
183
 
        /* Skip "standalone" */
184
 
        Advance(10);
185
 
        Scan_WS();
186
 
        if (!AtChar<C,'='>(cur())) DeclError();
187
 
        Advance(1);
188
 
        Scan_WS();
189
 
        if (at_yes<C>(cur())) {Advance(5); e.standalone = Standalone_yes;}
190
 
        else if (at_no<C>(cur())) {Advance(4); e.standalone = Standalone_no;}
191
 
        else DeclError();
192
 
}
193
 
 
194
 
template <CodeUnit_Base C>
195
 
void XML_Decl_Parser<C>::ReadXMLInfo(Entity_Info & e) {
196
 
        e.version = no_XML_version_value;
197
 
        e.has_encoding_decl = false;
198
 
        e.standalone = Standalone_no_value;
199
 
        buffer_rel_pos = e.BOM_units;
200
 
        // It is possible that there is no XML declaration.
201
 
        if (!at_XmlDecl_start<C>(cur())) {
202
 
                e.content_start = AbsPos();
203
 
                return;
204
 
        }
205
 
        // Otherwise, the XML declaration exists and must have
206
 
        // at least version information.
207
 
        Advance(6);
208
 
        Scan_WS();
209
 
        if (!at_version<C>(cur())) DeclError();
210
 
        ParseVersion(e);
211
 
        if (at_PI_End<C>(cur())) {
212
 
                e.content_start = AbsPos()+2;
213
 
                return;
214
 
        }
215
 
        if (!at_WhiteSpace_10<C>(cur())) DeclError();
216
 
        Scan_WS();
217
 
        if (at_encoding<C>(cur())) {
218
 
                ParseEncoding(e);
219
 
                if (at_PI_End<C>(cur())) {
220
 
                        e.content_start = AbsPos()+2;
221
 
                        return;
222
 
                }
223
 
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
224
 
                Scan_WS();
225
 
        }
226
 
        if (at_standalone<C>(cur())) {
227
 
                ParseStandalone(e);
228
 
                Scan_WS();
229
 
        }
230
 
        if (!at_PI_End<C>(cur())) DeclError();
231
 
        e.content_start = AbsPos()+2;
232
 
}
233
 
 
234
 
// Similar to reading the XML_declaration of the document entity,
235
 
// ReadTextDeclaration reads the text declaration of an external
236
 
// parsed entity.
237
 
template <CodeUnit_Base C>
238
 
void XML_Decl_Parser<C>::ReadTextDeclaration(Entity_Info & e) {
239
 
        e.version = no_XML_version_value;
240
 
        e.has_encoding_decl = false;
241
 
        e.standalone = Standalone_no_value;
242
 
        buffer_rel_pos = e.BOM_units;
243
 
        // It is possible that there is no text declaration.
244
 
        if (!at_XmlDecl_start<C>(cur())) {
245
 
                e.content_start = AbsPos();
246
 
                return;
247
 
        }
248
 
        // Otherwise, the text declaration exists and may have
249
 
        // version information.
250
 
        Advance(6);
251
 
        Scan_WS();
252
 
        if (at_version<C>(cur())) {
253
 
                ParseVersion(e);
254
 
                // Must have whitespace character before encoding declaration.
255
 
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
256
 
                Scan_WS();
257
 
        }
258
 
        if (!at_encoding<C>(cur())) DeclError();
259
 
        ParseEncoding(e);
260
 
        Scan_WS();
261
 
        if (!at_PI_End<C>(cur())) DeclError();
262
 
        e.content_start = AbsPos()+2;
263
 
}
264
 
 
265
 
template <CodeUnit_Base C>
266
 
void XML_Decl_Parser<C>::ReadXMLorTextDecl(Entity_Info & e) {
267
 
        e.version = no_XML_version_value;
268
 
        e.has_encoding_decl = false;
269
 
        e.standalone = Standalone_no_value;
270
 
        buffer_rel_pos = e.BOM_units;
271
 
        // It is possible that there is no XML or text declaration.
272
 
        if (!at_XmlDecl_start<C>(cur())) {
273
 
                e.content_start = AbsPos();
274
 
                return;
275
 
        }
276
 
        // Otherwise, the XML or text declaration exists and may have
277
 
        // version information.
278
 
        Advance(6);
279
 
        Scan_WS();
280
 
        if (at_version<C>(cur())) {
281
 
                ParseVersion(e);
282
 
                if (at_PI_End<C>(cur())) {
283
 
                        e.content_start = AbsPos()+2;
284
 
                        return;
285
 
                }
286
 
                if (!at_WhiteSpace_10<C>(cur())) DeclError();
287
 
                Scan_WS();
288
 
                if (at_encoding<C>(cur())) {
289
 
                        ParseEncoding(e);
290
 
                        if (at_PI_End<C>(cur())) {
291
 
                                e.content_start = AbsPos()+2;
292
 
                                return;
293
 
                        }
294
 
                        if (!at_WhiteSpace_10<C>(cur())) DeclError();
295
 
                        Scan_WS();
296
 
                }
297
 
                if (at_standalone<C>(cur())) {
298
 
                        ParseStandalone(e);
299
 
                        Scan_WS();
300
 
                }
301
 
        }
302
 
        else {  // Without version, we can only have a text declaration,
303
 
                // in which case an encoding spec is required.
304
 
                if (!at_encoding<C>(cur())) DeclError();
305
 
                ParseEncoding();
306
 
                Scan_WS();
307
 
                // No standalone spec is allowed in a text declaration.
308
 
        }
309
 
        if (!at_PI_End<C>(cur())) DeclError();  
310
 
        e.content_start = AbsPos()+2;
311
 
}