1
/* xmldecl.c - Parsing XML and Text Declarations.
2
Copyright (c) 2008, Robert D. Cameron.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
11
#include "xml_error.h"
12
#include "multiliteral.h"
15
Entity_Info::Entity_Info() {
18
Entity_Info::~Entity_Info() {
22
/* Signature-based character set family detection in accord with
23
Appendix F of the XML 1.0 and 1.1 specifications. */
25
/* These definitions use b2int16 to determine appropriate doublebyte
26
values based on endianness of the underlying architecture. */
27
// Double-byte signature values compared against the 16-bit units read in
// AnalyzeSignature.  Each constant is named after the two bytes handed to
// b2int16, in the order given.
// Two zero bytes.
static const int x0000 = b2int16<0x00, 0x00>::value;
28
// Bytes FE FF.
static const int xFEFF = b2int16<0xFE, 0xFF>::value;
29
// Bytes FF FE.
static const int xFFFE = b2int16<0xFF, 0xFE>::value;
30
// Zero byte followed by 0x3C (ASCII '<').
static const int x003C = b2int16<0x00, 0x3C>::value;
31
// 0x3C (ASCII '<') followed by a zero byte.
static const int x3C00 = b2int16<0x3C, 0x00>::value;
32
// Bytes 4C 6F; together with A7 94 below, AnalyzeSignature treats the
// pair as an EBCDIC-family signature.
static const int x4C6F = b2int16<0x4C, 0x6F>::value;
33
// Bytes A7 94.
static const int xA794 = b2int16<0xA7, 0x94>::value;
34
// NOTE(review): the UTF-8 byte order mark is EF BB BF, so 0xBE here looks
// like it may be a typo for 0xBB -- confirm against the case that uses
// this constant before changing it.
static const int xEFBE = b2int16<0xEF, 0xBE>::value;
36
// Classifies the character set family of an entity from its leading bytes
// and records the outcome through set_charset_family.
//
// signature: pointer to the first bytes of the entity.  The code reads two
//            16-bit units through it and, separately, the byte signature[2].
//
// The last argument of each set_charset_family call is presumably the
// number of byte-order-mark code units to skip (the Read* routines start
// scanning at e.BOM_units) -- TODO confirm against set_charset_family.
void Entity_Info::AnalyzeSignature(unsigned char * signature) {
37
// View the signature as 16-bit units so two bytes are compared at a time.
// NOTE(review): this cast assumes signature is suitably aligned for
// uint16_t access -- confirm at the call sites.
uint16_t * XML_dbl_byte = (uint16_t *) signature;
38
// Outer dispatch on the first 16-bit unit.
switch (XML_dbl_byte[0]) {
40
// Inner dispatch on the second 16-bit unit; every branch of this inner
// switch selects a four-byte (QuadByte) code unit.
switch (XML_dbl_byte[1]) {
41
case xFEFF: set_charset_family(ASCII, QuadByte, BigEndian, 1);break;
42
case xFFFE: set_charset_family(ASCII, QuadByte, Unusual_2143, 1);break;
43
case x3C00: set_charset_family(ASCII, QuadByte, Unusual_2143, 0);break;
44
default: set_charset_family(ASCII, QuadByte, BigEndian, 0);
48
// A zero second unit selects a four-byte code unit in 3412 order;
// anything else selects a two-byte big-endian code unit.
if (XML_dbl_byte[1] == x0000)
49
set_charset_family(ASCII, QuadByte, Unusual_3412, 1);
50
else set_charset_family(ASCII, DoubleByte, BigEndian, 1);
53
// A zero second unit selects a four-byte little-endian code unit;
// anything else selects a two-byte little-endian code unit.
if (XML_dbl_byte[1] == x0000)
54
set_charset_family(ASCII, QuadByte, LittleEndian, 1);
55
else set_charset_family(ASCII, DoubleByte, LittleEndian, 1);
58
// Same split as above for the 3412 / two-byte big-endian pair, but with a
// final argument of 0.
if (XML_dbl_byte[1] == x0000)
59
set_charset_family(ASCII, QuadByte, Unusual_3412, 0);
60
else set_charset_family(ASCII, DoubleByte, BigEndian, 0);
63
// Same split as above for the little-endian pair, but with a final
// argument of 0.
if (XML_dbl_byte[1] == x0000)
64
set_charset_family(ASCII, QuadByte, LittleEndian, 0);
65
else set_charset_family(ASCII, DoubleByte, LittleEndian, 0);
68
// A second unit of A7 94 selects the EBCDIC base; otherwise fall back to a
// single-byte ASCII-based family.
if (XML_dbl_byte[1] == xA794)
69
set_charset_family(EBCDIC, SingleByte, BigEndian, 0);
70
else set_charset_family(ASCII, SingleByte, BigEndian, 0);
73
// A third byte of 0xBF selects a single-byte family with a final argument
// of 3; otherwise the final argument is 0.
if (signature[2] == 0xBF)
74
set_charset_family(ASCII, SingleByte, BigEndian, 3);
75
else set_charset_family(ASCII, SingleByte, BigEndian, 0);
78
set_charset_family(ASCII, SingleByte, BigEndian, 0);
81
void Entity_Info::set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B){
89
// Constructs a declaration parser that reads from the Byteplex b.
template <CodeUnit_Base C>
90
XML_Decl_Parser<C>::XML_Decl_Parser(Byteplex * b){
93
// Keep the byteplex's x8data as an unsigned char pointer; cur() indexes it
// directly by buffer_rel_pos.
x8data = (unsigned char *) byteplex->x8data;
96
template <CodeUnit_Base C>
97
XML_Decl_Parser<C>::~XML_Decl_Parser(){
100
// Reports a declaration error, passing the current absolute position to
// DeclarationError.
template <CodeUnit_Base C>
101
inline void XML_Decl_Parser<C>::DeclError() {
102
DeclarationError(AbsPos());
105
// Returns the absolute position of the parser: the base position of the
// current buffer plus the position relative to that buffer.
template <CodeUnit_Base C>
106
inline int XML_Decl_Parser<C>::AbsPos() const {
107
return buffer_base_pos + buffer_rel_pos;
110
// Returns a pointer into x8data at the current buffer-relative position.
template <CodeUnit_Base C>
111
inline unsigned char * XML_Decl_Parser<C>::cur() const {
112
return &x8data[buffer_rel_pos];
115
// Moves the parser forward.
// n: amount to advance by (presumably counted in positions of x8data --
//    TODO confirm).
template <CodeUnit_Base C>
116
inline void XML_Decl_Parser<C>::Advance(int n) {
118
// Once the relative position reaches the end of the buffer, ask the
// byteplex to advance its input buffer by a full BYTEPLEX_SIZE.
if (buffer_rel_pos >= BYTEPLEX_SIZE) {
119
byteplex->AdvanceInputBuffer(BYTEPLEX_SIZE);
123
// Skips whitespace: advances one position at a time for as long as
// at_WhiteSpace_10<C> matches at the current position.
template <CodeUnit_Base C>
124
inline void XML_Decl_Parser<C>::Scan_WS() {
125
while (at_WhiteSpace_10<C>(cur())) Advance(1);
128
// Moves buffer_rel_pos forward to the next position at which AtQuote<C>
// matches.  If that position lies at or beyond BYTEPLEX_SIZE, the input
// buffer is advanced so that the scan start becomes the buffer start and
// the scan is repeated; a second overrun is reported through
// ImplementationLimitError.
//
// NOTE(review): neither scan loop tests the position against
// BYTEPLEX_SIZE while scanning; the bound is only checked after a quote
// has been found -- confirm that the buffer guarantees termination.
template <CodeUnit_Base C>
129
inline void XML_Decl_Parser<C>::ScanToQuote() {
130
// Remember where the scan began so the buffer can be re-based on it.
int quote_start_pos = buffer_rel_pos;
131
while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
132
if (buffer_rel_pos >= BYTEPLEX_SIZE) {
133
// Discard everything before the scan start, then shift the relative
// position down and the base position up by the same amount so that
// AbsPos() is unchanged.
byteplex->AdvanceInputBuffer(quote_start_pos);
134
buffer_rel_pos -= quote_start_pos;
135
buffer_base_pos += quote_start_pos;
136
// Retry the scan within the re-based buffer.
while (!AtQuote<C>(cur())) buffer_rel_pos+=1;
137
if (buffer_rel_pos >= BYTEPLEX_SIZE) {
138
ImplementationLimitError("Encoding name exceeds BYTEPLEX_SIZE");
143
// Parses the version part of a declaration and stores the recognised
// version in e.version.
// e: entity record that receives the version.
template <CodeUnit_Base C>
144
inline void XML_Decl_Parser<C>::ParseVersion(Entity_Info & e) {
148
// An '=' is required at this point; anything else is a declaration error.
if (!AtChar<C,'='>(cur())) DeclError();
151
// Map the recognised version literal onto the corresponding enum value.
if (at_1_0<C>(cur())) e.version = XML_1_0;
152
else if (at_1_1<C>(cur())) e.version = XML_1_1;
157
// Parses the encoding part of a declaration.  Marks e as having an
// encoding declaration and stores a NUL-terminated copy of the quoted
// encoding name in e.encoding.
// e: entity record that receives has_encoding_decl and encoding.
//
// NOTE(review): e.encoding is allocated with new[] here; where it is
// released is not shown in this function -- confirm ownership.
template <CodeUnit_Base C>
158
inline void XML_Decl_Parser<C>::ParseEncoding(Entity_Info & e) {
159
/* Skip "encoding" */
161
e.has_encoding_decl = true;
163
// An '=' is required at this point; anything else is a declaration error.
if (!AtChar<C,'='>(cur())) DeclError();
166
if (AtQuote<C>(cur())) {
167
// Remember which quote character opened the value so the closing quote
// can be required to match it.
unsigned char quoteCh = cur()[0];
169
// Absolute position of the first character of the encoding name.
int start_pos = AbsPos();
171
if (cur()[0] != quoteCh) DeclError();
172
// Length of the name: distance from its start to the closing quote.
int lgth = AbsPos() - start_pos;
173
e.encoding = new unsigned char[lgth + 1];
174
// start_pos is absolute; subtracting buffer_base_pos converts it to an
// index into the current buffer.
memcpy(e.encoding, &x8data[start_pos-buffer_base_pos], lgth);
175
e.encoding[lgth] = '\0';
181
// Parses the standalone part of a declaration and stores the result in
// e.standalone.
// e: entity record that receives the standalone value.
template <CodeUnit_Base C>
182
inline void XML_Decl_Parser<C>::ParseStandalone(Entity_Info & e) {
183
/* Skip "standalone" */
186
// An '=' is required at this point; anything else is a declaration error.
if (!AtChar<C,'='>(cur())) DeclError();
189
// The advance counts (5 and 4) presumably cover the literal together with
// its two surrounding quote characters -- confirm against at_yes/at_no.
if (at_yes<C>(cur())) {Advance(5); e.standalone = Standalone_yes;}
190
else if (at_no<C>(cur())) {Advance(4); e.standalone = Standalone_no;}
194
// Reads the XML declaration of the document entity, if there is one, and
// fills in e.  The version, encoding and standalone fields are first reset
// to their "no value" states, and e.content_start is set to the position
// at which content begins.
// e: entity record; e.BOM_units is read, the other fields are written.
template <CodeUnit_Base C>
195
void XML_Decl_Parser<C>::ReadXMLInfo(Entity_Info & e) {
196
e.version = no_XML_version_value;
197
e.has_encoding_decl = false;
198
e.standalone = Standalone_no_value;
199
// Begin scanning just past any byte order mark units.
buffer_rel_pos = e.BOM_units;
200
// It is possible that there is no XML declaration.
201
if (!at_XmlDecl_start<C>(cur())) {
202
// No declaration: content starts right here.
e.content_start = AbsPos();
205
// Otherwise, the XML declaration exists and must have
206
// at least version information.
209
if (!at_version<C>(cur())) DeclError();
211
if (at_PI_End<C>(cur())) {
212
// Content starts two positions on, presumably just past the two-character
// terminator matched by at_PI_End -- confirm.
e.content_start = AbsPos()+2;
215
// Whitespace is required before any further part of the declaration.
if (!at_WhiteSpace_10<C>(cur())) DeclError();
217
if (at_encoding<C>(cur())) {
219
if (at_PI_End<C>(cur())) {
220
e.content_start = AbsPos()+2;
223
if (!at_WhiteSpace_10<C>(cur())) DeclError();
226
if (at_standalone<C>(cur())) {
230
// Nothing but the declaration terminator is accepted at this point.
if (!at_PI_End<C>(cur())) DeclError();
231
e.content_start = AbsPos()+2;
234
// Similar to reading the XML_declaration of the document entity,
235
// ReadTextDeclaration reads the text declaration of an external
237
// Reads a text declaration, if there is one, and fills in e.  The version,
// encoding and standalone fields are first reset to their "no value"
// states, and e.content_start is set to the position at which content
// begins.  Version information is optional; an encoding declaration is
// required.
// e: entity record; e.BOM_units is read, the other fields are written.
template <CodeUnit_Base C>
238
void XML_Decl_Parser<C>::ReadTextDeclaration(Entity_Info & e) {
239
e.version = no_XML_version_value;
240
e.has_encoding_decl = false;
241
e.standalone = Standalone_no_value;
242
// Begin scanning just past any byte order mark units.
buffer_rel_pos = e.BOM_units;
243
// It is possible that there is no text declaration.
244
if (!at_XmlDecl_start<C>(cur())) {
245
// No declaration: content starts right here.
e.content_start = AbsPos();
248
// Otherwise, the text declaration exists and may have
249
// version information.
252
if (at_version<C>(cur())) {
254
// Must have whitespace character before encoding declaration.
255
if (!at_WhiteSpace_10<C>(cur())) DeclError();
258
// The encoding declaration is mandatory in a text declaration.
if (!at_encoding<C>(cur())) DeclError();
261
// Nothing but the declaration terminator is accepted at this point.
if (!at_PI_End<C>(cur())) DeclError();
262
// Content starts two positions on, presumably just past the two-character
// terminator matched by at_PI_End -- confirm.
e.content_start = AbsPos()+2;
265
// Reads either an XML declaration or a text declaration, if there is one,
// and fills in e.  The version, encoding and standalone fields are first
// reset to their "no value" states, and e.content_start is set to the
// position at which content begins.  When version information is present
// the remaining parts are optional; without it, an encoding declaration is
// required.
// e: entity record; e.BOM_units is read, the other fields are written.
template <CodeUnit_Base C>
266
void XML_Decl_Parser<C>::ReadXMLorTextDecl(Entity_Info & e) {
267
e.version = no_XML_version_value;
268
e.has_encoding_decl = false;
269
e.standalone = Standalone_no_value;
270
// Begin scanning just past any byte order mark units.
buffer_rel_pos = e.BOM_units;
271
// It is possible that there is no XML or text declaration.
272
if (!at_XmlDecl_start<C>(cur())) {
273
// No declaration: content starts right here.
e.content_start = AbsPos();
276
// Otherwise, the XML or text declaration exists and may have
277
// version information.
280
if (at_version<C>(cur())) {
282
if (at_PI_End<C>(cur())) {
283
// Content starts two positions on, presumably just past the two-character
// terminator matched by at_PI_End -- confirm.
e.content_start = AbsPos()+2;
286
// Whitespace is required before any further part of the declaration.
if (!at_WhiteSpace_10<C>(cur())) DeclError();
288
if (at_encoding<C>(cur())) {
290
if (at_PI_End<C>(cur())) {
291
e.content_start = AbsPos()+2;
294
if (!at_WhiteSpace_10<C>(cur())) DeclError();
297
if (at_standalone<C>(cur())) {
302
else { // Without version, we can only have a text declaration,
303
// in which case an encoding spec is required.
304
if (!at_encoding<C>(cur())) DeclError();
307
// No standalone spec is allowed in a text declaration.
309
if (!at_PI_End<C>(cur())) DeclError();
310
e.content_start = AbsPos()+2;