1
/* xmldecl.h - Parsing XML and text declarations.
2
Copyright (c) 2008, Robert D. Cameron.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
11
/* XML version declared by a document; no_XML_version_value covers the
   case where no version is declared. */
enum XML_version {
    XML_1_0,
    XML_1_1,
    no_XML_version_value
};
12
/* Documents may be encoded in accord with either XML 1.0 or XML 1.1,
13
or there may be no XML version declared ("no value" in the
14
XML infoset parlance). */
16
/* Character set family that the underlying code units are compatible with. */
enum CodeUnit_Base {
    ASCII,
    EBCDIC
};
18
/* Code units of the underlying character set may be either ASCII-compatible
20
ASCII-compatibility means that any code units satisfy the following properties.
21
(1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
22
is a complete character sequence (single code unit sequence) representing
24
(2) Any code units above the ASCII range are non-ASCII code units.
25
No code units or code unit sequences containing a non-ASCII code unit
26
may represent an ASCII character. (This property ensures that
27
non-ASCII code units may be ignored in making ASCII-based parsing decisions).
28
EBCDIC-compatible, for the purposes of XML, means that the following property
29
applies.
31
(*) Code units may form all or part of a code unit sequence representing
32
a character in the Unicode range 0 to 0x9F if and only if that code
33
unit has the same interpretation under the basic EBCDIC code page cp037.
36
/* Width of a single code unit; each enumerator's value is that width
   in bytes. */
enum CodeUnit_Size {
    SingleByte = 1,
    DoubleByte = 2,
    QuadByte   = 4
};
37
/* ASCII, EBCDIC, ISO-8859-X and UTF-8 have 8-bit code units (singlebytes);
38
The UTF-16 and UCS-2 families have 16-bit code units (doublebyte);
39
The UTF-32/UCS-4 family has 32-bit code units. */
41
/* Ordering of the bytes that make up a 16-bit or 32-bit code unit. */
enum CodeUnit_ByteOrder {
    BigEndian,
    LittleEndian,
    Unusual_3412,
    Unusual_2143
};
42
/* The byte order of 16-bit or 32-bit code units. The possibilities are:
43
BigEndian: UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte order mark,
44
UTF-16 without a byte order mark,
45
UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte order mark.
46
LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte order mark,
47
UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a LittleEndian byte order mark.
48
Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark FE FF 00 00
49
Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark 00 00 FF FE.
52
/* State of the optional standalone component; Standalone_no_value is
   the third alternative to an explicit yes or no. */
enum XML_standalone {
    Standalone_yes,
    Standalone_no,
    Standalone_no_value
};
53
/* Possible values depending on the optional standalone component of an
62
/* Information computed by analyzing the 4-byte initial signature
63
of an XML document. */
64
int BOM_units; /* no of initial code units for a Byte Order Mark */
66
CodeUnit_Base code_unit_base;
67
CodeUnit_Size code_unit_size;
68
CodeUnit_ByteOrder byte_order;
70
void AnalyzeSignature(unsigned char * signature);
72
/* Information computed from the XML or text declaration. */
74
bool has_encoding_decl;
75
unsigned char * encoding;
76
XML_standalone standalone;
77
int content_start; /* position after BOM and XML/text decl.*/
80
void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
86
template <CodeUnit_Base C>
87
class XML_Decl_Parser {
89
XML_Decl_Parser (Byteplex * b);
92
void ReadXMLInfo(Entity_Info & e);
93
void ReadTextDeclaration(Entity_Info & e);
94
// Generic version if type of external entity unknown.
95
void ReadXMLorTextDecl(Entity_Info & e);
101
unsigned char * x8data;
104
int buffer_limit_pos;
108
unsigned char * cur() const;
111
/* Bytespace parsing routines for internal use in ReadXMLInfo and
112
ReadTextDeclaration. */
116
void ParseVersion(Entity_Info & e);
117
void ParseEncoding(Entity_Info & e);
118
void ParseStandalone(Entity_Info & e);