1
/* bitlex.h - Lexical Item Stream Module.
2
Copyright (c) 2007, 2008, Robert D. Cameron.
3
Licensed to the public under the Open Software License 3.0.
4
Licensed to International Characters, Inc., under the Academic
15
/* Lexical items are particular characters, character classes
16
or character sequences significant for XML parsing. */
18
/* Compile-time switch; it is tested by the
   #ifdef DIGIT_AND_HEX_ITEMS that appears later in this header. */
#define DIGIT_AND_HEX_ITEMS
22
NonWS = minLexicalItem,
23
MarkupStart, CD_End_check, Hyphen, QMark,
25
AmpHashSlash, /* The [&#/] stream */
27
#ifdef DIGIT_AND_HEX_ITEMS
31
maxLexicalItem = NameFollow};
33
/* One more than the last enumerator, maxLexicalItem.  Used as the
   array dimension of LexicalStreamSet::item_stream. */
const int LexicalItemCount = maxLexicalItem + 1;
38
/* The principal role of the lexical analyzer is to prepare
39
a set of parallel data streams for the parsing engine:
40
(a) an XML byte stream and (b) a set of parallel lexical
42
The XML byte stream consists of one byte for each character
43
code unit in the input stream (typically the input bytes
44
themselves for most 8-bit character sets, or a pseudo-ASCII
45
byte for 16-bit or 32-bit sets such as UTF-16 or UTF-32).
46
The lexical item streams are bit streams that mark with a
47
1 bit the positions of occurrences of each of the lexical
53
/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
54
blocks, followed by a sentinel block to terminate bit scans. */
56
/* Number of sentinel blocks appended after the BUFFER_BLOCKS data blocks. */
const int SENTINEL_BLOCKS = 1;
57
/* Array type with room for BUFFER_BLOCKS data blocks plus the
   SENTINEL_BLOCKS trailing sentinel block(s). */
typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
59
/* Groups the lexical item bit streams.  The lexer constructors in this
   header each take a pointer to one of these. */
struct LexicalStreamSet {
60
/* One BitStreamBuffer per lexical item: LexicalItemCount entries. */
BitStreamBuffer item_stream[LexicalItemCount];
64
/* Abstract base for all lexers: declares the AnalyzeBuffer entry point
   and four pure virtual Do_* steps that derived classes must define. */
class Lexer_Interface {
66
/* Takes the entity information and the lexical stream set.
   NOTE(review): presumably stored in entity_Info and
   parsing_engine_data; the definition is not in this header. */
Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);
68
/* Non-virtual entry point taking the bit block basis plus base, start
   and buffer-limit positions.  NOTE(review): the units of the position
   arguments are not visible from this declaration; confirm against
   the implementation. */
void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos);
71
Entity_Info * entity_Info;
72
/* The four steps below are pure virtual; Lexer<C> and the concrete
   lexer classes further down declare the overrides. */
virtual void Do_XML_10_WS_Control() = 0;
73
virtual void Do_MarkupStreams() = 0;
74
virtual void Do_XML_11_WS_Control() = 0;
75
virtual void Do_CharsetValidation() = 0;
77
/* Working pointers.  NOTE(review): x8basis has the same name and type
   as the AnalyzeBuffer parameter; whether AnalyzeBuffer assigns it is
   not visible here. */
BitBlockBasis * x8basis;
78
BitBlock * validation_stream;
79
LexicalStreamSet * parsing_engine_data;
84
/* Lexer parameterized by a CodeUnit_Base value C (instantiated below
   with ASCII and EBCDIC).  It declares non-pure overrides for
   Do_XML_10_WS_Control and Do_MarkupStreams, and re-declares the other
   two steps as pure virtual so that subclasses must supply them. */
template <CodeUnit_Base C>
85
class Lexer : public Lexer_Interface {
87
/* Static factory returning a lexer through the Lexer_Interface pointer
   type.  NOTE(review): how the concrete lexer is chosen and who owns
   the returned pointer are not visible in this header. */
static Lexer_Interface * LexerFactory(Entity_Info * e,LexicalStreamSet *l);
90
Lexer(Entity_Info * e,LexicalStreamSet *l);
91
/* Declared here without "= 0": common to every lexer with base C. */
void Do_XML_10_WS_Control();
92
void Do_MarkupStreams();
93
/* Still pure virtual: left to the derived lexer classes. */
virtual void Do_XML_11_WS_Control() = 0;
94
virtual void Do_CharsetValidation() = 0;
97
/* Lexer<ASCII> subclass that declares the two steps
   Lexer<ASCII> leaves pure virtual. */
class UTF_8_Lexer : public Lexer<ASCII> {
99
UTF_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
100
void Do_XML_11_WS_Control();
101
void Do_CharsetValidation();
104
/* Lexer<ASCII> subclass that declares the two steps
   Lexer<ASCII> leaves pure virtual. */
class ASCII_7_Lexer : public Lexer<ASCII> {
106
ASCII_7_Lexer(Entity_Info * e,LexicalStreamSet *l);
107
void Do_XML_11_WS_Control();
108
void Do_CharsetValidation();
111
/* Lexer<ASCII> subclass that declares the two steps
   Lexer<ASCII> leaves pure virtual. */
class EASCII_8_Lexer : public Lexer<ASCII> {
113
EASCII_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
114
void Do_XML_11_WS_Control();
115
void Do_CharsetValidation();
118
/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
119
Whitespace and control processing is common to these families,
120
but character set validation differs for codepoints D800-DFFF,
121
used for surrogate pairs in UTF-16 and prohibited in UCS-2. */
122
/* Intermediate Lexer<ASCII> subclass: declares Do_XML_11_WS_Control
   but keeps Do_CharsetValidation pure virtual, so it remains abstract;
   UTF_16_Lexer and UCS_2_Lexer derive from it. */
class U16_Lexer : public Lexer<ASCII> {
124
U16_Lexer(Entity_Info * e,LexicalStreamSet *l);
125
void Do_XML_11_WS_Control();
126
virtual void Do_CharsetValidation() = 0;
129
/* U16_Lexer subclass that declares the Do_CharsetValidation step
   U16_Lexer leaves pure virtual. */
class UTF_16_Lexer : public U16_Lexer {
131
UTF_16_Lexer(Entity_Info * e,LexicalStreamSet *l);
132
void Do_CharsetValidation();
135
/* U16_Lexer subclass that declares the Do_CharsetValidation step
   U16_Lexer leaves pure virtual. */
class UCS_2_Lexer : public U16_Lexer {
137
UCS_2_Lexer(Entity_Info * e,LexicalStreamSet *l);
138
void Do_CharsetValidation();
141
/* Lexer<ASCII> subclass that declares the two steps
   Lexer<ASCII> leaves pure virtual. */
class UTF_32_Lexer : public Lexer<ASCII> {
143
UTF_32_Lexer(Entity_Info * e,LexicalStreamSet *l);
144
void Do_XML_11_WS_Control();
145
void Do_CharsetValidation();
148
/* The only class in this header derived from Lexer<EBCDIC>; declares
   the two steps that Lexer<EBCDIC> leaves pure virtual. */
class EBCDIC_Lexer: public Lexer<EBCDIC> {
150
EBCDIC_Lexer(Entity_Info * e,LexicalStreamSet *l);
151
void Do_XML_11_WS_Control();
152
void Do_CharsetValidation();