/xmlbench/trunk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/xmlbench/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/*  bitlex.h - Lexical Item Stream Module.
    Copyright (c) 2007, 2008, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

*/
#ifndef BITLEX_H
#define BITLEX_H

#include "xmldecl.h"
#include "byteplex.h"
#include "bitplex.h"

/* Lexical items are particular characters, character classes
   or character sequences significant for XML parsing.  */

/* Compile-time switch: when defined, the NonDigit and NonHex
   items are included in the lexical_item enumeration below. */
#define DIGIT_AND_HEX_ITEMS

/* Identifiers for the lexical items.  Enumerators are numbered
   consecutively from minLexicalItem (0) up to maxLexicalItem, so
   the set of values present depends on which of MARKUP_SORTING and
   DIGIT_AND_HEX_ITEMS are defined. */
enum lexical_item {
	minLexicalItem = 0,
	NonWS = minLexicalItem,	/* shares the value of minLexicalItem */
	MarkupStart,
	CD_End_check,
	Hyphen,
	QMark,
#ifdef MARKUP_SORTING
	AmpHashSlash,	/* The [&#/] stream */
#endif
#ifdef DIGIT_AND_HEX_ITEMS
	NonDigit,
	NonHex,
#endif
	Quote,
	NameFollow,
	maxLexicalItem = NameFollow	/* alias for the last real item */
};

/* Total number of lexical items; used to size LexicalStreamSet::item_stream. */
const int LexicalItemCount = maxLexicalItem + 1;




/* The principal role of the lexical analyzer is to prepare
   a set of parallel data streams for the parsing engine:
   (a) an XML byte stream and (b) a set of parallel lexical
   item streams.
   The XML byte stream consists of one byte for each character 
   code unit in the input stream (typically the input bytes
   themselves for most 8-bit character sets, or a pseudo-ASCII 
   byte for 16-bit or 32-bit sets such as UTF-16 or UTF-32).
   The lexical item streams are bit streams that mark with a
   1 bit the positions of occurrences of each of the lexical
   items. 

*/


/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
   blocks, followed by a sentinel block to terminate bit scans. */

/* Number of extra blocks appended after the BUFFER_BLOCKS data blocks
   of every bit stream buffer. */
const int SENTINEL_BLOCKS = 1;
/* Storage for one bit stream: BUFFER_BLOCKS data blocks plus the
   sentinel block(s), each of type BitBlock. */
typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];

/* The full set of lexical item streams: one BitStreamBuffer for each
   of the LexicalItemCount lexical items (presumably indexed by
   lexical_item value -- confirm against the users of item_stream). */
struct LexicalStreamSet {
	BitStreamBuffer item_stream[LexicalItemCount];
};


/* Abstract base class of all lexers.  It declares the public entry
   point AnalyzeBuffer and four pure virtual processing steps that
   the derived classes must supply.  Member functions are defined
   outside this header. */
class Lexer_Interface {
public:
	/* e: information about the entity being lexed; l: the stream set
	   this lexer is associated with.  NOTE(review): both are presumably
	   stored in entity_Info / parsing_engine_data without taking
	   ownership -- confirm against the constructor definition. */
	Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);

	/* Virtual because concrete lexers are handed out as
	   Lexer_Interface pointers (see Lexer<C>::LexerFactory); destroying
	   a derived lexer through such a pointer is undefined behaviour
	   unless the base destructor is virtual. */
	virtual ~Lexer_Interface();

	/* Public entry point for processing one buffer.
	   NOTE(review): presumably runs the Do_* steps below over the data
	   in x8basis, with base_pos / start_pos / buffer_limit_pos giving
	   positions within the input -- confirm against the definition. */
	void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos);

protected:
	Entity_Info * entity_Info;

	/* Processing steps to be implemented by derived classes. */
	virtual void Do_XML_10_WS_Control() = 0;
	virtual void Do_MarkupStreams() = 0;
	virtual void Do_XML_11_WS_Control() = 0;
	virtual void Do_CharsetValidation() = 0;

	/* Working state available to the processing steps.  Declaration
	   order is kept as in the original so object layout and
	   initialization order are unchanged. */
	int lexer_base_pos;
	BitBlockBasis * x8basis;
	BitBlock * validation_stream;
	LexicalStreamSet * parsing_engine_data;
	int buffer_units;
	int buffer_blocks;
};

/* Lexer<C>: intermediate lexer layer parameterized by the code unit
   base C (instantiated in this header with ASCII and EBCDIC).  It
   declares overrides for two of the four Lexer_Interface steps and
   leaves the remaining two pure virtual for the concrete lexers. */
template <CodeUnit_Base C>
class Lexer : public Lexer_Interface {
public:
	/* Returns a lexer as a Lexer_Interface pointer.
	   NOTE(review): which concrete class is created, and who owns the
	   result, is decided in the definition -- confirm there. */
	static Lexer_Interface * LexerFactory(Entity_Info * e, LexicalStreamSet * l);

protected:
	/* Protected: only derived lexers (and members of this class)
	   can construct a Lexer<C>. */
	Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Steps overridden at this level. */
	virtual void Do_XML_10_WS_Control();
	virtual void Do_MarkupStreams();

	/* Steps still left to the concrete lexer classes. */
	virtual void Do_XML_11_WS_Control() = 0;
	virtual void Do_CharsetValidation() = 0;
};

/* Concrete lexer built on Lexer<ASCII>; by its name, intended for
   UTF-8 input.  Supplies the two steps Lexer<ASCII> leaves pure. */
class UTF_8_Lexer : public Lexer<ASCII> {
public:
	UTF_8_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Overrides of the remaining Lexer_Interface steps. */
	virtual void Do_XML_11_WS_Control();
	virtual void Do_CharsetValidation();
};

/* Concrete lexer built on Lexer<ASCII>; by its name, intended for
   7-bit ASCII input.  Supplies the two steps Lexer<ASCII> leaves pure. */
class ASCII_7_Lexer : public Lexer<ASCII> {
public:
	ASCII_7_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Overrides of the remaining Lexer_Interface steps. */
	virtual void Do_XML_11_WS_Control();
	virtual void Do_CharsetValidation();
};

/* Concrete lexer built on Lexer<ASCII>; by its name, intended for
   8-bit extended-ASCII input.  Supplies the two steps Lexer<ASCII>
   leaves pure. */
class EASCII_8_Lexer : public Lexer<ASCII> {
public:
	EASCII_8_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Overrides of the remaining Lexer_Interface steps. */
	virtual void Do_XML_11_WS_Control();
	virtual void Do_CharsetValidation();
};

/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families. 
   Whitespace and control processing is common to these families,
   but character set validation differs for codepoints D800-DFFF,
   used for surrogate pairs in UTF-16 and prohibited in UCS-2. */
class U16_Lexer : public Lexer<ASCII> {
public:
	U16_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Shared by the classes derived from U16_Lexer. */
	virtual void Do_XML_11_WS_Control();

	/* Left pure: each derived class supplies its own validation. */
	virtual void Do_CharsetValidation() = 0;
};

/* Concrete U16_Lexer; by its name, the UTF-16 variant.  Only the
   character set validation step is supplied here -- the rest is
   inherited from U16_Lexer and Lexer<ASCII>. */
class UTF_16_Lexer : public U16_Lexer {
public:
	UTF_16_Lexer(Entity_Info * e, LexicalStreamSet * l);

	virtual void Do_CharsetValidation();
};

/* Concrete U16_Lexer; by its name, the UCS-2 variant.  Only the
   character set validation step is supplied here -- the rest is
   inherited from U16_Lexer and Lexer<ASCII>. */
class UCS_2_Lexer : public U16_Lexer {
public:
	UCS_2_Lexer(Entity_Info * e, LexicalStreamSet * l);

	virtual void Do_CharsetValidation();
};

/* Concrete lexer built on Lexer<ASCII>; by its name, intended for
   UTF-32 input.  Supplies the two steps Lexer<ASCII> leaves pure. */
class UTF_32_Lexer : public Lexer<ASCII> {
public:
	UTF_32_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Overrides of the remaining Lexer_Interface steps. */
	virtual void Do_XML_11_WS_Control();
	virtual void Do_CharsetValidation();
};

/* Concrete lexer built on Lexer<EBCDIC> -- the only class in this
   header that uses the EBCDIC code unit base.  Supplies the two
   steps Lexer<EBCDIC> leaves pure. */
class EBCDIC_Lexer : public Lexer<EBCDIC> {
public:
	EBCDIC_Lexer(Entity_Info * e, LexicalStreamSet * l);

	/* Overrides of the remaining Lexer_Interface steps. */
	virtual void Do_XML_11_WS_Control();
	virtual void Do_CharsetValidation();
};


#endif