/*  multiliteral.h - XML Multicharacter Recognizers.
    Copyright (c) 2007, 2008, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

    This file provides a library of routines for the efficient recognition
    of particular XML multicharacter sequences.  Sequences of length 2 are
    compared as 16 bit integers, sequences of length 3 or 4 are compared
    as 32 bit integers and other sequences of length up to 8 are compared as
    64 bit integers.  The integer value for each XML multicharacter sequence
    is determined as a compile-time constant for optimal efficiency.

    All functions are declared inline; there is no corresponding
    multiliteral.c file.
*/
#ifndef MULTILITERAL_H
#define MULTILITERAL_H

#include <stdint.h>
#include <string.h>

#include "charsets/ASCII_EBCDIC.h"
/* Byte-order dependent shift amounts.
   LOW_BYTE_SHIFT is applied to the first byte of a two-byte sequence and
   HIGH_BYTE_SHIFT to the second, so that the resulting integer has the same
   byte layout as the sequence has in memory on the host.

   The BSD-style BYTE_ORDER / BIG_ENDIAN / LITTLE_ENDIAN macros are used when
   BYTE_ORDER is defined; otherwise the compiler-predefined __BYTE_ORDER__
   macros are consulted.  Any other configuration is rejected explicitly
   instead of silently selecting a branch because undefined macros compare
   equal as 0 == 0. */
#if (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
    (!defined(BYTE_ORDER) && defined(__BYTE_ORDER__) && \
     defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
const int LOW_BYTE_SHIFT = 8;
const int HIGH_BYTE_SHIFT = 0;
#elif (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
      (!defined(BYTE_ORDER) && defined(__BYTE_ORDER__) && \
       defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
const int LOW_BYTE_SHIFT = 0;
const int HIGH_BYTE_SHIFT = 8;
#else
#error "multiliteral.h: cannot determine a big-endian or little-endian host byte order"
#endif
/* Helper metafunctions.  Given 2, 4 or 8 characters comprising a sequence,
   the c2int16, c4int32, and c8int64 functions determine the corresponding
   16, 32 or 64 bit integer value.  These are template metafunctions that
   must be instantiated with constant arguments to be applied at compile time.
   The functions may be instantiated for ASCII or EBCDIC based byte
   sequences.

   For example, c2int16<ASCII, '<', '/'>::value produces the compile
   time constant for the 16-bit value of an ASCII-based byte sequence
   of the XML end tag opening delimiter. */
template <unsigned char byte1, unsigned char byte2>
48
static uint16_t const value =
49
(((uint16_t) byte1) << LOW_BYTE_SHIFT) +
50
(((uint16_t) byte2) << HIGH_BYTE_SHIFT);
53
template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
55
static uint16_t const value = b2int16<Ord<C,c1>::value, Ord<C,c2>::value>::value;
58
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
59
unsigned char c3, unsigned char c4>
61
static uint32_t const value =
62
(((uint32_t) c2int16<C,c1,c2>::value) << (2 * LOW_BYTE_SHIFT)) +
63
(((uint32_t) c2int16<C,c3,c4>::value) << (2 * HIGH_BYTE_SHIFT));
66
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
67
unsigned char c3, unsigned char c4,
68
unsigned char c5, unsigned char c6,
69
unsigned char c7, unsigned char c8>
71
static uint64_t const value =
72
(((uint64_t) c4int32<C, c1, c2, c3, c4>::value) << (4 * LOW_BYTE_SHIFT)) +
73
(((uint64_t) c4int32<C, c5, c6, c7, c8>::value) << (4 * HIGH_BYTE_SHIFT));
77
/* Specialized helpers for 3, 5, 6, and 7 character combinations.  Each one
   pads the sequence with zero bytes and defers to the 4- or 8-character
   helper. */
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
82
static uint32_t const value = c4int32<C, c1, c2, c3, 0>::value;
85
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
86
unsigned char c3, unsigned char c4,
89
static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, 0, 0, 0>::value;
92
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
93
unsigned char c3, unsigned char c4,
94
unsigned char c5, unsigned char c6>
96
static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, 0, 0>::value;
99
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
100
unsigned char c3, unsigned char c4,
101
unsigned char c5, unsigned char c6,
104
static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, c7, 0>::value;
109
/* A second set of helper functions determines 16, 32, or 64 bit integer
   values from character arrays.
   Precondition: the character array is allocated with at least the
   number of required characters in each case. */
/* s2int16: returns the 16-bit integer whose bytes are s[0], s[1] in host
   memory order.
   Precondition: s points to at least 2 readable bytes.
   The bytes are copied with memcpy instead of being read through a cast
   pointer, so s does not have to be 2-byte aligned and the access does not
   break the aliasing rules. */
static inline uint16_t s2int16(unsigned char s[]) {
  uint16_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
/* s4int32: returns the 32-bit integer whose bytes are s[0..3] in host
   memory order.
   Precondition: s points to at least 4 readable bytes.
   memcpy is used so that unaligned buffers are handled without undefined
   behaviour. */
static inline uint32_t s4int32(unsigned char s[]) {
  uint32_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
/* s8int64: returns the 64-bit integer whose bytes are s[0..7] in host
   memory order.
   Precondition: s points to at least 8 readable bytes.
   memcpy is used so that unaligned buffers are handled without undefined
   behaviour. */
static inline uint64_t s8int64(unsigned char s[]) {
  uint64_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
/* s3int32: returns the 32-bit integer whose first three bytes in host
   memory order are s[0..2] and whose fourth byte is zero.
   Precondition: s points to at least 3 readable bytes.
   Exactly three bytes are copied into a zeroed word.  This yields the same
   value as loading four bytes and masking off the last one, on either byte
   order, but never reads s[3] and avoids left-shifting a signed constant
   into the sign bit. */
static inline uint32_t s3int32(unsigned char s[]) {
  uint32_t v = 0;
  memcpy(&v, s, 3);
  return v;
}
/* s5int64: returns the 64-bit integer whose first five bytes in host
   memory order are s[0..4] and whose remaining three bytes are zero.
   Precondition: s points to at least 5 readable bytes.
   Copying exactly five bytes into a zeroed word gives the same value as an
   8-byte load followed by masking, without reading s[5..7]. */
static inline uint64_t s5int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 5);
  return v;
}
/* s6int64: returns the 64-bit integer whose first six bytes in host memory
   order are s[0..5] and whose remaining two bytes are zero.
   Precondition: s points to at least 6 readable bytes.
   Copying exactly six bytes into a zeroed word gives the same value as an
   8-byte load followed by masking, without reading s[6..7]. */
static inline uint64_t s6int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 6);
  return v;
}
/* s7int64: returns the 64-bit integer whose first seven bytes in host
   memory order are s[0..6] and whose last byte is zero.
   Precondition: s points to at least 7 readable bytes.
   Copying exactly seven bytes into a zeroed word gives the same value as an
   8-byte load followed by masking, without reading s[7]. */
static inline uint64_t s7int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 7);
  return v;
}
template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
142
static inline bool caseless_comp(unsigned char s[]) {
143
const uint16_t lc = c2int16<C, UC2lc<c1>::value, UC2lc<c2>::value>::value;
144
const uint16_t UC = c2int16<C, lc2UC<c1>::value, lc2UC<c2>::value>::value;
145
const uint16_t case_mask = lc ^ UC;
146
const uint16_t canon = lc & ~case_mask;
147
return (s2int16(s) & ~case_mask) == canon;
150
template <CodeUnit_Base C, unsigned char c1, unsigned char c2, unsigned char c3>
151
static inline bool caseless_comp(unsigned char s[]) {
152
const uint32_t lc = c3int32<C, UC2lc<c1>::value, UC2lc<c2>::value, UC2lc<c3>::value>::value;
153
const uint32_t UC = c3int32<C, lc2UC<c1>::value, lc2UC<c2>::value, lc2UC<c3>::value>::value;
154
const uint32_t case_mask = lc ^ UC;
155
const uint32_t canon = lc & ~case_mask;
156
return (s3int32(s) & ~case_mask) == canon;
159
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
160
unsigned char c3, unsigned char c4>
161
static inline bool caseless_comp(unsigned char s[]) {
162
const uint32_t lc = c4int32<C, UC2lc<c1>::value, UC2lc<c2>::value,
163
UC2lc<c3>::value, UC2lc<c4>::value>::value;
164
const uint32_t UC = c4int32<C, lc2UC<c1>::value, lc2UC<c2>::value,
165
lc2UC<c3>::value, lc2UC<c4>::value>::value;
166
const uint32_t case_mask = lc ^ UC;
167
const uint32_t canon = lc & ~case_mask;
168
return (s4int32(s) & ~case_mask) == canon;
171
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
172
unsigned char c3, unsigned char c4,
174
static inline bool caseless_comp(unsigned char s[]) {
175
const uint64_t lc = c5int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
176
UC2lc<c3>::value, UC2lc<c4>::value,
177
UC2lc<c5>::value>::value;
178
const uint64_t UC = c5int64<C, lc2UC<c1>::value, lc2UC<c2>::value,
179
lc2UC<c3>::value, lc2UC<c4>::value,
180
lc2UC<c5>::value>::value;
181
const uint64_t case_mask = lc ^ UC;
182
const uint64_t canon = lc & ~case_mask;
183
return (s5int64(s) & ~case_mask) == canon;
186
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
187
unsigned char c3, unsigned char c4,
188
unsigned char c5, unsigned char c6>
189
static inline bool caseless_comp(unsigned char s[]) {
190
const uint64_t lc = c6int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
191
UC2lc<c3>::value, UC2lc<c4>::value,
192
UC2lc<c5>::value, UC2lc<c6>::value>::value;
193
const uint64_t UC = c6int64<C, lc2UC<c1>::value, lc2UC<c2>::value,
194
lc2UC<c3>::value, lc2UC<c4>::value,
195
lc2UC<c5>::value, lc2UC<c6>::value>::value;
196
const uint64_t case_mask = lc ^ UC;
197
const uint64_t canon = lc & ~case_mask;
198
return (s6int64(s) & ~case_mask) == canon;
201
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
202
unsigned char c3, unsigned char c4,
203
unsigned char c5, unsigned char c6,
205
static inline bool caseless_comp(unsigned char s[]) {
206
const uint64_t lc = c7int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
207
UC2lc<c3>::value, UC2lc<c4>::value,
208
UC2lc<c5>::value, UC2lc<c6>::value,
209
UC2lc<c7>::value>::value;
210
const uint64_t UC = c7int64<C, lc2UC<c1>::value, lc2UC<c2>::value,
211
lc2UC<c3>::value, lc2UC<c4>::value,
212
lc2UC<c5>::value, lc2UC<c6>::value,
213
lc2UC<c7>::value>::value;
214
const uint64_t case_mask = lc ^ UC;
215
const uint64_t canon = lc & ~case_mask;
216
return (s7int64(s) & ~case_mask) == canon;
219
template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
220
unsigned char c3, unsigned char c4,
221
unsigned char c5, unsigned char c6,
222
unsigned char c7, unsigned char c8>
223
static inline bool caseless_comp(unsigned char s[]) {
224
const uint64_t lc = c8int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
225
UC2lc<c3>::value, UC2lc<c4>::value,
226
UC2lc<c5>::value, UC2lc<c6>::value,
227
UC2lc<c7>::value, UC2lc<c8>::value>::value;
228
const uint64_t UC = c8int64<C, lc2UC<c1>::value, lc2UC<c2>::value,
229
lc2UC<c3>::value, lc2UC<c4>::value,
230
lc2UC<c5>::value, lc2UC<c6>::value,
231
lc2UC<c7>::value, lc2UC<c8>::value>::value;
232
const uint64_t case_mask = lc ^ UC;
233
const uint64_t canon = lc & ~case_mask;
234
return (s8int64(s) & ~case_mask) == canon;