Libparserutils
src
charset
encodings
utf8impl.h
Go to the documentation of this file.
1
/*
2
* This file is part of LibParserUtils.
3
* Licensed under the MIT License,
4
* http://www.opensource.org/licenses/mit-license.php
5
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6
*/
7
8
#ifndef parserutils_charset_encodings_utf8impl_h_
9
#define parserutils_charset_encodings_utf8impl_h_
10
15
#include <stdbool.h>
16
#include <stdlib.h>
17
#include <string.h>
18
20
extern
const
uint8_t
numContinuations
[256];
21
34
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
35
do { \
36
uint32_t c, min; \
37
uint8_t n; \
38
uint8_t i; \
39
\
40
error = PARSERUTILS_OK; \
41
\
42
if (s == NULL || ucs4 == NULL || clen == NULL) { \
43
error = PARSERUTILS_BADPARM; \
44
break; \
45
} \
46
\
47
if (len == 0) { \
48
error = PARSERUTILS_NEEDDATA; \
49
break; \
50
} \
51
\
52
c = s[0]; \
53
\
54
if (c < 0x80) { \
55
n = 1; \
56
min = 0; \
57
} else if ((c & 0xE0) == 0xC0) { \
58
c &= 0x1F; \
59
n = 2; \
60
min = 0x80; \
61
} else if ((c & 0xF0) == 0xE0) { \
62
c &= 0x0F; \
63
n = 3; \
64
min = 0x800; \
65
} else if ((c & 0xF8) == 0xF0) { \
66
c &= 0x07; \
67
n = 4; \
68
min = 0x10000; \
69
} else if ((c & 0xFC) == 0xF8) { \
70
c &= 0x03; \
71
n = 5; \
72
min = 0x200000; \
73
} else if ((c & 0xFE) == 0xFC) { \
74
c &= 0x01; \
75
n = 6; \
76
min = 0x4000000; \
77
} else { \
78
error = PARSERUTILS_INVALID; \
79
break; \
80
} \
81
\
82
if (len < n) { \
83
error = PARSERUTILS_NEEDDATA; \
84
break; \
85
} \
86
\
87
for (i = 1; i < n; i++) { \
88
uint32_t t = s[i]; \
89
\
90
if ((t & 0xC0) != 0x80) { \
91
error = PARSERUTILS_INVALID; \
92
break; \
93
} \
94
\
95
c <<= 6; \
96
c |= t & 0x3F; \
97
} \
98
\
99
if (error == PARSERUTILS_OK) { \
100
/* Detect overlong sequences, surrogates and fffe/ffff */
\
101
if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
102
c == 0xFFFE || c == 0xFFFF) { \
103
error = PARSERUTILS_INVALID; \
104
break; \
105
} \
106
\
107
*ucs4 = c; \
108
*clen = n; \
109
} \
110
} while(0)
111
123
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
124
do { \
125
uint8_t *buf; \
126
uint8_t l = 0; \
127
\
128
error = PARSERUTILS_OK; \
129
\
130
if (s == NULL || *s == NULL || len == NULL) { \
131
error = PARSERUTILS_BADPARM; \
132
break; \
133
} \
134
\
135
if (ucs4 < 0x80) { \
136
l = 1; \
137
} else if (ucs4 < 0x800) { \
138
l = 2; \
139
} else if (ucs4 < 0x10000) { \
140
l = 3; \
141
} else if (ucs4 < 0x200000) { \
142
l = 4; \
143
} else if (ucs4 < 0x4000000) { \
144
l = 5; \
145
} else if (ucs4 <= 0x7FFFFFFF) { \
146
l = 6; \
147
} else { \
148
error = PARSERUTILS_INVALID; \
149
break; \
150
} \
151
\
152
if (l > *len) { \
153
error = PARSERUTILS_NOMEM; \
154
break; \
155
} \
156
\
157
buf = *s; \
158
\
159
if (l == 1) { \
160
buf[0] = (uint8_t) ucs4; \
161
} else { \
162
uint8_t i; \
163
for (i = l; i > 1; i--) { \
164
buf[i - 1] = 0x80 | (ucs4 & 0x3F); \
165
ucs4 >>= 6; \
166
} \
167
buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \
168
} \
169
\
170
*s += l; \
171
*len -= l; \
172
} while(0)
173
182
#define UTF8_LENGTH(s, max, len, error) \
183
do { \
184
const uint8_t *end = s + max; \
185
int l = 0; \
186
\
187
error = PARSERUTILS_OK; \
188
\
189
if (s == NULL || len == NULL) { \
190
error = PARSERUTILS_BADPARM; \
191
break; \
192
} \
193
\
194
while (s < end) { \
195
uint32_t c = s[0]; \
196
\
197
if ((c & 0x80) == 0x00) \
198
s += 1; \
199
else if ((c & 0xE0) == 0xC0) \
200
s += 2; \
201
else if ((c & 0xF0) == 0xE0) \
202
s += 3; \
203
else if ((c & 0xF8) == 0xF0) \
204
s += 4; \
205
else if ((c & 0xFC) == 0xF8) \
206
s += 5; \
207
else if ((c & 0xFE) == 0xFC) \
208
s += 6; \
209
else { \
210
error = PARSERUTILS_INVALID; \
211
break; \
212
} \
213
\
214
l++; \
215
} \
216
\
217
if (error == PARSERUTILS_OK) \
218
*len = l; \
219
} while(0)
220
228
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
229
do { \
230
if (s == NULL || len == NULL) { \
231
error = PARSERUTILS_BADPARM; \
232
break; \
233
} \
234
\
235
*len = numContinuations[s[0]] + 1
/* Start byte */
; \
236
\
237
error = PARSERUTILS_OK; \
238
} while(0)
239
249
#define UTF8_PREV(s, off, prevoff, error) \
250
do { \
251
if (s == NULL || prevoff == NULL) { \
252
error = PARSERUTILS_BADPARM; \
253
break; \
254
} \
255
\
256
while (off != 0 && (s[--off] & 0xC0) == 0x80) \
257
/* do nothing */
; \
258
\
259
*prevoff = off; \
260
\
261
error = PARSERUTILS_OK; \
262
} while(0)
263
274
#define UTF8_NEXT(s, len, off, nextoff, error) \
275
do { \
276
if (s == NULL || off >= len || nextoff == NULL) { \
277
error = PARSERUTILS_BADPARM; \
278
break; \
279
} \
280
\
281
/* Skip current start byte (if present - may be mid-sequence) */
\
282
if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \
283
off++; \
284
\
285
while (off < len && (s[off] & 0xC0) == 0x80) \
286
off++; \
287
\
288
*nextoff = off; \
289
\
290
error = PARSERUTILS_OK; \
291
} while(0)
292
303
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
304
do { \
305
uint8_t c; \
306
\
307
error = PARSERUTILS_OK; \
308
\
309
if (s == NULL || off >= len || nextoff == NULL) { \
310
error = PARSERUTILS_BADPARM; \
311
break; \
312
} \
313
\
314
c = s[off]; \
315
\
316
/* If we're mid-sequence, simply advance to next byte */
\
317
if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
318
off++; \
319
} else { \
320
uint32_t nCont = numContinuations[c]; \
321
uint32_t nToSkip; \
322
\
323
if (off + nCont + 1 >= len) { \
324
error = PARSERUTILS_NEEDDATA; \
325
break; \
326
} \
327
\
328
/* Verify continuation bytes */
\
329
for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
330
if ((s[off + nToSkip] & 0xC0) != 0x80) \
331
break; \
332
} \
333
\
334
/* Skip over the valid bytes */
\
335
off += nToSkip; \
336
} \
337
\
338
*nextoff = off; \
339
} while(0)
340
341
#endif
numContinuations
const uint8_t numContinuations[256]
Number of continuation bytes for a given start byte.
Definition:
utf8.c:20
Generated by
1.9.5