12 #include <parserutils/charset/mibenum.h> 26 const uint8_t **
name, uint32_t *namelen,
27 const uint8_t **value, uint32_t *valuelen);
44 uint16_t *mibenum, uint32_t *source)
48 if (data == NULL || mibenum == NULL || source == NULL)
49 return PARSERUTILS_BADPARM;
68 return PARSERUTILS_OK;
79 goto default_encoding;
87 return PARSERUTILS_OK;
116 if (charset != parserutils_charset_mibenum_from_name(
117 "UTF-32",
SLEN(
"UTF-32")) &&
118 charset != parserutils_charset_mibenum_from_name(
119 "UTF-32LE",
SLEN(
"UTF-32LE")) &&
120 charset != parserutils_charset_mibenum_from_name(
121 "UTF-32BE",
SLEN(
"UTF-32BE"))) {
126 return PARSERUTILS_OK;
140 charset = parserutils_charset_mibenum_from_name(
"Windows-1252",
141 SLEN(
"Windows-1252"));
143 charset = parserutils_charset_mibenum_from_name(
"ISO-8859-1",
149 return PARSERUTILS_OK;
170 if (data[0] == 0xFE && data[1] == 0xFF) {
171 return parserutils_charset_mibenum_from_name(
"UTF-16BE",
173 }
else if (data[0] == 0xFF && data[1] == 0xFE) {
174 return parserutils_charset_mibenum_from_name(
"UTF-16LE",
176 }
else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
177 return parserutils_charset_mibenum_from_name(
"UTF-8",
185 (pos < end - SLEN(a) && \ 186 strncasecmp((const char *) pos, a, SLEN(a)) == 0) 189 while (pos < end - SLEN(a)) { \ 195 if (pos == end - SLEN(a)) \ 199 (a == 0x09 || a == 0x0a || a == 0x0c || \ 200 a == 0x0d || a == 0x20 || a == 0x2f) 211 const uint8_t *pos = data;
218 end = pos +
min(512,
len);
227 }
else if (
PEEK(
"<meta")) {
228 if (pos +
SLEN(
"<meta") >= end - 1)
233 pos +=
SLEN(
"<meta");
244 }
else if ((
PEEK(
"</") && (pos < end - 3 &&
245 (0x41 <= (*(pos + 2) & ~ 0x20) &&
246 (*(pos + 2) & ~ 0x20) <= 0x5A))) ||
247 (pos < end - 2 && *pos ==
'<' &&
248 (0x41 <= (*(pos + 1) & ~ 0x20) &&
249 (*(pos + 1) & ~ 0x20) <= 0x5A))) {
257 *pos ==
'>' || *pos ==
'<')
303 const uint8_t *value;
304 uint32_t namelen, valuelen;
305 uint16_t mibenum = 0;
307 if (pos == NULL || *pos == NULL || end == NULL)
312 &
name, &namelen, &value, &valuelen)) {
316 if (namelen ==
SLEN(
"charset") && valuelen > 0 &&
317 strncasecmp((
const char *)
name,
"charset",
318 SLEN(
"charset")) == 0) {
325 while (valuelen > 0 &&
ISSPACE(value[valuelen - 1]))
328 mibenum = parserutils_charset_mibenum_from_name(
329 (
const char *) value, valuelen);
331 }
else if (namelen ==
SLEN(
"content") && valuelen > 0 &&
332 strncasecmp((
const char *)
name,
"content",
333 SLEN(
"content")) == 0) {
339 if (mibenum == parserutils_charset_mibenum_from_name(
340 "UTF-16LE",
SLEN(
"UTF-16LE")) ||
342 parserutils_charset_mibenum_from_name(
343 "UTF-16BE",
SLEN(
"UTF-16BE")) ||
345 parserutils_charset_mibenum_from_name(
346 "UTF-16",
SLEN(
"UTF-16"))) {
347 mibenum = parserutils_charset_mibenum_from_name(
348 "UTF-8",
SLEN(
"UTF-8"));
372 const uint8_t *tentative = NULL;
373 uint32_t tentative_len = 0;
378 end = value + valuelen;
381 while (value < end) {
394 while (value < end &&
ISSPACE(*value)) {
402 if (value < end -
SLEN(
"charset") &&
403 strncasecmp((
const char *) value,
404 "charset",
SLEN(
"charset")) != 0)
407 value +=
SLEN(
"charset");
410 while (value < end &&
ISSPACE(*value)) {
424 while (value < end &&
ISSPACE(*value)) {
436 while (++value < end && *value !=
'"') {
445 }
else if (*value ==
'\'') {
446 while (++value < end && *value !=
'\'') {
456 while (value < end && !
ISSPACE(*value)) {
463 if (tentative != NULL) {
464 return parserutils_charset_mibenum_from_name(
465 (
const char *) tentative, tentative_len);
487 const uint8_t **
name, uint32_t *namelen,
488 const uint8_t **value, uint32_t *valuelen)
492 if (data == NULL || *data == NULL || end == NULL ||
name == NULL ||
493 namelen == NULL || value == NULL || valuelen == NULL)
499 while (pos < end && (
ISSPACE(*pos) || *pos ==
'/')) {
524 *value = (
const uint8_t *)
"";
540 if (*pos ==
'/' || *pos ==
'<' || *pos ==
'>') {
561 while (pos < end &&
ISSPACE(*pos)) {
582 while (pos < end &&
ISSPACE(*pos)) {
593 if (*pos ==
'\'' || *pos ==
'"') {
595 const uint8_t *quote = pos;
598 while (++pos < end) {
600 if (*pos == *quote) {
601 *value = (quote + 1);
621 if (*pos ==
'<' || *pos ==
'>') {
634 if (
ISSPACE(*pos) || *pos ==
'<' || *pos ==
'>') {
669 assert(*charset != 0);
672 if (*charset == parserutils_charset_mibenum_from_name(
673 "ISO-8859-1",
SLEN(
"ISO-8859-1"))) {
674 tmp = parserutils_charset_mibenum_from_name(
675 "Windows-1252",
SLEN(
"Windows-1252"));
676 assert(tmp != 0 &&
"Windows-1252 MUST be supported");
678 }
else if (*charset == parserutils_charset_mibenum_from_name(
679 "ISO-8859-9",
SLEN(
"ISO-8859-9"))) {
680 tmp = parserutils_charset_mibenum_from_name(
681 "Windows-1254",
SLEN(
"Windows-1254"));
683 }
else if (*charset == parserutils_charset_mibenum_from_name(
684 "ISO-8859-11",
SLEN(
"ISO-8859-11"))) {
685 tmp = parserutils_charset_mibenum_from_name(
686 "Windows-874",
SLEN(
"Windows-874"));
688 }
else if (*charset == parserutils_charset_mibenum_from_name(
689 "KS_C_5601-1987",
SLEN(
"KS_C_5601-1987")) ||
690 *charset == parserutils_charset_mibenum_from_name(
691 "EUC-KR",
SLEN(
"EUC-KR"))) {
692 tmp = parserutils_charset_mibenum_from_name(
693 "Windows-949",
SLEN(
"Windows-949"));
695 }
else if (*charset == parserutils_charset_mibenum_from_name(
696 "TIS-620",
SLEN(
"TIS-620"))) {
697 tmp = parserutils_charset_mibenum_from_name(
698 "Windows-874",
SLEN(
"Windows-874"));
700 }
else if (*charset == parserutils_charset_mibenum_from_name(
701 "x-x-big5",
SLEN(
"x-x-big5"))) {
702 tmp = parserutils_charset_mibenum_from_name(
703 "Big5",
SLEN(
"Big5"));
705 }
else if (*charset == parserutils_charset_mibenum_from_name(
706 "GB2312",
SLEN(
"GB2312")) ||
707 *charset == parserutils_charset_mibenum_from_name(
708 "GB_2312-80",
SLEN(
"GB_2312-80"))) {
709 tmp = parserutils_charset_mibenum_from_name(
Charset may be changed with further data.
static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark.
static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen)
Extract an attribute from the data stream.
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
Search for a meta charset within a buffer of data.
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end)
Parse attributes on a meta tag.
uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen)
Parse a content= attribute's value.
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Extract a charset from a chunk of data.
void hubbub_charset_fix_charset(uint16_t *charset)
Fix charsets, according to the override table in HTML5, section 8.2.2.2.