Commit fa42c37f authored by Kenichi Handa's avatar Kenichi Handa
Browse files

Add comments on coding-category-utf-8,

coding-category-utf-16-be, and coding-category-utf-16-le.
(coding_category_name): Include "coding-category-utf-8",
"coding-category-utf-16-be", and "coding-category-utf-16-le".
(UTF_8_1_OCTET_P) (UTF_8_EXTRA_OCTET_P) (UTF_8_2_OCTET_LEADING_P)
(UTF_8_3_OCTET_LEADING_P) (UTF_8_4_OCTET_LEADING_P)
(UTF_8_5_OCTET_LEADING_P) (UTF_8_6_OCTET_LEADING_P): New macros.
(detect_coding_utf_8): New function.
(UTF_16_INVALID_P) (UTF_16_HIGH_SURROGATE_P)
(UTF_16_LOW_SURROGATE_P): New macros.
(detect_coding_utf_16): New function.
(detect_coding_mask): Fix bug of returning wrong mask bits in the
case that detect_coding_XXX returns a mask not set in
priorities[i].
(detect_eol_type_in_2_octet_form): New function.
(detect_eol): If coding->category_idx is for UTF-16, call
detect_eol_type_in_2_octet_form instead of detect_eol_type.
(detect_coding_system): Don't include `nil' coding-system in the
result.
(Fupdate_coding_systems_internal): Update all coding-categories.
parent 62537270
...@@ -362,6 +362,9 @@ char *coding_category_name[CODING_CATEGORY_IDX_MAX] = { ...@@ -362,6 +362,9 @@ char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
"coding-category-iso-8-else", "coding-category-iso-8-else",
"coding-category-ccl", "coding-category-ccl",
"coding-category-big5", "coding-category-big5",
"coding-category-utf-8",
"coding-category-utf-16-be",
"coding-category-utf-16-le",
"coding-category-raw-text", "coding-category-raw-text",
"coding-category-binary" "coding-category-binary"
}; };
...@@ -2348,6 +2351,89 @@ detect_coding_big5 (src, src_end) ...@@ -2348,6 +2351,89 @@ detect_coding_big5 (src, src_end)
return CODING_CATEGORY_MASK_BIG5; return CODING_CATEGORY_MASK_BIG5;
} }
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
Check if a text is encoded in UTF-8. If it is, return
CODING_CATEGORY_MASK_UTF_8, else return 0. */
/* Octet classifiers for UTF-8.  Each macro evaluates its argument C
   exactly once.  The bit pattern accepted for C in the range
   0x00..0xFF is shown on the right (`x' stands for any bit).  */
#define UTF_8_1_OCTET_P(c)          ((c) < 0x80)             /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)      (0x80 == ((c) & 0xC0))   /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c)  (0xC0 == ((c) & 0xE0))   /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c)  (0xE0 == ((c) & 0xF0))   /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c)  (0xF0 == ((c) & 0xF8))   /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c)  (0xF8 == ((c) & 0xFC))   /* 111110xx */
#define UTF_8_6_OCTET_LEADING_P(c)  (0xFC == ((c) & 0xFE))   /* 1111110x */
/* Scan the octets in [SRC, SRC_END) and decide whether they are
   structurally well-formed UTF-8.

   Every leading octet must be followed by the number of `10xxxxxx'
   octets its bit pattern announces (1 to 5).  A sequence that is cut
   off by SRC_END is accepted, not rejected.  An octet that is neither
   a 1-octet character nor a valid leading octet at the position where
   a character should start (a stray continuation octet, 0xFE, 0xFF)
   makes the whole text fail.

   Return CODING_CATEGORY_MASK_UTF_8 on success (this includes empty
   and pure 7-bit input), 0 otherwise.  */
int
detect_coding_utf_8 (src, src_end)
     unsigned char *src, *src_end;
{
  while (src < src_end)
    {
      unsigned char lead = *src++;
      unsigned char follow;
      int trailing;		/* Continuation octets still expected.  */

      if (UTF_8_1_OCTET_P (lead))
	continue;

      if (UTF_8_2_OCTET_LEADING_P (lead))
	trailing = 1;
      else if (UTF_8_3_OCTET_LEADING_P (lead))
	trailing = 2;
      else if (UTF_8_4_OCTET_LEADING_P (lead))
	trailing = 3;
      else if (UTF_8_5_OCTET_LEADING_P (lead))
	trailing = 4;
      else if (UTF_8_6_OCTET_LEADING_P (lead))
	trailing = 5;
      else
	/* Not a valid start of a character.  */
	return 0;

      /* TRAILING is at least 1 here, so this consumes one or more
	 octets, exactly as many as the leading octet asked for.  */
      for (; trailing > 0; trailing--)
	{
	  if (src >= src_end)
	    /* Text ends inside a sequence; give it the benefit of the
	       doubt.  */
	    return CODING_CATEGORY_MASK_UTF_8;

	  follow = *src++;
	  if (! UTF_8_EXTRA_OCTET_P (follow))
	    return 0;
	}
    }

  return CODING_CATEGORY_MASK_UTF_8;
}
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
Check if a text is encoded in UTF-16 by looking for a Unicode
signature in its first two octets. If the signature is in Big
Endian order (0xFE 0xFF), return CODING_CATEGORY_MASK_UTF_16_BE; if
it is in Little Endian order (0xFF 0xFE), return
CODING_CATEGORY_MASK_UTF_16_LE; else return 0. */
/* Predicates on a 16-bit UTF-16 code unit VAL.  Each macro except
   UTF_16_INVALID_P evaluates VAL once; UTF_16_INVALID_P may evaluate
   it twice, so do not pass an expression with side effects.  */

/* Non-zero if VAL is 0xFFFE or 0xFFFF (Unicode noncharacters).  */
#define UTF_16_INVALID_P(val)	\
  (((val) == 0xFFFE)		\
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The range is selected by the top six bits, hence the mask 0xFC00;
   masking with 0xD800 would also accept e.g. 0xF800 and 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.
   Same reasoning as above for the 0xFC00 mask.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
/* Look at the first two octets of [SRC, SRC_END) for a Unicode
   signature.  0xFF 0xFE yields CODING_CATEGORY_MASK_UTF_16_LE,
   0xFE 0xFF yields CODING_CATEGORY_MASK_UTF_16_BE.  Anything else,
   including text shorter than two octets, yields 0.  Nothing beyond
   the first two octets is examined.  */
int
detect_coding_utf_16 (src, src_end)
     unsigned char *src, *src_end;
{
  /* A signature occupies two octets.  */
  if (src_end - src < 2)
    return 0;

  /* Combine the two octets in the order they appear in the text.  */
  switch ((src[0] << 8) | src[1])
    {
    case 0xFFFE:
      return CODING_CATEGORY_MASK_UTF_16_LE;

    case 0xFEFF:
      return CODING_CATEGORY_MASK_UTF_16_BE;

    default:
      return 0;
    }
}
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
...@@ -3453,6 +3539,26 @@ setup_raw_text_coding_system (coding) ...@@ -3453,6 +3539,26 @@ setup_raw_text_coding_system (coding)
as BIG5. Assigned the coding-system (Lisp symbol) as BIG5. Assigned the coding-system (Lisp symbol)
`cn-big5' by default. `cn-big5' by default.
o coding-category-utf-8
The category for a coding system which has the same code range
as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
symbol) `utf-8' by default.
o coding-category-utf-16-be
The category for a coding system in which a text has a
Unicode signature (cf. Unicode Standard) in big-endian order
at the head. Assigned the coding-system (Lisp symbol)
`utf-16-be' by default.
o coding-category-utf-16-le
The category for a coding system in which a text has a
Unicode signature (cf. Unicode Standard) in little-endian
order at the head. Assigned the coding-system (Lisp
symbol) `utf-16-le' by default.
o coding-category-ccl o coding-category-ccl
The category for a coding system of which encoder/decoder is The category for a coding system of which encoder/decoder is
...@@ -3481,7 +3587,10 @@ int ascii_skip_code[256]; ...@@ -3481,7 +3587,10 @@ int ascii_skip_code[256];
/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
If it detects possible coding systems, return an integer in which If it detects possible coding systems, return an integer in which
appropriate flag bits are set. Flag bits are defined by macros appropriate flag bits are set. Flag bits are defined by macros
CODING_CATEGORY_MASK_XXX in `coding.h'. CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
it should point to the table `coding_priorities'. In that case, only
the flag bit for a coding system of the highest priority is set in
the returned value.
How many ASCII characters are at the head is returned as *SKIP. */ How many ASCII characters are at the head is returned as *SKIP. */
...@@ -3492,8 +3601,8 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3492,8 +3601,8 @@ detect_coding_mask (source, src_bytes, priorities, skip)
{ {
register unsigned char c; register unsigned char c;
unsigned char *src = source, *src_end = source + src_bytes; unsigned char *src = source, *src_end = source + src_bytes;
unsigned int mask; unsigned int mask, utf16_examined_p, iso2022_examined_p;
int i; int i, idx;
/* At first, skip all ASCII characters and control characters except /* At first, skip all ASCII characters and control characters except
for three ISO2022 specific control characters. */ for three ISO2022 specific control characters. */
...@@ -3528,7 +3637,14 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3528,7 +3637,14 @@ detect_coding_mask (source, src_bytes, priorities, skip)
goto label_loop_detect_coding; goto label_loop_detect_coding;
} }
if (priorities) if (priorities)
goto label_return_highest_only; {
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{
if (mask & priorities[i])
return priorities[i];
}
return CODING_CATEGORY_MASK_RAW_TEXT;
}
} }
else else
{ {
...@@ -3537,8 +3653,12 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3537,8 +3653,12 @@ detect_coding_mask (source, src_bytes, priorities, skip)
if (c < 0xA0) if (c < 0xA0)
{ {
/* C is the first byte of SJIS character code, /* C is the first byte of SJIS character code,
or a leading-code of Emacs' internal format (emacs-mule). */ or a leading-code of Emacs' internal format (emacs-mule),
try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE; or the first byte of UTF-16. */
try = (CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, if C is a special latin extra code, /* Or, if C is a special latin extra code,
or is an ISO2022 specific control code of C1 (SS2 or SS3), or is an ISO2022 specific control code of C1 (SS2 or SS3),
...@@ -3559,11 +3679,15 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3559,11 +3679,15 @@ detect_coding_mask (source, src_bytes, priorities, skip)
else else
/* C is a character of ISO2022 in graphic plane right, /* C is a character of ISO2022 in graphic plane right,
or a SJIS's 1-byte character code (i.e. JISX0201), or a SJIS's 1-byte character code (i.e. JISX0201),
or the first byte of BIG5's 2-byte code. */ or the first byte of BIG5's 2-byte code,
or the first byte of UTF-8/16. */
try = (CODING_CATEGORY_MASK_ISO_8_ELSE try = (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT | CODING_CATEGORY_MASK_ISO_8BIT
| CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_BIG5); | CODING_CATEGORY_MASK_BIG5
| CODING_CATEGORY_MASK_UTF_8
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, we may have to consider the possibility of CCL. */ /* Or, we may have to consider the possibility of CCL. */
if (coding_system_table[CODING_CATEGORY_IDX_CCL] if (coding_system_table[CODING_CATEGORY_IDX_CCL]
...@@ -3572,26 +3696,40 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3572,26 +3696,40 @@ detect_coding_mask (source, src_bytes, priorities, skip)
try |= CODING_CATEGORY_MASK_CCL; try |= CODING_CATEGORY_MASK_CCL;
mask = 0; mask = 0;
utf16_examined_p = iso2022_examined_p = 0;
if (priorities) if (priorities)
{ {
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{ {
if (priorities[i] & try & CODING_CATEGORY_MASK_ISO) if (!iso2022_examined_p
mask = detect_coding_iso2022 (src, src_end); && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
{
mask |= detect_coding_iso2022 (src, src_end);
iso2022_examined_p = 1;
}
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
mask = detect_coding_sjis (src, src_end); mask |= detect_coding_sjis (src, src_end);
else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
mask |= detect_coding_utf_8 (src, src_end);
else if (!utf16_examined_p
&& (priorities[i] & try &
CODING_CATEGORY_MASK_UTF_16_BE_LE))
{
mask |= detect_coding_utf_16 (src, src_end);
utf16_examined_p = 1;
}
else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
mask = detect_coding_big5 (src, src_end); mask |= detect_coding_big5 (src, src_end);
else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
mask = detect_coding_emacs_mule (src, src_end); mask |= detect_coding_emacs_mule (src, src_end);
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
mask = detect_coding_ccl (src, src_end); mask |= detect_coding_ccl (src, src_end);
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
mask = CODING_CATEGORY_MASK_RAW_TEXT; mask |= CODING_CATEGORY_MASK_RAW_TEXT;
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
mask = CODING_CATEGORY_MASK_BINARY; mask |= CODING_CATEGORY_MASK_BINARY;
if (mask) if (mask & priorities[i])
goto label_return_highest_only; return priorities[i];
} }
return CODING_CATEGORY_MASK_RAW_TEXT; return CODING_CATEGORY_MASK_RAW_TEXT;
} }
...@@ -3601,20 +3739,16 @@ detect_coding_mask (source, src_bytes, priorities, skip) ...@@ -3601,20 +3739,16 @@ detect_coding_mask (source, src_bytes, priorities, skip)
mask |= detect_coding_sjis (src, src_end); mask |= detect_coding_sjis (src, src_end);
if (try & CODING_CATEGORY_MASK_BIG5) if (try & CODING_CATEGORY_MASK_BIG5)
mask |= detect_coding_big5 (src, src_end); mask |= detect_coding_big5 (src, src_end);
if (try & CODING_CATEGORY_MASK_UTF_8)
mask |= detect_coding_utf_8 (src, src_end);
if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
mask |= detect_coding_utf_16 (src, src_end);
if (try & CODING_CATEGORY_MASK_EMACS_MULE) if (try & CODING_CATEGORY_MASK_EMACS_MULE)
mask |= detect_coding_emacs_mule (src, src_end); mask |= detect_coding_emacs_mule (src, src_end);
if (try & CODING_CATEGORY_MASK_CCL) if (try & CODING_CATEGORY_MASK_CCL)
mask |= detect_coding_ccl (src, src_end); mask |= detect_coding_ccl (src, src_end);
} }
return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
label_return_highest_only:
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{
if (mask & priorities[i])
return priorities[i];
}
return CODING_CATEGORY_MASK_RAW_TEXT;
} }
/* Detect how a text of length SRC_BYTES pointed by SRC is encoded. /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
...@@ -3710,6 +3844,76 @@ detect_eol_type (source, src_bytes, skip) ...@@ -3710,6 +3844,76 @@ detect_eol_type (source, src_bytes, skip)
return eol_type; return eol_type;
} }
/* Like detect_eol_type, but detect EOL type in 2-octet
   big-endian/little-endian format for coding systems utf-16-be and
   utf-16-le.

   SOURCE points to SRC_BYTES bytes of text, read as a sequence of
   2-octet code units.  If BIG_ENDIAN_P is non-zero, the first octet
   of each unit is the most significant one; otherwise the second
   octet is.  A trailing odd octet is never examined.

   Return CODING_EOL_LF, CODING_EOL_CRLF or CODING_EOL_CR if all the
   end-of-lines examined have that form, CODING_EOL_INCONSISTENT as
   soon as two different forms are seen, or CODING_EOL_UNDECIDED if
   no end-of-line is found.  Scanning stops once MAX_EOL_CHECK_COUNT
   end-of-lines have been counted.

   *SKIP is set to the byte offset of the first end-of-line found, or
   to SRC_BYTES if none is recorded.
   NOTE(review): 0 doubles as the "nothing recorded yet" marker, so an
   end-of-line at offset 0 is not remembered as the first one; confirm
   whether callers care.  */
static int
detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
     unsigned char *source;
     int src_bytes, *skip, big_endian_p;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned int c1, c2;
  int total = 0;		/* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;
  int msb, lsb;			/* Index of high/low octet within a unit.  */

  if (big_endian_p)
    msb = 0, lsb = 1;
  else
    msb = 1, lsb = 0;

  *skip = 0;
  /* `src + 1 < src_end' guarantees a complete 2-octet unit.  */
  while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c1 = (src[msb] << 8) | (src[lsb]);
      src += 2;
      if (c1 == '\n' || c1 == '\r')
	{
	  if (*skip == 0)
	    /* SRC was already advanced past this unit.  */
	    *skip = src - 2 - source;
	  total++;
	  if (c1 == '\n')
	    {
	      this_eol_type = CODING_EOL_LF;
	    }
	  else
	    {
	      if ((src + 1) >= src_end)
		{
		  /* CR is the last complete unit; nothing can follow.  */
		  this_eol_type = CODING_EOL_CR;
		}
	      else
		{
		  /* Peek at the next unit to tell CRLF from a lone CR;
		     consume it only when it is the LF of a CRLF.  */
		  c2 = (src[msb] << 8) | (src[lsb]);
		  if (c2 == '\n')
		    this_eol_type = CODING_EOL_CRLF, src += 2;
		  else
		    this_eol_type = CODING_EOL_CR;
		}
	    }

	  if (eol_type == CODING_EOL_UNDECIDED)
	    /* This is the first end-of-line.  */
	    eol_type = this_eol_type;
	  else if (eol_type != this_eol_type)
	    {
	      /* The found type is different from what found before.  */
	      eol_type = CODING_EOL_INCONSISTENT;
	      break;
	    }
	}
    }

  if (*skip == 0)
    *skip = src_end - source;
  return eol_type;
}
/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
is encoded. If it detects an appropriate format of end-of-line, it is encoded. If it detects an appropriate format of end-of-line, it
sets the information in *CODING. */ sets the information in *CODING. */
...@@ -3722,7 +3926,20 @@ detect_eol (coding, src, src_bytes) ...@@ -3722,7 +3926,20 @@ detect_eol (coding, src, src_bytes)
{ {
Lisp_Object val; Lisp_Object val;
int skip; int skip;
int eol_type = detect_eol_type (src, src_bytes, &skip); int eol_type;
switch (coding->category_idx)
{
case CODING_CATEGORY_IDX_UTF_16_BE:
eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
break;
case CODING_CATEGORY_IDX_UTF_16_LE:
eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
break;
default:
eol_type = detect_eol_type (src, src_bytes, &skip);
break;
}
if (coding->heading_ascii > skip) if (coding->heading_ascii > skip)
coding->heading_ascii = skip; coding->heading_ascii = skip;
...@@ -5216,13 +5433,17 @@ detect_coding_system (src, src_bytes, highest) ...@@ -5216,13 +5433,17 @@ detect_coding_system (src, src_bytes, highest)
/* At first, gather possible coding systems in VAL. */ /* At first, gather possible coding systems in VAL. */
val = Qnil; val = Qnil;
for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp)) for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
{ {
int idx Lisp_Object category_val, category_index;
= XFASTINT (Fget (XCAR (tmp), Qcoding_category_index));
if (coding_mask & (1 << idx)) category_index = Fget (XCAR (tmp), Qcoding_category_index);
category_val = Fsymbol_value (XCAR (tmp));
if (!NILP (category_val)
&& NATNUMP (category_index)
&& (coding_mask & (1 << XFASTINT (category_index))))
{ {
val = Fcons (Fsymbol_value (XCAR (tmp)), val); val = Fcons (category_val, val);
if (highest) if (highest)
break; break;
} }
...@@ -5231,7 +5452,7 @@ detect_coding_system (src, src_bytes, highest) ...@@ -5231,7 +5452,7 @@ detect_coding_system (src, src_bytes, highest)
val = Fnreverse (val); val = Fnreverse (val);
/* Then, replace the elements with subsidiary coding systems. */ /* Then, replace the elements with subsidiary coding systems. */
for (tmp = val; !NILP (tmp); tmp = XCDR (tmp)) for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
{ {
if (eol_type != CODING_EOL_UNDECIDED if (eol_type != CODING_EOL_UNDECIDED
&& eol_type != CODING_EOL_INCONSISTENT) && eol_type != CODING_EOL_INCONSISTENT)
...@@ -5712,17 +5933,13 @@ which is a list of all the arguments given to this function.") ...@@ -5712,17 +5933,13 @@ which is a list of all the arguments given to this function.")
DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
Supdate_coding_systems_internal, 0, 0, 0, Supdate_coding_systems_internal, 0, 0, 0,
"Update internal database for ISO2022 and CCL based coding systems.\n\ "Update internal database for ISO2022 and CCL based coding systems.\n\
When values of the following coding categories are changed, you must\n\ When values of any coding categories are changed, you must\n\
call this function:\n\ call this function")
coding-category-iso-7, coding-category-iso-7-tight,\n\
coding-category-iso-8-1, coding-category-iso-8-2,\n\
coding-category-iso-7-else, coding-category-iso-8-else,\n\
coding-category-ccl")
() ()
{ {
int i; int i;
for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++) for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
{ {
Lisp_Object val; Lisp_Object val;
...@@ -5767,7 +5984,7 @@ This function is internal use only.") ...@@ -5767,7 +5984,7 @@ This function is internal use only.")
} }
/* If coding-category-list is valid and contains all coding /* If coding-category-list is valid and contains all coding
categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
the following code saves Emacs from craching. */ the following code saves Emacs from crashing. */
while (i < CODING_CATEGORY_IDX_MAX) while (i < CODING_CATEGORY_IDX_MAX)
coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment