Commit e6d2f155 authored by Kenichi Handa
Browse files

Fix the setting of buffer-file-coding-system on, for instance, C-x RET c unix...

Fix the setting of buffer-file-coding-system on, for instance, C-x RET c unix RET _FILE_OF_DOS_EOL_TYPE_ RET.
parent 59c88671
2013-05-22 Kenichi Handa <handa@gnu.org>
The following changes are to fix the setting of
buffer-file-coding-system on, for instance, C-x RET c unix RET
_FILE_OF_DOS_EOL_TYPE_ RET.
* coding.h (struct coding_system): New member detected_utf8_chars.
* coding.c (detect_coding_utf_8): Count characters and check EOL
format. Include CATEGORY_MASK_UTF_8_AUTO in detect_info->found if
BOM is there.
(setup_coding_system): Do not initialize coding->head_ascii.
(check_ascii): Do not set coding->eol_seen but update it. Do not
call adjust_coding_eol_type here.
(detect_coding): Fix detection of BOM for utf-8 and utf-16. If
the eol-type of CODING is already specified, adjust the eol type
of the found coding-system.
(decode_coding_gap): Cancel previous change. Utilize the
character numbers counted by detect_coding_utf_8. Fix detection
of BOM for utf-8.
2013-05-08 Paul Eggert <eggert@cs.ucla.edu> 2013-05-08 Paul Eggert <eggert@cs.ucla.edu>
* xterm.h (GTK_PREREQ): New macro. * xterm.h (GTK_PREREQ): New macro.
......
...@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, ...@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
*buf++ = id; \ *buf++ = id; \
} while (0) } while (0)
/* Bitmasks for coding->eol_seen.  The scanning code ORs these
   together while it walks the source bytes, so more than one bit may
   be set when the source mixes end-of-line conventions.  */
#define EOL_SEEN_NONE 0 /* No end-of-line sequence has been seen.  */
#define EOL_SEEN_LF 1 /* A '\n' not consumed as part of CR LF was seen.  */
#define EOL_SEEN_CR 2 /* A '\r' not followed by '\n' was seen.  */
#define EOL_SEEN_CRLF 4 /* A '\r' immediately followed by '\n' was seen.  */
/*** 2. Emacs' internal format (emacs-utf-8) ***/ /*** 2. Emacs' internal format (emacs-utf-8) ***/
...@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes, ...@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
#define UTF_8_BOM_2 0xBB #define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF #define UTF_8_BOM_3 0xBF
/* Unlike the other detect_coding_XXX functions, this function also
counts the number of characters and checks the EOL format. */
static bool static bool
detect_coding_utf_8 (struct coding_system *coding, detect_coding_utf_8 (struct coding_system *coding,
struct coding_detection_info *detect_info) struct coding_detection_info *detect_info)
...@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
bool multibytep = coding->src_multibyte; bool multibytep = coding->src_multibyte;
ptrdiff_t consumed_chars = 0; ptrdiff_t consumed_chars = 0;
bool bom_found = 0; bool bom_found = 0;
bool found = 0; int nchars = coding->head_ascii;
int eol_seen = coding->eol_seen;
detect_info->checked |= CATEGORY_MASK_UTF_8; detect_info->checked |= CATEGORY_MASK_UTF_8;
/* A coding system of this category is always ASCII compatible. */ /* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii; src += nchars;
if (src == coding->source /* BOM should be at the head. */
&& src + 3 < src_end /* BOM is 3-byte long. */
&& src[0] == UTF_8_BOM_1
&& src[1] == UTF_8_BOM_2
&& src[2] == UTF_8_BOM_3)
{
bom_found = 1;
src += 3;
nchars++;
}
while (1) while (1)
{ {
...@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
src_base = src; src_base = src;
ONE_MORE_BYTE (c); ONE_MORE_BYTE (c);
if (c < 0 || UTF_8_1_OCTET_P (c)) if (c < 0 || UTF_8_1_OCTET_P (c))
continue; {
nchars++;
if (c == '\r')
{
if (src < src_end && *src == '\n')
{
eol_seen |= EOL_SEEN_CRLF;
src++;
nchars++;
}
else
eol_seen |= EOL_SEEN_CR;
}
else if (c == '\n')
eol_seen |= EOL_SEEN_LF;
continue;
}
ONE_MORE_BYTE (c1); ONE_MORE_BYTE (c1);
if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
break; break;
if (UTF_8_2_OCTET_LEADING_P (c)) if (UTF_8_2_OCTET_LEADING_P (c))
{ {
found = 1; nchars++;
continue; continue;
} }
ONE_MORE_BYTE (c2); ONE_MORE_BYTE (c2);
...@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
break; break;
if (UTF_8_3_OCTET_LEADING_P (c)) if (UTF_8_3_OCTET_LEADING_P (c))
{ {
found = 1; nchars++;
if (src_base == coding->source
&& c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
bom_found = 1;
continue; continue;
} }
ONE_MORE_BYTE (c3); ONE_MORE_BYTE (c3);
...@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
break; break;
if (UTF_8_4_OCTET_LEADING_P (c)) if (UTF_8_4_OCTET_LEADING_P (c))
{ {
found = 1; nchars++;
continue; continue;
} }
ONE_MORE_BYTE (c4); ONE_MORE_BYTE (c4);
...@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
break; break;
if (UTF_8_5_OCTET_LEADING_P (c)) if (UTF_8_5_OCTET_LEADING_P (c))
{ {
found = 1; nchars++;
continue; continue;
} }
break; break;
...@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding, ...@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
if (bom_found) if (bom_found)
{ {
/* The first character 0xFFFE doesn't necessarily mean a BOM. */ /* The first character 0xFFFE doesn't necessarily mean a BOM. */
detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
} }
else else
{ {
detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
if (found) if (nchars < src_end - coding->source)
detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; /* The found characters are less than source bytes, which
means that we found a valid non-ASCII characters. */
detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
} }
coding->detected_utf8_chars = nchars;
return 1; return 1;
} }
...@@ -5622,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding) ...@@ -5622,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
coding->mode = 0; coding->mode = 0;
coding->head_ascii = -1;
if (VECTORP (eol_type)) if (VECTORP (eol_type))
coding->common_flags = (CODING_REQUIRE_DECODING_MASK coding->common_flags = (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_DETECTION_MASK); | CODING_REQUIRE_DETECTION_MASK);
...@@ -6074,46 +6112,35 @@ complement_process_encoding_system (Lisp_Object coding_system) ...@@ -6074,46 +6112,35 @@ complement_process_encoding_system (Lisp_Object coding_system)
*/ */
#define EOL_SEEN_NONE 0
#define EOL_SEEN_LF 1
#define EOL_SEEN_CR 2
#define EOL_SEEN_CRLF 4
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
int eol_seen); int eol_seen);
/* Return the number of ASCII characters at the head of the source. /* Return the number of ASCII characters at the head of the source.
By side effects, set coding->head_ascii and coding->eol_seen. The By side effects, set coding->head_ascii and update
value of coding->eol_seen is "logical or" of EOL_SEEN_LF, coding->eol_seen. The value of coding->eol_seen is "logical or" of
EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
all the source bytes are ASCII. */ reliable only when all the source bytes are ASCII. */
static int static int
check_ascii (struct coding_system *coding) check_ascii (struct coding_system *coding)
{ {
const unsigned char *src, *end; const unsigned char *src, *end;
Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
int eol_seen; int eol_seen = coding->eol_seen;
eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
: EQ (eol_type, Qunix) ? EOL_SEEN_LF
: EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
: EOL_SEEN_CR);
coding_set_source (coding); coding_set_source (coding);
src = coding->source; src = coding->source;
end = src + coding->src_bytes; end = src + coding->src_bytes;
if (inhibit_eol_conversion if (inhibit_eol_conversion
|| eol_seen != EOL_SEEN_NONE) || SYMBOLP (eol_type))
{ {
/* We don't have to check EOL format. */ /* We don't have to check EOL format. */
while (src < end && !( *src & 0x80)) src++; while (src < end && !( *src & 0x80))
if (inhibit_eol_conversion)
{ {
eol_seen = EOL_SEEN_LF; if (*src++ == '\n')
adjust_coding_eol_type (coding, eol_seen); eol_seen |= EOL_SEEN_LF;
} }
} }
else else
...@@ -6171,7 +6198,7 @@ static int ...@@ -6171,7 +6198,7 @@ static int
check_utf_8 (struct coding_system *coding) check_utf_8 (struct coding_system *coding)
{ {
const unsigned char *src, *end; const unsigned char *src, *end;
int eol_seen = coding->eol_seen; int eol_seen;
int nchars = coding->head_ascii; int nchars = coding->head_ascii;
if (coding->head_ascii < 0) if (coding->head_ascii < 0)
...@@ -6181,7 +6208,7 @@ check_utf_8 (struct coding_system *coding) ...@@ -6181,7 +6208,7 @@ check_utf_8 (struct coding_system *coding)
src = coding->source + coding->head_ascii; src = coding->source + coding->head_ascii;
/* We look ahead one byte for CR LF. */ /* We look ahead one byte for CR LF. */
end = coding->source + coding->src_bytes - 1; end = coding->source + coding->src_bytes - 1;
eol_seen = coding->eol_seen;
while (src < end) while (src < end)
{ {
int c = *src; int c = *src;
...@@ -6402,6 +6429,8 @@ detect_coding (struct coding_system *coding) ...@@ -6402,6 +6429,8 @@ detect_coding (struct coding_system *coding)
{ {
const unsigned char *src, *src_end; const unsigned char *src, *src_end;
unsigned int saved_mode = coding->mode; unsigned int saved_mode = coding->mode;
Lisp_Object found = Qnil;
Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
coding->consumed = coding->consumed_char = 0; coding->consumed = coding->consumed_char = 0;
coding->produced = coding->produced_char = 0; coding->produced = coding->produced_char = 0;
...@@ -6409,6 +6438,7 @@ detect_coding (struct coding_system *coding) ...@@ -6409,6 +6438,7 @@ detect_coding (struct coding_system *coding)
src_end = coding->source + coding->src_bytes; src_end = coding->source + coding->src_bytes;
coding->eol_seen = EOL_SEEN_NONE;
/* If we have not yet decided the text encoding type, detect it /* If we have not yet decided the text encoding type, detect it
now. */ now. */
if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
...@@ -6418,7 +6448,6 @@ detect_coding (struct coding_system *coding) ...@@ -6418,7 +6448,6 @@ detect_coding (struct coding_system *coding)
bool null_byte_found = 0, eight_bit_found = 0; bool null_byte_found = 0, eight_bit_found = 0;
coding->head_ascii = 0; coding->head_ascii = 0;
coding->eol_seen = EOL_SEEN_NONE;
detect_info.checked = detect_info.found = detect_info.rejected = 0; detect_info.checked = detect_info.found = detect_info.rejected = 0;
for (src = coding->source; src < src_end; src++) for (src = coding->source; src < src_end; src++)
{ {
...@@ -6529,32 +6558,58 @@ detect_coding (struct coding_system *coding) ...@@ -6529,32 +6558,58 @@ detect_coding (struct coding_system *coding)
} }
else if ((*(this->detector)) (coding, &detect_info) else if ((*(this->detector)) (coding, &detect_info)
&& detect_info.found & (1 << category)) && detect_info.found & (1 << category))
{ break;
if (category == coding_category_utf_16_auto)
{
if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
category = coding_category_utf_16_le;
else
category = coding_category_utf_16_be;
}
break;
}
} }
} }
if (i < coding_category_raw_text) if (i < coding_category_raw_text)
setup_coding_system (CODING_ID_NAME (this->id), coding); {
if (category == coding_category_utf_8_auto)
{
Lisp_Object coding_systems;
coding_systems = AREF (CODING_ID_ATTRS (this->id),
coding_attr_utf_bom);
if (CONSP (coding_systems))
{
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
found = XCAR (coding_systems);
else
found = XCDR (coding_systems);
}
else
found = CODING_ID_NAME (this->id);
}
else if (category == coding_category_utf_16_auto)
{
Lisp_Object coding_systems;
coding_systems = AREF (CODING_ID_ATTRS (this->id),
coding_attr_utf_bom);
if (CONSP (coding_systems))
{
if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
found = XCAR (coding_systems);
else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
found = XCDR (coding_systems);
}
else
found = CODING_ID_NAME (this->id);
}
else
found = CODING_ID_NAME (this->id);
}
else if (null_byte_found) else if (null_byte_found)
setup_coding_system (Qno_conversion, coding); found = Qno_conversion;
else if ((detect_info.rejected & CATEGORY_MASK_ANY) else if ((detect_info.rejected & CATEGORY_MASK_ANY)
== CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
setup_coding_system (Qraw_text, coding); found = Qraw_text;
else if (detect_info.rejected) else if (detect_info.rejected)
for (i = 0; i < coding_category_raw_text; i++) for (i = 0; i < coding_category_raw_text; i++)
if (! (detect_info.rejected & (1 << coding_priorities[i]))) if (! (detect_info.rejected & (1 << coding_priorities[i])))
{ {
this = coding_categories + coding_priorities[i]; this = coding_categories + coding_priorities[i];
setup_coding_system (CODING_ID_NAME (this->id), coding); found = CODING_ID_NAME (this->id);
break; break;
} }
} }
...@@ -6570,12 +6625,8 @@ detect_coding (struct coding_system *coding) ...@@ -6570,12 +6625,8 @@ detect_coding (struct coding_system *coding)
detect_info.found = detect_info.rejected = 0; detect_info.found = detect_info.rejected = 0;
if (check_ascii (coding) == coding->src_bytes) if (check_ascii (coding) == coding->src_bytes)
{ {
int head_ascii = coding->head_ascii; if (CONSP (coding_systems))
found = XCDR (coding_systems);
if (coding->eol_seen != EOL_SEEN_NONE)
adjust_coding_eol_type (coding, coding->eol_seen);
setup_coding_system (XCDR (coding_systems), coding);
coding->head_ascii = head_ascii;
} }
else else
{ {
...@@ -6583,9 +6634,9 @@ detect_coding (struct coding_system *coding) ...@@ -6583,9 +6634,9 @@ detect_coding (struct coding_system *coding)
&& detect_coding_utf_8 (coding, &detect_info)) && detect_coding_utf_8 (coding, &detect_info))
{ {
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
setup_coding_system (XCAR (coding_systems), coding); found = XCAR (coding_systems);
else else
setup_coding_system (XCDR (coding_systems), coding); found = XCDR (coding_systems);
} }
} }
} }
...@@ -6599,16 +6650,28 @@ detect_coding (struct coding_system *coding) ...@@ -6599,16 +6650,28 @@ detect_coding (struct coding_system *coding)
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0; detect_info.found = detect_info.rejected = 0;
coding->head_ascii = 0; coding->head_ascii = 0;
coding->eol_seen = EOL_SEEN_NONE;
if (CONSP (coding_systems) if (CONSP (coding_systems)
&& detect_coding_utf_16 (coding, &detect_info)) && detect_coding_utf_16 (coding, &detect_info))
{ {
if (detect_info.found & CATEGORY_MASK_UTF_16_LE) if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
setup_coding_system (XCAR (coding_systems), coding); found = XCAR (coding_systems);
else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
setup_coding_system (XCDR (coding_systems), coding); found = XCDR (coding_systems);
} }
} }
if (! NILP (found))
{
int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
: EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
: EQ (eol_type, Qmac) ? EOL_SEEN_CR
: EOL_SEEN_LF);
setup_coding_system (found, coding);
if (specified_eol != EOL_SEEN_NONE)
adjust_coding_eol_type (coding, specified_eol);
}
coding->mode = saved_mode; coding->mode = saved_mode;
} }
...@@ -7729,6 +7792,9 @@ decode_coding_gap (struct coding_system *coding, ...@@ -7729,6 +7792,9 @@ decode_coding_gap (struct coding_system *coding,
coding->dst_pos_byte = PT_BYTE; coding->dst_pos_byte = PT_BYTE;
coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
coding->head_ascii = -1;
coding->detected_utf8_chars = -1;
coding->eol_seen = EOL_SEEN_NONE;
if (CODING_REQUIRE_DETECTION (coding)) if (CODING_REQUIRE_DETECTION (coding))
detect_coding (coding); detect_coding (coding);
attrs = CODING_ID_ATTRS (coding->id); attrs = CODING_ID_ATTRS (coding->id);
...@@ -7743,17 +7809,38 @@ decode_coding_gap (struct coding_system *coding, ...@@ -7743,17 +7809,38 @@ decode_coding_gap (struct coding_system *coding,
chars = check_ascii (coding); chars = check_ascii (coding);
if (chars != bytes) if (chars != bytes)
{ {
/* There exists a non-ASCII byte. */
if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)) if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
chars = check_utf_8 (coding); {
if (coding->detected_utf8_chars >= 0)
chars = coding->detected_utf8_chars;
else
chars = check_utf_8 (coding);
if (CODING_UTF_8_BOM (coding) != utf_without_bom
&& coding->head_ascii == 0
&& coding->source[0] == UTF_8_BOM_1
&& coding->source[1] == UTF_8_BOM_2
&& coding->source[2] == UTF_8_BOM_3)
{
chars--;
bytes -= 3;
coding->src_bytes -= 3;
}
}
else else
chars = -1; chars = -1;
} }
if (chars >= 0) if (chars >= 0)
{ {
if (coding->eol_seen != EOL_SEEN_NONE) Lisp_Object eol_type;
adjust_coding_eol_type (coding, coding->eol_seen);
if (coding->eol_seen == EOL_SEEN_CR) eol_type = CODING_ID_EOL_TYPE (coding->id);
if (VECTORP (eol_type))
{
if (coding->eol_seen != EOL_SEEN_NONE)
eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
}
if (EQ (eol_type, Qmac))
{ {
unsigned char *src_end = GAP_END_ADDR; unsigned char *src_end = GAP_END_ADDR;
unsigned char *src = src_end - coding->src_bytes; unsigned char *src = src_end - coding->src_bytes;
...@@ -7764,7 +7851,7 @@ decode_coding_gap (struct coding_system *coding, ...@@ -7764,7 +7851,7 @@ decode_coding_gap (struct coding_system *coding,
src[-1] = '\n'; src[-1] = '\n';
} }
} }
else if (coding->eol_seen == EOL_SEEN_CRLF) else if (EQ (eol_type, Qdos))
{ {
unsigned char *src = GAP_END_ADDR; unsigned char *src = GAP_END_ADDR;
unsigned char *src_beg = src - coding->src_bytes; unsigned char *src_beg = src - coding->src_bytes;
......
...@@ -444,6 +444,8 @@ struct coding_system ...@@ -444,6 +444,8 @@ struct coding_system
the eol format. */ the eol format. */
ptrdiff_t head_ascii; ptrdiff_t head_ascii;
ptrdiff_t detected_utf8_chars;
/* Used internally in coding.c. See the comment of detect_ascii. */ /* Used internally in coding.c. See the comment of detect_ascii. */
int eol_seen; int eol_seen;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment