Commit e19c3639 authored by Kenichi Handa

(encode_coding_utf_8): Initialize produced_chars to 0.

(decode_coding_utf_16): Fix converting high and low bytes to
code-point.
(encode_coding_utf_16): Substitute coding->default_char for
non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of
coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
parent ed9d8bda
...@@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */ ...@@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */
CODING SYSTEM CODING SYSTEM
Coding system is an encoding mechanism of one or more character Coding system is an object for an encoding mechanism that contains
sets. Here's a list of coding system types supported by Emacs. information about how to convert byte sequence to character
When we say "decode", it means converting a text encoded by some sequences and vice versa. When we say "decode", it means converting
coding system into Emacs' internal format (emacs-utf-8), and when we a byte sequence of a specific coding system into a character
say "encode", it means converting a text of emacs-utf-8 to some sequence that is represented by Emacs' internal coding system
other coding system. `emacs-utf-8', and when we say "encode", it means converting a
character sequence of emacs-utf-8 to a byte sequence of a specific
Emacs represents a coding system by a Lisp symbol. Each symbol is a coding system.
key to the hash table Vcharset_hash_table. This hash table
associates the symbol to the corresponding detailed specifications. In Emacs Lisp, a coding system is represented by a Lisp symbol. In
C level, a coding system is represented by a vector of attributes
Before using a coding system for decoding and encoding, we setup a stored in the hash table Vcharset_hash_table. The conversion from a
structure of type `struct coding_system'. This structure keeps coding system symbol to attributes vector is done by looking up
various information about a specific code conversion (e.g. the Vcharset_hash_table by the symbol.
location of source and destination data).
Coding systems are classified into the following types depending on
Coding systems are classified into the following types by how to the mechanism of encoding. Here's a brief description about type.
represent a character in a byte sequence. Here's a brief description
about type.
o Emacs' internal format (emacs-utf-8)
The extended UTF-8 which allows eight-bit raw bytes mixed with
character codes. Emacs holds characters in buffers and strings by
this format.
o UTF-8 o UTF-8
...@@ -137,6 +129,13 @@ END-OF-LINE FORMAT ...@@ -137,6 +129,13 @@ END-OF-LINE FORMAT
independent, any coding system described above can take any format independent, any coding system described above can take any format
of end-of-line (except for no-conversion). of end-of-line (except for no-conversion).
STRUCT CODING_SYSTEM
Before using a coding system for code conversion (i.e. decoding and
encoding), we set up a structure of type `struct coding_system'.
This structure keeps various information about a specific code
conversion (e.g. the location of source and destination data).
*/ */
/* COMMON MACROS */ /* COMMON MACROS */
...@@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] = ...@@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] =
/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
#define EMIT_TWO_BYTES(c1, c2) \ #define EMIT_TWO_BYTES(c1, c2) \
do { \ do { \
produced_chars += 2; \ produced_chars += 2; \
if (multibytep) \ if (multibytep) \
{ \ { \
CHAR_STRING_ADVANCE ((int) (c1), dst); \ int ch; \
CHAR_STRING_ADVANCE ((int) (c2), dst); \ \
} \ ch = (c1); \
else \ if (ch >= 0x80) \
{ \ ch = BYTE8_TO_CHAR (ch); \
*dst++ = (c1); \ CHAR_STRING_ADVANCE (ch, dst); \
*dst++ = (c2); \ ch = (c2); \
} \ if (ch >= 0x80) \
ch = BYTE8_TO_CHAR (ch); \
CHAR_STRING_ADVANCE (ch, dst); \
} \
else \
{ \
*dst++ = (c1); \
*dst++ = (c2); \
} \
} while (0) } while (0)
...@@ -889,10 +896,14 @@ coding_set_source (coding) ...@@ -889,10 +896,14 @@ coding_set_source (coding)
coding->source = GAP_END_ADDR + coding->src_pos_byte; coding->source = GAP_END_ADDR + coding->src_pos_byte;
else else
{ {
if (coding->src_pos < GPT struct buffer *buf = XBUFFER (coding->src_object);
&& coding->src_pos + coding->src_chars >= GPT) EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
move_gap_both (coding->src_pos, coding->src_pos_byte); EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
coding->source = BYTE_POS_ADDR (coding->src_pos_byte); unsigned char *beg_addr = BUF_BEG_ADDR (buf);
coding->source = beg_addr + coding->src_pos_byte - 1;
if (coding->src_pos_byte >= gpt_byte)
coding->source += BUF_GAP_SIZE (buf);
} }
} }
else if (STRINGP (coding->src_object)) else if (STRINGP (coding->src_object))
...@@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding) ...@@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding)
int *charbuf_end = charbuf + coding->charbuf_used; int *charbuf_end = charbuf + coding->charbuf_used;
unsigned char *dst = coding->destination + coding->produced; unsigned char *dst = coding->destination + coding->produced;
unsigned char *dst_end = coding->destination + coding->dst_bytes; unsigned char *dst_end = coding->destination + coding->dst_bytes;
int produced_chars; int produced_chars = 0;
int c; int c;
if (multibytep) if (multibytep)
...@@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding) ...@@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding)
src_base = src; src_base = src;
ONE_MORE_BYTE (c1); ONE_MORE_BYTE (c1);
ONE_MORE_BYTE (c2); ONE_MORE_BYTE (c2);
c = (c1 << 16) | c2; c = (c1 << 8) | c2;
if (bom == utf_16_with_bom) if (bom == utf_16_with_bom)
{ {
if (endian == utf_16_big_endian if (endian == utf_16_big_endian
...@@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding) ...@@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding)
ONE_MORE_BYTE (c1); ONE_MORE_BYTE (c1);
ONE_MORE_BYTE (c2); ONE_MORE_BYTE (c2);
c = (endian == utf_16_big_endian c = (endian == utf_16_big_endian
? ((c1 << 16) | c2) : ((c2 << 16) | c1)); ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
if (surrogate) if (surrogate)
{ {
if (! UTF_16_LOW_SURROGATE_P (c)) if (! UTF_16_LOW_SURROGATE_P (c))
...@@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding) ...@@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding)
{ {
ASSURE_DESTINATION (safe_room); ASSURE_DESTINATION (safe_room);
c = *charbuf++; c = *charbuf++;
if (c >= 0x110000) if (c >= MAX_UNICODE_CHAR)
c = 0xFFFF; c = coding->default_char;
if (c < 0x10000) if (c < 0x10000)
{ {
...@@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding) ...@@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding)
val = AREF (attrs, coding_attr_utf_16_endian); val = AREF (attrs, coding_attr_utf_16_endian);
CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
: utf_16_little_endian); : utf_16_little_endian);
CODING_UTF_16_SURROGATE (coding) = 0;
coding->detector = detect_coding_utf_16; coding->detector = detect_coding_utf_16;
coding->decoder = decode_coding_utf_16; coding->decoder = decode_coding_utf_16;
coding->encoder = encode_coding_utf_16; coding->encoder = encode_coding_utf_16;
...@@ -5458,11 +5470,6 @@ decode_coding (coding) ...@@ -5458,11 +5470,6 @@ decode_coding (coding)
coding->consumed = coding->src_bytes; coding->consumed = coding->src_bytes;
} }
if (BUFFERP (coding->dst_object))
{
record_insert (coding->dst_pos, coding->produced_char);
}
return coding->result; return coding->result;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment