1 /* Coding system handler (conversion, detection, etc).
2    Copyright (C) 2001-2021 Free Software Foundation, Inc.
3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4      2005, 2006, 2007, 2008, 2009, 2010, 2011
5      National Institute of Advanced Industrial Science and Technology (AIST)
6      Registration Number H14PRO021
7    Copyright (C) 2003
8      National Institute of Advanced Industrial Science and Technology (AIST)
9      Registration Number H13PRO009
10 
11 This file is part of GNU Emacs.
12 
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or (at
16 your option) any later version.
17 
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21 GNU General Public License for more details.
22 
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
25 
26 /*** TABLE OF CONTENTS ***
27 
28   0. General comments
29   1. Preamble
30   2. Emacs' internal format (emacs-utf-8) handlers
31   3. UTF-8 handlers
32   4. UTF-16 handlers
33   5. Charset-base coding systems handlers
34   6. emacs-mule (old Emacs' internal format) handlers
35   7. ISO2022 handlers
36   8. Shift-JIS and BIG5 handlers
37   9. CCL handlers
38   10. C library functions
39   11. Emacs Lisp library functions
40   12. Postamble
41 
42 */
43 
44 /*** 0. General comments ***
45 
46 
47 CODING SYSTEM
48 
49   A coding system is an object for an encoding mechanism that contains
50   information about how to convert byte sequences to character
51   sequences and vice versa.  When we say "decode", it means converting
52   a byte sequence of a specific coding system into a character
53   sequence that is represented by Emacs' internal coding system
54   `emacs-utf-8', and when we say "encode", it means converting a
55   character sequence of emacs-utf-8 to a byte sequence of a specific
56   coding system.
57 
  In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  the C level, a coding system is represented by a vector of attributes
  stored in the hash table Vcoding_system_hash_table.  The conversion
  from coding system symbol to attributes vector is done by looking up
  Vcoding_system_hash_table by the symbol.
63 
64   Coding systems are classified into the following types depending on
65   the encoding mechanism.  Here's a brief description of the types.
66 
67   o UTF-8
68 
69   o UTF-16
70 
71   o Charset-base coding system
72 
73   A coding system defined by one or more (coded) character sets.
74   Decoding and encoding are done by a code converter defined for each
75   character set.
76 
77   o Old Emacs internal format (emacs-mule)
78 
79   The coding system adopted by old versions of Emacs (20 and 21).
80 
81   o ISO2022-base coding system
82 
83   The most famous coding system for multiple character sets.  X's
84   Compound Text, various EUCs (Extended Unix Code), and coding systems
85   used in the Internet communication such as ISO-2022-JP are all
86   variants of ISO2022.
87 
88   o SJIS (or Shift-JIS or MS-Kanji-Code)
89 
90   A coding system to encode character sets: ASCII, JISX0201, and
91   JISX0208.  Widely used for PC's in Japan.  Details are described in
92   section 8.
93 
94   o BIG5
95 
96   A coding system to encode character sets: ASCII and Big5.  Widely
97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
98   described in section 8.  In this file, when we write "big5" (all
99   lowercase), we mean the coding system, and when we write "Big5"
100   (capitalized), we mean the character set.
101 
102   o CCL
103 
104   If a user wants to decode/encode text encoded in a coding system
105   not listed above, he can supply a decoder and an encoder for it in
106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
107   program while decoding/encoding.
108 
109   o Raw-text
110 
111   A coding system for text containing raw eight-bit data.  Emacs
112   treats each byte of source text as a character (except for
113   end-of-line conversion).
114 
115   o No-conversion
116 
117   Like raw text, but don't do end-of-line conversion.
118 
119 
120 END-OF-LINE FORMAT
121 
122   How text end-of-line is encoded depends on operating system.  For
123   instance, Unix's format is just one byte of LF (line-feed) code,
124   whereas DOS's format is two-byte sequence of `carriage-return' and
125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
126   `carriage-return'.
127 
128   Since text character encoding and end-of-line encoding are
129   independent, any coding system described above can take any format
130   of end-of-line (except for no-conversion).
131 
132 STRUCT CODING_SYSTEM
133 
134   Before using a coding system for code conversion (i.e. decoding and
135   encoding), we setup a structure of type `struct coding_system'.
136   This structure keeps various information about a specific code
137   conversion (e.g. the location of source and destination data).
138 
139 */
140 
141 /* COMMON MACROS */
142 
143 
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145 
146   These functions check if a byte sequence specified as a source in
147   CODING conforms to the format of XXX, and update the members of
148   DETECT_INFO.
149 
150   Return true if the byte sequence conforms to XXX.
151 
152   Below is the template of these functions.  */
153 
#if 0
/* Template of a detector: return true if the bytes of CODING's source
   conform to XXX, and record the outcome in DETECT_INFO.  */
static bool
detect_coding_XXX (struct coding_system *coding,
		   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* ONE_MORE_BYTE compares SRC_BASE with SRC when the source runs
     out, so it must be declared and kept up to date.  */
  const unsigned char *src_base;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  int c;
  ...;

  while (1)
    {
      src_base = src;
      /* Get one byte from the source.  If the source is exhausted, jump
	 to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
	break;
      /* Only a byte that strongly suggests XXX counts as positive
	 evidence for the category.  */
      if (__C_strongly_suggests_XXX__ (c))
	found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
187 
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189 
  These functions decode a byte sequence specified as a source by
  CODING.  The resulting multibyte text goes to a place pointed to by
  CODING->charbuf, the length of which should not exceed
  CODING->charbuf_size.
194 
195   These functions set the information of original and decoded texts in
196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197   They also set CODING->result to one of CODING_RESULT_XXX indicating
198   how the decoding is finished.
199 
200   Below is the template of these functions.  */
201 
#if 0
/* Template of a decoder: decode the not-yet-consumed part of CODING's
   source into CODING->charbuf.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  bool multibytep = coding->src_multibyte;
  /* Both of these are referenced by ONE_MORE_BYTE.  */
  ptrdiff_t consumed_chars = 0;
  int c;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
	/* No more room to produce a decoded character.  */
	break;
      ONE_MORE_BYTE (c);
      /* Decode it. */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
241 
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243 
244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
245   internal multibyte format by CODING.  The resulting byte sequence
246   goes to a place pointed to by DESTINATION, the length of which
247   should not exceed DST_BYTES.
248 
249   These functions set the information of original and encoded texts in
250   the members produced, produced_char, consumed, and consumed_char of
251   the structure *CODING.  They also set the member result to one of
252   CODING_RESULT_XXX indicating how the encoding finished.
253 
  DST_BYTES zero means that the source area and the destination area
  overlap, which means that we can produce encoded text until it
  reaches the head of the not-yet-encoded source text.
257 
258   Below is a template of these functions.  */
#if 0
/* Template of an encoder: encode the characters in CODING->charbuf
   into CODING->destination.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* CHARBUF is a plain `int *'; the used length lives in CODING.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one loop iteration can never overrun
     DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
282 
283 
284 /*** 1. Preamble ***/
285 
286 #include <config.h>
287 
288 #ifdef HAVE_WCHAR_H
289 #include <wchar.h>
290 #endif /* HAVE_WCHAR_H */
291 
292 #include "lisp.h"
293 #include "character.h"
294 #include "buffer.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "termhooks.h"
300 #include "pdumper.h"
301 
302 Lisp_Object Vcoding_system_hash_table;
303 
304 /* Coding-systems are handed between Emacs Lisp programs and C internal
305    routines by the following three variables.  */
306 /* Coding system to be used to encode text for terminal display when
307    terminal coding system is nil.  */
308 struct coding_system safe_terminal_coding;
309 
310 /* Two special coding systems.  */
311 static Lisp_Object Vsjis_coding_system;
312 static Lisp_Object Vbig5_coding_system;
313 
/* ISO2022 section */

/* The fixnum stored at index REG of the `coding_attr_iso_initial'
   attribute of CODING.  */
#define CODING_ISO_INITIAL(coding, reg)                                 \
  (XFIXNUM (AREF (AREF (CODING_ID_ATTRS ((coding)->id),                 \
                        coding_attr_iso_initial),                       \
                  reg)))

/* CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is greater
   than CODING->max_charset_id or the stored entry is 255.  */
#define CODING_ISO_REQUEST(coding, charset_id)                          \
  (((charset_id) <= (coding)->max_charset_id                            \
    && (coding)->safe_charsets[charset_id] != 255)                      \
   ? (coding)->safe_charsets[charset_id]                                \
   : -1)

/* Accessors for the members of CODING->spec.iso_2022.  */
#define CODING_ISO_FLAGS(coding) ((coding)->spec.iso_2022.flags)

#define CODING_ISO_DESIGNATION(coding, reg)                             \
  ((coding)->spec.iso_2022.current_designation[reg])

#define CODING_ISO_INVOCATION(coding, plane)                            \
  ((coding)->spec.iso_2022.current_invocation[plane])

#define CODING_ISO_SINGLE_SHIFTING(coding)                              \
  ((coding)->spec.iso_2022.single_shifting)

#define CODING_ISO_BOL(coding) ((coding)->spec.iso_2022.bol)

/* The designation of the register currently invoked to PLANE, or -1
   when the invocation recorded for PLANE is negative.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)                       \
  (CODING_ISO_INVOCATION (coding, plane) >= 0                           \
   ? CODING_ISO_DESIGNATION (coding,                                    \
                             CODING_ISO_INVOCATION (coding, plane))     \
   : -1)

#define CODING_ISO_CMP_STATUS(coding)                                   \
  (&(coding)->spec.iso_2022.cmp_status)

#define CODING_ISO_EXTSEGMENT_LEN(coding)                               \
  ((coding)->spec.iso_2022.ctext_extended_segment_len)

#define CODING_ISO_EMBEDDED_UTF_8(coding)                               \
  ((coding)->spec.iso_2022.embedded_utf_8)
349 
/* Control characters of ISO2022.  */
#define ISO_CODE_SO     0x0E    /* shift-out */
#define ISO_CODE_SI     0x0F    /* shift-in */
#define ISO_CODE_SS2_7  0x19    /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B    /* escape */
#define ISO_CODE_SS2    0x8E    /* single-shift-2 */
#define ISO_CODE_SS3    0x8F    /* single-shift-3 */
#define ISO_CODE_CSI    0x9B    /* control-sequence-introducer */

/* Every (1-byte) code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    /* Control codes in the range 0x00..0x1F and 0x7F, except for the
       4 codes listed next.  */
    ISO_control_0 = 0,
    ISO_shift_out,                      /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,                       /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,               /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                         /* ISO_CODE_ESC (0x1B) */

    /* Control codes in the range 0x80..0x9F, except for the 3 codes
       listed next.  */
    ISO_control_1,
    ISO_single_shift_2,                 /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,                 /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer,    /* ISO_CODE_CSI (0x9B) */

    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
382 
/** Each CODING_ISO_FLAG_XXX macro defines one flag bit of the
    `iso-flags' attribute of an iso2022 coding system.  */

/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM               (1 << 0)

/* If set, reset graphic planes and registers at end-of-line to the
   initial state.  */
#define CODING_ISO_FLAG_RESET_AT_EOL            (1 << 1)

/* If set, reset graphic planes and registers before any control
   characters to the initial state.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL           (1 << 2)

/* If set, encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS              (1 << 3)

/* If set, use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT           (1 << 4)

/* If set, use single-shift function.  Overwrite
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT            (1 << 5)

/* If set, use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION             (1 << 6)

/* If set, produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION                (1 << 7)

/* If set, produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION               (1 << 8)

/* If set, assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL             (1 << 9)

/* If set, designation sequence should be placed at beginning of line
   on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL        (1 << 10)

/* If set, do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE                    (1 << 11)

/* If set, extra latin codes (128..159) are accepted as a valid code
   on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA             (1 << 12)

/* The flags below carry no description in this file.  */
#define CODING_ISO_FLAG_COMPOSITION             (1 << 13)

/* #define CODING_ISO_FLAG_EUC_TW_SHIFT	0x4000 */

#define CODING_ISO_FLAG_USE_ROMAN               (1 << 15)

#define CODING_ISO_FLAG_USE_OLDJIS              (1 << 16)

#define CODING_ISO_FLAG_LEVEL_4                 (1 << 17)

#define CODING_ISO_FLAG_FULL_SUPPORT            (1 << 20)

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
447 
/* UTF-8 section */

/* The `utf_8_bom' member of CODING->spec.  */
#define CODING_UTF_8_BOM(coding) ((coding)->spec.utf_8_bom)

/* UTF-16 section */

/* Accessors for the members of CODING->spec.utf_16.  */
#define CODING_UTF_16_BOM(coding) ((coding)->spec.utf_16.bom)
#define CODING_UTF_16_ENDIAN(coding) ((coding)->spec.utf_16.endian)
#define CODING_UTF_16_SURROGATE(coding) ((coding)->spec.utf_16.surrogate)


/* CCL section */

/* Accessors for the CCL-related slots of CODING's attribute vector.
   CODING_CCL_VALIDS yields the SDATA of the `coding_attr_ccl_valids'
   slot.  */
#define CODING_CCL_DECODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
#define CODING_CCL_ENCODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
#define CODING_CCL_VALIDS(coding)                                       \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
470 
/* Index for each coding category in `coding_categories' */

enum coding_category
  {
    coding_category_iso_7 = 0,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    coding_category_max
  };

/* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
   the single bit whose position is the category's enum value.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* Group masks.  */
#define CATEGORY_MASK_ISO_7BIT \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

#define CATEGORY_MASK_ISO_8BIT \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

#define CATEGORY_MASK_ISO_ELSE \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

/* ISO_7, ISO_7_TIGHT, ISO_7_ELSE and ISO_8_ELSE.  */
#define CATEGORY_MASK_ISO_ESCAPE \
  (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_ISO \
  (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_UTF_8             \
  (CATEGORY_MASK_UTF_8_AUTO             \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG)

#define CATEGORY_MASK_UTF_16            \
  (CATEGORY_MASK_UTF_16_AUTO            \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It is the union of every category that is a
   target of code detection.  */
#define CATEGORY_MASK_ANY               \
  (CATEGORY_MASK_ISO                    \
   | CATEGORY_MASK_UTF_8                \
   | CATEGORY_MASK_UTF_16               \
   | CATEGORY_MASK_CHARSET              \
   | CATEGORY_MASK_SJIS                 \
   | CATEGORY_MASK_BIG5                 \
   | CATEGORY_MASK_CCL                  \
   | CATEGORY_MASK_EMACS_MULE)
577 
578 /* Table of coding categories (Lisp symbols).  This variable is for
579    internal use only.  */
580 static Lisp_Object Vcoding_category_table;
581 
582 /* Table of coding-categories ordered by priority.  */
583 static enum coding_category coding_priorities[coding_category_max];
584 
585 /* Nth element is a coding context for the coding system bound to the
586    Nth coding category.  */
587 static struct coding_system coding_categories[coding_category_max];
588 
589 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
590 
591 static int
encode_inhibit_flag(Lisp_Object flag)592 encode_inhibit_flag (Lisp_Object flag)
593 {
594   return NILP (flag) ? -1 : EQ (flag, Qt);
595 }
596 
/* Decide whether a flag encoded as ENCODED_FLAG counts as set.
   A positive value means yes, a negative value means no, and 0 defers
   to the user variable VAR.  */

static bool
inhibit_flag (int encoded_flag, bool var)
{
  if (encoded_flag == 0)
    return var;
  return encoded_flag > 0;
}
605 
/* Set ATTRS to the attribute vector of CODING, then set CHARSET_LIST
   to the charset list stored in those attributes.  */
#define CODING_GET_INFO(coding, attrs, charset_list)                    \
  ((void) ((attrs) = CODING_ID_ATTRS ((coding)->id),                    \
           (charset_list) = CODING_ATTR_CHARSET_LIST (attrs)))
611 
612 /* True if CODING's destination can be grown.  */
613 
614 static bool
growable_destination(struct coding_system * coding)615 growable_destination (struct coding_system *coding)
616 {
617   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
618 }
619 
620 
/* Fetch the next byte of the source text at SRC (which ends at
   SRC_END) into C, advancing SRC and incrementing CONSUMED_CHARS.

   When the source is exhausted, jump to the label `no_more_source';
   before jumping, record CODING_RESULT_INSUFFICIENT_SRC if SRC has
   moved past SRC_BASE.

   If MULTIBYTEP and the byte has its high bit set, then:
     - a lead byte of 0xC0 or 0xC1 is combined with the following byte
       into a single value;
     - otherwise C is set to the negative of the character code read by
       string_char_advance, and CODING_RESULT_INVALID_SRC is recorded.

   The caller must declare and set these variables in advance:
	coding, src, src_base, src_end, multibytep, consumed_chars  */

#define ONE_MORE_BYTE(c)                                                \
  do {                                                                  \
    if (src == src_end)                                                 \
      {                                                                 \
        if (src > src_base)                                             \
          record_conversion_result (coding,                             \
                                    CODING_RESULT_INSUFFICIENT_SRC);    \
        goto no_more_source;                                            \
      }                                                                 \
    c = *src++;                                                         \
    if (multibytep && (c & 0x80) != 0)                                  \
      {                                                                 \
        if ((c & 0xFE) != 0xC0)                                         \
          {                                                             \
            src--;                                                      \
            c = - string_char_advance (&src);                           \
            record_conversion_result (coding,                           \
                                      CODING_RESULT_INVALID_SRC);       \
          }                                                             \
        else                                                            \
          c = ((c & 1) << 6) | *src++;                                  \
      }                                                                 \
    consumed_chars++;                                                   \
  } while (0)
653 
/* Fetch two bytes from the source text at SRC (which ends at SRC_END)
   into C1 and C2.  If there are not enough bytes in the source, jump
   to the label `no_more_source'.

   If MULTIBYTEP, leading multibyte characters are skipped while
   looking for C1 (a lead byte of 0xC0 or 0xC1 is instead combined with
   the following byte into a single value).  For C2, a 0xC0/0xC1
   sequence is combined the same way and any other multibyte lead byte
   makes C2 -1.

   The caller must declare and set these variables in advance:
	src, src_end, multibytep
   It is intended that this macro is used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    do {                                                        \
      if (src == src_end)                                       \
        goto no_more_source;                                    \
      c1 = *src++;                                              \
      if (multibytep && (c1 & 0x80) != 0)                       \
        {                                                       \
          if ((c1 & 0xFE) != 0xC0)                              \
            {                                                   \
              src += BYTES_BY_CHAR_HEAD (c1) - 1;               \
              c1 = -1;                                          \
            }                                                   \
          else                                                  \
            c1 = ((c1 & 1) << 6) | *src++;                      \
        }                                                       \
    } while (c1 < 0);                                           \
    if (src == src_end)                                         \
      goto no_more_source;                                      \
    c2 = *src++;                                                \
    if (multibytep && (c2 & 0x80) != 0)                         \
      c2 = ((c2 & 0xFE) == 0xC0                                 \
            ? ((c2 & 1) << 6) | *src++                          \
            : -1);                                              \
  } while (0)
692 
693 
/* Store the byte C at DST, advance DST past it, and count one more
   produced character in PRODUCED_CHARS.  The caller should assure
   that C is 0..127, and declare and set the variables `dst' and
   `produced_chars' appropriately in advance.  */

#define EMIT_ONE_ASCII_BYTE(c)          \
  do {                                  \
    produced_chars += 1;                \
    *dst = (c);                         \
    dst++;                              \
  } while (0)


/* Like EMIT_ONE_ASCII_BYTE, but store the two bytes C1 and C2, in
   that order.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst = (c1);                        \
    dst++;                              \
    *dst = (c2);                        \
    dst++;                              \
  } while (0)
715 
716 
/* Store the byte C at DST, advance DST, and increment PRODUCED_CHARS.
   If MULTIBYTEP, the byte is written in multibyte form via
   CHAR_STRING, after mapping values of 0x80 and above through
   BYTE8_TO_CHAR.  The caller should declare and set the variables
   `dst', `multibytep' and `produced_chars' appropriately in advance.  */

#define EMIT_ONE_BYTE(c)                        \
  do {                                          \
    produced_chars++;                           \
    if (! multibytep)                           \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        unsigned ch = (c);                      \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        dst += CHAR_STRING (ch, dst);           \
      }                                         \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit the two bytes C1 and C2, in that
   order.  */

#define EMIT_TWO_BYTES(c1, c2)                  \
  do {                                          \
    EMIT_ONE_BYTE (c1);                         \
    EMIT_ONE_BYTE (c2);                         \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit the three bytes C1, C2 and C3, in that
   order.  */

#define EMIT_THREE_BYTES(c1, c2, c3)            \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_ONE_BYTE (c3);                         \
  } while (0)


/* Like EMIT_ONE_BYTE, but emit the four bytes C1..C4, in that
   order.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
776 
777 
778 static void
record_conversion_result(struct coding_system * coding,enum coding_result_code result)779 record_conversion_result (struct coding_system *coding,
780 			  enum coding_result_code result)
781 {
782   coding->result = result;
783   switch (result)
784     {
785     case CODING_RESULT_INSUFFICIENT_SRC:
786       Vlast_code_conversion_error = Qinsufficient_source;
787       break;
788     case CODING_RESULT_INVALID_SRC:
789       Vlast_code_conversion_error = Qinvalid_source;
790       break;
791     case CODING_RESULT_INTERRUPT:
792       Vlast_code_conversion_error = Qinterrupted;
793       break;
794     case CODING_RESULT_INSUFFICIENT_DST:
795       /* Don't record this error in Vlast_code_conversion_error
796 	 because it happens just temporarily and is resolved when the
797 	 whole conversion is finished.  */
798       break;
799     case CODING_RESULT_SUCCESS:
800       break;
801     default:
802       Vlast_code_conversion_error = intern ("Unknown error");
803     }
804 }
805 
/* These wrapper macros are used to preserve validity of pointers into
   buffer text across calls to decode_char, encode_char, etc, which
   could cause relocation of buffers if it loads a charset map,
   because loading a charset map allocates large structures.

   Each one clears charset_map_loaded, performs the wrapped operation,
   and, if charset_map_loaded was set meanwhile, re-derives the source
   or destination address of CODING and shifts the caller's pointers by
   the resulting nonzero offset.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        ptrdiff_t offset = coding_change_source (coding);                    \
                                                                             \
        if (offset != 0)                                                     \
          {                                                                  \
            src += offset;                                                   \
            src_base += offset;                                              \
            src_end += offset;                                               \
          }                                                                  \
      }                                                                      \
  } while (0)

#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    code = ENCODE_CHAR (charset, c);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset != 0)                                                \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)

#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    charset = char_charset (c, charset_list, code_return);              \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset != 0)                                                \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)

#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    result = CHAR_CHARSET_P (c, charset);                               \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset != 0)                                                \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
867 
868 
/* If there is not at least BYTES length of room at dst, allocate more
   memory for coding->destination and update dst and dst_end.  We don't
   have to take care of coding->source which will be relocated.  It is
   handled by calling coding_set_source in encode_coding.  */
873 
#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst_end <= dst + (bytes))                                       \
      {                                                                 \
        /* Ask for BYTES plus one byte per character still waiting     \
           in charbuf.  */                                              \
        ptrdiff_t more_bytes = (charbuf_end - charbuf) + (bytes);       \
                                                                        \
        dst = alloc_destination (coding, more_bytes, dst);              \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
884 
885 
/* Store the multibyte form of the character C at P, and advance P to
   the end of the multibyte form (P is incremented by the value that
   CHAR_STRING returns).  Note that P is evaluated twice.  This used
   to be like adding CHAR_STRING without ever calling
   MAYBE_UNIFY_CHAR, but nowadays we don't call MAYBE_UNIFY_CHAR in
   CHAR_STRING.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) ((p) += CHAR_STRING (c, p))
892 
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of the multibyte form.  This simply
   passes the address of P to string_char_advance, so P must be an
   lvalue.  This used to be like string_char_advance without ever
   calling MAYBE_UNIFY_CHAR, but nowadays string_char_advance doesn't
   call MAYBE_UNIFY_CHAR.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) string_char_advance (&(p))
899 
900 /* Set coding->source from coding->src_object.  */
901 
902 static void
coding_set_source(struct coding_system * coding)903 coding_set_source (struct coding_system *coding)
904 {
905   if (BUFFERP (coding->src_object))
906     {
907       struct buffer *buf = XBUFFER (coding->src_object);
908 
909       if (coding->src_pos < 0)
910 	coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
911       else
912 	coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
913     }
914   else if (STRINGP (coding->src_object))
915     {
916       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
917     }
918   else
919     {
920       /* Otherwise, the source is C string and is never relocated
921 	 automatically.  Thus we don't have to update anything.  */
922     }
923 }
924 
925 
926 /* Set coding->source from coding->src_object, and return how many
927    bytes coding->source was changed.  */
928 
929 static ptrdiff_t
coding_change_source(struct coding_system * coding)930 coding_change_source (struct coding_system *coding)
931 {
932   const unsigned char *orig = coding->source;
933   coding_set_source (coding);
934   return coding->source - orig;
935 }
936 
937 
938 /* Set coding->destination from coding->dst_object.  */
939 
940 static void
coding_set_destination(struct coding_system * coding)941 coding_set_destination (struct coding_system *coding)
942 {
943   if (BUFFERP (coding->dst_object))
944     {
945       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
946 	{
947 	  coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
948 	  coding->dst_bytes = (GAP_END_ADDR
949 			       - (coding->src_bytes - coding->consumed)
950 			       - coding->destination);
951 	}
952       else
953 	{
954 	  /* We are sure that coding->dst_pos_byte is before the gap
955 	     of the buffer. */
956 	  coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
957 				 + coding->dst_pos_byte - BEG_BYTE);
958 	  coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
959 			       - coding->destination);
960 	}
961     }
962   else
963     {
964       /* Otherwise, the destination is C string and is never relocated
965 	 automatically.  Thus we don't have to update anything.  */
966     }
967 }
968 
969 
970 /* Set coding->destination from coding->dst_object, and return how
971    many bytes coding->destination was changed.  */
972 
973 static ptrdiff_t
coding_change_destination(struct coding_system * coding)974 coding_change_destination (struct coding_system *coding)
975 {
976   const unsigned char *orig = coding->destination;
977   coding_set_destination (coding);
978   return coding->destination - orig;
979 }
980 
981 
982 static void
coding_alloc_by_realloc(struct coding_system * coding,ptrdiff_t bytes)983 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
984 {
985   ptrdiff_t newbytes;
986   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
987       || SIZE_MAX < newbytes)
988     string_overflow ();
989   coding->destination = xrealloc (coding->destination, newbytes);
990   coding->dst_bytes = newbytes;
991 }
992 
/* Make BYTES more bytes of room in the destination buffer of CODING
   by enlarging its gap.

   If the source and destination objects are the same, the gap is
   enlarged with make_gap while the current gap contents are
   protected; GAP_HEAD_USED is the number of bytes at the head of the
   gap that already hold produced data.  Otherwise the gap of the
   destination buffer is enlarged with make_gap_1.  */

static void
coding_alloc_by_making_gap (struct coding_system *coding,
			    ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
	 consumed data at the tail.  To preserve those data, we at
	 first make the gap size to zero, then increase the gap
	 size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Temporarily move the gap position past the produced data and
	 count the whole old gap as text (GAP_SIZE becomes zero while
	 the Z/ZV counters grow by its size).  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary bookkeeping in reverse order; the gap is
	 now ADD bytes larger than what make_gap created.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    make_gap_1 (XBUFFER (coding->dst_object), bytes);
}
1014 
1015 
1016 static unsigned char *
alloc_destination(struct coding_system * coding,ptrdiff_t nbytes,unsigned char * dst)1017 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1018 		   unsigned char *dst)
1019 {
1020   ptrdiff_t offset = dst - coding->destination;
1021 
1022   if (BUFFERP (coding->dst_object))
1023     {
1024       struct buffer *buf = XBUFFER (coding->dst_object);
1025 
1026       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1027     }
1028   else
1029     coding_alloc_by_realloc (coding, nbytes);
1030   coding_set_destination (coding);
1031   dst = coding->destination + offset;
1032   return dst;
1033 }
1034 
/** Macros for annotations.  */

/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation; it is stored
   negated.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */

/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5
1068 
/* Store the three common header elements of an annotation at BUF
   (-LEN, MASK and NCHARS, in this order), advancing BUF past them,
   and set coding->annotated.  BUF must be a modifiable pointer
   lvalue, and a variable named `coding' must be in scope.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the unbraced body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)	\
  do {							\
    *(buf)++ = -(len);					\
    *(buf)++ = (mask);					\
    *(buf)++ = (nchars);				\
    coding->annotated = 1;				\
  } while (0)
1076 
/* Store the header of a composition annotation at BUF, advancing BUF
   past it: the common header written by ADD_ANNOTATION_DATA (with
   length 5 and CODING_ANNOTATE_COMPOSITION_MASK), followed by NBYTES
   and METHOD.  BUF must be a modifiable pointer lvalue; it is
   parenthesized so that an argument such as `*pp' advances the
   intended pointer.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)		    \
  do {									    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);						    \
    *(buf)++ = (method);						    \
  } while (0)
1083 
1084 
/* Store a charset annotation at BUF, advancing BUF past it: the
   common header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  BUF
   must be a modifiable pointer lvalue; it is parenthesized so that an
   argument such as `*pp' advances the intended pointer.  */

#define ADD_CHARSET_DATA(buf, nchars, id)				\
  do {									\
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);	\
    *(buf)++ = (id);							\
  } while (0)
1090 
1091 
/* Bitmasks for coding->eol_seen.  Each end-of-line format has its
   own bit, so several formats can be recorded at once by OR-ing the
   masks together.  */

#define EOL_SEEN_NONE	0	/* No end-of-line sequence seen.  */
#define EOL_SEEN_LF	1	/* A bare '\n' was seen.  */
#define EOL_SEEN_CR	2	/* A '\r' not followed by '\n' was seen.  */
#define EOL_SEEN_CRLF	4	/* A '\r' '\n' pair was seen.  */
1098 
1099 
1100 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1101 
1102 
1103 
1104 
1105 /*** 3. UTF-8 ***/
1106 
/* Predicates that classify one byte C of a UTF-8 sequence by its
   high-order bits.  They are used by the UTF-8 detector and decoder
   below.  (For the conventions of the detector, see the above
   "GENERAL NOTES on `detect_coding_XXX ()' functions"; it returns
   true if a text is encoded in UTF-8.)  */

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)		/* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)	/* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)	/* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)	/* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)	/* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)	/* 111110xx */

/* The three bytes of the UTF-8 signature (BOM), i.e. the UTF-8
   encoding of U+FEFF.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1120 
/* Check whether the source text of CODING can be UTF-8 and record
   the verdict in DETECT_INFO.  Return false (after setting
   CATEGORY_MASK_UTF_8 in DETECT_INFO->rejected) if an invalid
   sequence is found, or if the last block ends in the middle of a
   sequence.  Otherwise return true and store the number of bytes and
   characters scanned in coding->detected_utf8_bytes and
   coding->detected_utf8_chars.

   Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   NOTE(review): the EOL formats seen are accumulated in the local
   variable eol_seen only; nothing visible in this function stores it
   back into CODING -- confirm whether that is intended.

   Bytes are fetched with ONE_MORE_BYTE (defined elsewhere); judging
   from the otherwise unreachable label `no_more_source', it jumps
   there when the source is exhausted.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
		     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  ptrdiff_t nchars = coding->head_ascii;
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  /* Skip the leading ASCII bytes; they are already counted in
     NCHARS.  */
  src += nchars;

  if (src == coding->source	/* BOM should be at the head.  */
      && src + 3 < src_end	/* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  /* Validate one sequence per iteration.  Every `break' below means
     that an invalid sequence was found.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, to be able to tell at
	 `no_more_source' whether the source ended in the middle of
	 it.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
	{
	  nchars++;
	  if (c == '\r')
	    {
	      if (src < src_end && *src == '\n')
		{
		  eol_seen |= EOL_SEEN_CRLF;
		  src++;
		  nchars++;
		}
	      else
		eol_seen |= EOL_SEEN_CR;
	    }
	  else if (c == '\n')
	    eol_seen |= EOL_SEEN_LF;
	  continue;
	}
      /* C starts a multi-octet sequence: check the following octets
	 one at a time until the length implied by C is reached.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
	break;
      if (UTF_8_2_OCTET_LEADING_P (c))
	{
	  nchars++;
	  continue;
	}
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
	break;
      if (UTF_8_3_OCTET_LEADING_P (c))
	{
	  nchars++;
	  continue;
	}
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
	break;
      if (UTF_8_4_OCTET_LEADING_P (c))
	{
	  nchars++;
	  continue;
	}
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
	break;
      if (UTF_8_5_OCTET_LEADING_P (c)
	  /* If we ever need to increase MAX_CHAR, the below may need
	     to be reviewed.  */
	  && c < MAX_MULTIBYTE_LEADING_CODE)
	{
	  nchars++;
	  continue;
	}
      break;
    }
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended.  If it ended in the middle of a sequence and
     no more data will follow, the text is not valid UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (bytes EF BB BF) doesn't necessarily mean a
	 BOM, so the variant without signature stays a candidate
	 too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
	/* The found characters are less than source bytes, which
	   means that we found a valid non-ASCII characters.  */
	detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1238 
1239 
/* Decode the UTF-8 source text of CODING, starting at offset
   coding->consumed, into character codes appended to coding->charbuf.
   Stop when the source is exhausted or charbuf is full, then update
   coding->consumed, coding->consumed_char and coding->charbuf_used.

   A byte that does not start a valid sequence (including overlong
   forms and surrogates) is stored as a single raw byte instead of
   being dropped.

   Bytes are fetched with ONE_MORE_BYTE (defined elsewhere); judging
   from the otherwise unreachable label `no_more_source', it jumps
   there when the source is exhausted.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip a BOM at the current position.  If the first three bytes
	 are anything else, rewind SRC so that they are decoded as
	 ordinary text.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
	src = src_base;
      else
	{
	  ONE_MORE_BYTE (c2);
	  if (! UTF_8_EXTRA_OCTET_P (c2))
	    src = src_base;
	  else
	    {
	      ONE_MORE_BYTE (c3);
	      if (! UTF_8_EXTRA_OCTET_P (c3))
		src = src_base;
	      else
		{
		  if ((c1 != UTF_8_BOM_1)
		      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
		    src = src_base;
		  else
		    CODING_UTF_8_BOM (coding) = utf_without_bom;
		}
	    }
	}
    }
  /* Whatever was found, do not look for a BOM again in a later
     call.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember the state at the start of this character; it is what
	 gets reported as consumed if we stop in the middle of it.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
	{
	  /* The byte held in BYTE_AFTER_CR has been read but not yet
	     decoded; give it back.  */
	  if (byte_after_cr >= 0)
	    src_base--;
	  break;
	}

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
	  && charbuf < charbuf_end - 6 && src < src_end - 6)
	{
	  /* Copy up to four ASCII bytes per round without further
	     bound checks; the margin of 6 checked above makes that
	     safe.  Leave the loop at the first non-ASCII byte.  */
	  while (charbuf < charbuf_end - 6 && src < src_end - 6)
	    {
	      c1 = *src;
	      if (c1 & 0x80)
		break;
	      src++;
	      consumed_chars++;
	      *charbuf++ = c1;

	      c1 = *src;
	      if (c1 & 0x80)
		break;
	      src++;
	      consumed_chars++;
	      *charbuf++ = c1;

	      c1 = *src;
	      if (c1 & 0x80)
		break;
	      src++;
	      consumed_chars++;
	      *charbuf++ = c1;

	      c1 = *src;
	      if (c1 & 0x80)
		break;
	      src++;
	      consumed_chars++;
	      *charbuf++ = c1;
	    }
	  /* If we handled at least one character, restart the main loop.  */
	  if (src != src_base)
	    continue;
	}

      if (byte_after_cr >= 0)
	c1 = byte_after_cr, byte_after_cr = -1;
      else
	ONE_MORE_BYTE (c1);
      if (c1 < 0)
	{
	  /* A negative value from ONE_MORE_BYTE is stored as the
	     character -C1.  */
	  c = - c1;
	}
      else if (UTF_8_1_OCTET_P (c1))
	{
	  /* With DOS-style EOL, read one byte ahead after a CR; it is
	     consumed as C1 of the next iteration.  */
	  if (eol_dos && c1 == '\r')
	    ONE_MORE_BYTE (byte_after_cr);
	  c = c1;
	}
      else
	{
	  ONE_MORE_BYTE (c2);
	  if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
	    goto invalid_code;
	  if (UTF_8_2_OCTET_LEADING_P (c1))
	    {
	      c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
	      /* Reject overlong sequences here and below.  Encoders
		 producing them are incorrect, they can be misleading,
		 and they mess up read/write invariance.  */
	      if (c < 128)
		goto invalid_code;
	    }
	  else
	    {
	      ONE_MORE_BYTE (c3);
	      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
		goto invalid_code;
	      if (UTF_8_3_OCTET_LEADING_P (c1))
		{
		  c = (((c1 & 0xF) << 12)
		       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
		  if (c < 0x800
		      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
		    goto invalid_code;
		}
	      else
		{
		  ONE_MORE_BYTE (c4);
		  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
		    goto invalid_code;
		  if (UTF_8_4_OCTET_LEADING_P (c1))
		    {
		    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
			 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
		    if (c < 0x10000)
		      goto invalid_code;
		    }
		  else
		    {
		      ONE_MORE_BYTE (c5);
		      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
			goto invalid_code;
		      if (UTF_8_5_OCTET_LEADING_P (c1))
			{
			  c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
			       | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
			       | (c5 & 0x3F));
			  /* Reject codes beyond MAX_CHAR as well as
			     overlong forms.  */
			  if ((c > MAX_CHAR) || (c < 0x200000))
			    goto invalid_code;
			}
		      else
			goto invalid_code;
		    }
		}
	    }
	}

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and pass its
	 first byte through as a single character (ASCII as is, any
	 other byte as a raw 8-bit character); decoding resumes at the
	 byte after it.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

 no_more_source:
  /* Report only what was completely decoded; a partially read
     sequence stays unconsumed for the next call.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1427 
1428 
/* Encode the character codes in coding->charbuf as UTF-8 and append
   the result to the destination of CODING at offset coding->produced.
   Update coding->produced and coding->produced_char, record
   CODING_RESULT_SUCCESS, and return 0.

   The macros ASSURE_DESTINATION and EMIT_* used below operate on the
   local variables of this function (dst, dst_end, charbuf,
   charbuf_end, coding, ...), so those names must not be changed.

   NOTE(review): unlike encode_coding_utf_16, this function is not
   declared static here -- confirm whether external linkage is
   intended.  */

bool
encode_coding_utf_8 (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced_chars = 0;
  int c;

  if (CODING_UTF_8_BOM (coding) == utf_with_bom)
    {
      ASSURE_DESTINATION (3);
      EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
      /* Emit the BOM only once.  */
      CODING_UTF_8_BOM (coding) = utf_without_bom;
    }

  if (multibytep)
    {
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      while (charbuf < charbuf_end)
	{
	  /* STR receives the multibyte form of one character; PEND
	     points past its last byte.  */
	  unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;

	  ASSURE_DESTINATION (safe_room);
	  c = *charbuf++;
	  if (CHAR_BYTE8_P (c))
	    {
	      /* A raw 8-bit character is emitted as its byte value.  */
	      c = CHAR_TO_BYTE8 (c);
	      EMIT_ONE_BYTE (c);
	    }
	  else
	    {
	      CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
	      for (p = str; p < pend; p++)
		EMIT_ONE_BYTE (*p);
	    }
	}
    }
  else
    {
      int safe_room = MAX_MULTIBYTE_LENGTH;

      while (charbuf < charbuf_end)
	{
	  ASSURE_DESTINATION (safe_room);
	  c = *charbuf++;
	  if (CHAR_BYTE8_P (c))
	    *dst++ = CHAR_TO_BYTE8 (c);
	  else
	    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
	}
      /* In this branch the count is the number of bytes stored since
	 the start of this call.  */
      produced_chars = dst - (coding->destination + coding->produced);
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
1490 
1491 
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of UTF-16 based coding systems.

   The verdict is recorded in DETECT_INFO.  A leading BOM (FF FE or
   FE FF) decides the byte order.  Without a BOM, the BOM-dependent
   categories are rejected, the remaining source is scanned to reject
   the signature-less categories where the byte values are too
   dispersed, and false is returned.

   Bytes are fetched with TWO_MORE_BYTES (defined elsewhere); judging
   from the otherwise unreachable label `no_more_source', it jumps
   there when the source is exhausted.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
		      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete text with an odd coding->src_chars cannot be
     UTF-16.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
			     | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
				| CATEGORY_MASK_UTF_16_BE_NOSIG
				| CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
			     | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
				| CATEGORY_MASK_UTF_16_BE_NOSIG
				| CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
	 O is odd.  If both are high, we assume binary data.*/
      /* E[b] (O[b]) is nonzero once byte value b has been seen at an
	 even (odd) offset; E_NUM and O_NUM count the distinct values
	 seen so far.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No BOM, so the categories that depend on one are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
				|CATEGORY_MASK_UTF_16_BE
				| CATEGORY_MASK_UTF_16_LE);

      /* Keep scanning until every UTF-16 category is rejected or an
	 unusable byte pair is read.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
	     != CATEGORY_MASK_UTF_16)
	{
	  TWO_MORE_BYTES (c1, c2);
	  if (c2 < 0)
	    break;
	  if (! e[c1])
	    {
	      e[c1] = 1;
	      e_num++;
	      if (e_num >= 128)
		detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
	    }
	  if (! o[c2])
	    {
	      o[c2] = 1;
	      o_num++;
	      if (o_num >= 128)
		detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
	    }
	}
      return 0;
    }

 no_more_source:
  return 1;
}
1577 
1578 static void
decode_coding_utf_16(struct coding_system * coding)1579 decode_coding_utf_16 (struct coding_system *coding)
1580 {
1581   const unsigned char *src = coding->source + coding->consumed;
1582   const unsigned char *src_end = coding->source + coding->src_bytes;
1583   const unsigned char *src_base;
1584   int *charbuf = coding->charbuf + coding->charbuf_used;
1585   /* We may produces at most 3 chars in one loop.  */
1586   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1587   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1588   bool multibytep = coding->src_multibyte;
1589   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1590   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1591   int surrogate = CODING_UTF_16_SURROGATE (coding);
1592   bool eol_dos
1593     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1594   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1595 
1596   if (bom == utf_with_bom)
1597     {
1598       int c, c1, c2;
1599 
1600       src_base = src;
1601       ONE_MORE_BYTE (c1);
1602       ONE_MORE_BYTE (c2);
1603       c = (c1 << 8) | c2;
1604 
1605       if (endian == utf_16_big_endian
1606 	  ? c != 0xFEFF : c != 0xFFFE)
1607 	{
1608 	  /* The first two bytes are not BOM.  Treat them as bytes
1609 	     for a normal character.  */
1610 	  src = src_base;
1611 	}
1612       CODING_UTF_16_BOM (coding) = utf_without_bom;
1613     }
1614   else if (bom == utf_detect_bom)
1615     {
1616       /* We have already tried to detect BOM and failed in
1617 	 detect_coding.  */
1618       CODING_UTF_16_BOM (coding) = utf_without_bom;
1619     }
1620 
1621   while (1)
1622     {
1623       int c, c1, c2;
1624 
1625       src_base = src;
1626       consumed_chars_base = consumed_chars;
1627 
1628       if (charbuf >= charbuf_end)
1629 	{
1630 	  if (byte_after_cr1 >= 0)
1631 	    src_base -= 2;
1632 	  break;
1633 	}
1634 
1635       if (byte_after_cr1 >= 0)
1636 	c1 = byte_after_cr1, byte_after_cr1 = -1;
1637       else
1638 	ONE_MORE_BYTE (c1);
1639       if (c1 < 0)
1640 	{
1641 	  *charbuf++ = -c1;
1642 	  continue;
1643 	}
1644       if (byte_after_cr2 >= 0)
1645 	c2 = byte_after_cr2, byte_after_cr2 = -1;
1646       else
1647 	ONE_MORE_BYTE (c2);
1648       if (c2 < 0)
1649 	{
1650 	  *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1651 	  *charbuf++ = -c2;
1652 	  continue;
1653 	}
1654       c = (endian == utf_16_big_endian
1655 	   ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1656 
1657       if (surrogate)
1658 	{
1659 	  if (! UTF_16_LOW_SURROGATE_P (c))
1660 	    {
1661 	      if (endian == utf_16_big_endian)
1662 		c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1663 	      else
1664 		c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1665 	      *charbuf++ = c1;
1666 	      *charbuf++ = c2;
1667 	      if (UTF_16_HIGH_SURROGATE_P (c))
1668 		CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1669 	      else
1670 		*charbuf++ = c;
1671 	    }
1672 	  else
1673 	    {
1674 	      c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1675 	      CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1676 	      *charbuf++ = 0x10000 + c;
1677 	    }
1678 	}
1679       else
1680 	{
1681 	  if (UTF_16_HIGH_SURROGATE_P (c))
1682 	    CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1683 	  else
1684 	    {
1685 	      if (eol_dos && c == '\r')
1686 		{
1687 		  ONE_MORE_BYTE (byte_after_cr1);
1688 		  ONE_MORE_BYTE (byte_after_cr2);
1689 		}
1690 	      *charbuf++ = c;
1691 	    }
1692 	}
1693     }
1694 
1695  no_more_source:
1696   coding->consumed_char += consumed_chars_base;
1697   coding->consumed = src_base - coding->source;
1698   coding->charbuf_used = charbuf - coding->charbuf;
1699 }
1700 
/* Encode the character codes in coding->charbuf as UTF-16, in the
   byte order given by CODING_UTF_16_ENDIAN, and append the result to
   the destination of CODING at offset coding->produced.  Update
   coding->produced and coding->produced_char, record
   CODING_RESULT_SUCCESS, and return 0.

   The macros ASSURE_DESTINATION and EMIT_* used below operate on the
   local variables of this function (dst, dst_end, charbuf,
   charbuf_end, coding, ...), so those names must not be changed.  */

static bool
encode_coding_utf_16 (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = 8;
  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
  ptrdiff_t produced_chars = 0;
  int c;

  if (bom != utf_without_bom)
    {
      ASSURE_DESTINATION (safe_room);
      if (big_endian)
	EMIT_TWO_BYTES (0xFE, 0xFF);
      else
	EMIT_TWO_BYTES (0xFF, 0xFE);
      /* Emit the BOM only once.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* A character outside the Unicode range is replaced by the
	 default character of the coding system.  */
      if (c > MAX_UNICODE_CHAR)
	c = coding->default_char;

      if (c < 0x10000)
	{
	  /* One 16-bit code unit.  */
	  if (big_endian)
	    EMIT_TWO_BYTES (c >> 8, c & 0xFF);
	  else
	    EMIT_TWO_BYTES (c & 0xFF, c >> 8);
	}
      else
	{
	  /* A surrogate pair: C1 is the high surrogate, C2 the low
	     one.  */
	  int c1, c2;

	  c -= 0x10000;
	  c1 = (c >> 10) + 0xD800;
	  c2 = (c & 0x3FF) + 0xDC00;
	  if (big_endian)
	    EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
	  else
	    EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
	}
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced = dst - coding->destination;
  coding->produced_char += produced_chars;
  return 0;
}
1757 
1758 
1759 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1760 
1761 /* Emacs' internal format for representation of multiple character
1762    sets is a kind of multi-byte encoding, i.e. characters are
1763    represented by variable-length sequences of one-byte codes.
1764 
1765    ASCII characters and control characters (e.g. `tab', `newline') are
1766    represented by one-byte sequences which are their ASCII codes, in
1767    the range 0x00 through 0x7F.
1768 
1769    8-bit characters of the range 0x80..0x9F are represented by
1770    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1771    code + 0x20).
1772 
1773    8-bit characters of the range 0xA0..0xFF are represented by
1774    one-byte sequences which are their 8-bit code.
1775 
1776    The other characters are represented by a sequence of `base
1777    leading-code', optional `extended leading-code', and one or two
1778    `position-code's.  The length of the sequence is determined by the
1779    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1780    whereas extended leading-code and position-code take the range 0xA0
1781    through 0xFF.  See `charset.h' for more details about leading-code
1782    and position-code.
1783 
1784    --- CODE RANGE of Emacs' internal format ---
1785    character set	range
1786    -------------	-----
1787    ascii		0x00..0x7F
1788    eight-bit-control	LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic	0xA0..0xFF
1790    ELSE			0x81..0x9D + [0xA0..0xFF]+
1791    ---------------------------------------------
1792 
1793    As this is the internal character representation, the format is
1794    usually not used externally (i.e. in a file or in a data sent to a
1795    process).  But, it is possible to have a text externally in this
1796    format (i.e. by encoding by the coding system `emacs-mule').
1797 
1798    In that case, a sequence of one-byte codes has a slightly different
1799    form.
1800 
1801    At first, all characters in eight-bit-control are represented by
1802    one-byte sequences which are their 8-bit code.
1803 
1804    Next, character composition data are represented by the byte
1805    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1806    where,
1807 	METHOD is 0xF2 plus one of composition method (enum
1808 	composition_method),
1809 
1810 	BYTES is 0xA0 plus a byte length of this composition data,
1811 
1812 	CHARS is 0xA0 plus a number of characters composed by this
1813 	data,
1814 
	COMPONENTs are characters in multibyte form or composition
	rules encoded as two bytes of ASCII codes.
1817 
1818    In addition, for backward compatibility, the following formats are
1819    also recognized as composition data on decoding.
1820 
1821    0x80 MSEQ ...
1822    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1823 
1824    Here,
	MSEQ is a multibyte form, but in one of these special formats:
1826 	  ASCII: 0xA0 ASCII_CODE+0x80,
1827 	  other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1828 	RULE is a one byte code of the range 0xA0..0xF0 that
1829 	represents a composition rule.
1830   */
1831 
/* Table indexed by a byte value.  detect_coding_emacs_mule takes the
   entry for a byte of 0x80 or above, minus one, as the number of
   bytes that must follow that byte in an emacs-mule sequence.  */
char emacs_mule_bytes[256];
1833 
1834 
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in 'emacs-mule'.

   The verdict is recorded in DETECT_INFO.  The text is rejected if it
   contains ESC, SI or SO, a too short composition sequence, or a
   multibyte sequence whose length does not match emacs_mule_bytes.

   Bytes are fetched with ONE_MORE_BYTE (defined elsewhere); judging
   from the otherwise unreachable label `no_more_source', it jumps
   there when the source is exhausted.  */

static bool
detect_coding_emacs_mule (struct coding_system *coding,
			  struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int c;
  /* Set to CATEGORY_MASK_EMACS_MULE once a valid non-ASCII sequence
     has been seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Validate one sequence per iteration.  Every `break' out of this
     loop means that the text is not emacs-mule.  */
  while (1)
    {
      /* Remember where this sequence starts, to be able to tell at
	 `no_more_source' whether the source ended in the middle of
	 it.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
	continue;
      if (c == 0x80)
	{
	  /* Perhaps the start of composite character.  We simply skip
	     it because analyzing it is too heavy for detecting.  But,
	     at least, we check that the composite character
	     constitutes of more than 4 bytes.  */
	  const unsigned char *src_start;

	repeat:
	  src_start = src;
	  /* Skip all bytes of 0xA0 and above; C ends up as the first
	     byte below 0xA0.  */
	  do
	    {
	      ONE_MORE_BYTE (c);
	    }
	  while (c >= 0xA0);

	  if (src - src_start <= 4)
	    break;
	  found = CATEGORY_MASK_EMACS_MULE;
	  /* Another composition follows immediately.  */
	  if (c == 0x80)
	    goto repeat;
	}

      if (c < 0x80)
	{
	  /* The presence of these control codes rejects the text.  */
	  if (c < 0x20
	      && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
	    break;
	}
      else
	{
	  /* C is a leading byte; check that it is followed by the
	     expected number of bytes of 0xA0 and above.  */
	  int more_bytes = emacs_mule_bytes[c] - 1;

	  while (more_bytes > 0)
	    {
	      ONE_MORE_BYTE (c);
	      if (c < 0xA0)
		{
		  src--;	/* Unread the last byte.  */
		  break;
		}
	      more_bytes--;
	    }
	  if (more_bytes != 0)
	    break;
	  found = CATEGORY_MASK_EMACS_MULE;
	}
    }
  detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  return 0;

 no_more_source:
  /* The source ended.  If it ended in the middle of a sequence and
     no more data will follow, the text is not valid emacs-mule.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1919 
1920 
1921 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1922    character.  If CMP_STATUS indicates that we must expect MSEQ or
1923    RULE described above, decode it and return the negative value of
1924    the decoded character or rule.  If an invalid byte is found, return
1925    -1.  If SRC is too short, return -2.  */
1926 
1927 static int
emacs_mule_char(struct coding_system * coding,const unsigned char * src,int * nbytes,int * nchars,int * id,struct composition_status * cmp_status)1928 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1929 		 int *nbytes, int *nchars, int *id,
1930 		 struct composition_status *cmp_status)
1931 {
1932   const unsigned char *src_end = coding->source + coding->src_bytes;
1933   const unsigned char *src_base = src;
1934   bool multibytep = coding->src_multibyte;
1935   int charset_ID;
1936   unsigned code;
1937   int c;
1938   ptrdiff_t consumed_chars = 0;
1939   bool mseq_found = 0;
1940 
1941   ONE_MORE_BYTE (c);
1942   if (c < 0)
1943     {
1944       c = -c;
1945       charset_ID = emacs_mule_charset[0];
1946     }
1947   else
1948     {
1949       if (c >= 0xA0)
1950 	{
1951 	  if (cmp_status->state != COMPOSING_NO
1952 	      && cmp_status->old_form)
1953 	    {
1954 	      if (cmp_status->state == COMPOSING_CHAR)
1955 		{
1956 		  if (c == 0xA0)
1957 		    {
1958 		      ONE_MORE_BYTE (c);
1959 		      c -= 0x80;
1960 		      if (c < 0)
1961 			goto invalid_code;
1962 		    }
1963 		  else
1964 		    c -= 0x20;
1965 		  mseq_found = 1;
1966 		}
1967 	      else
1968 		{
1969 		  *nbytes = src - src_base;
1970 		  *nchars = consumed_chars;
1971 		  return -c;
1972 		}
1973 	    }
1974 	  else
1975 	    goto invalid_code;
1976 	}
1977 
1978       switch (emacs_mule_bytes[c])
1979 	{
1980 	case 2:
1981 	  if ((charset_ID = emacs_mule_charset[c]) < 0)
1982 	    goto invalid_code;
1983 	  ONE_MORE_BYTE (c);
1984 	  if (c < 0xA0)
1985 	    goto invalid_code;
1986 	  code = c & 0x7F;
1987 	  break;
1988 
1989 	case 3:
1990 	  if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1991 	      || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1992 	    {
1993 	      ONE_MORE_BYTE (c);
1994 	      if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1995 		goto invalid_code;
1996 	      ONE_MORE_BYTE (c);
1997 	      if (c < 0xA0)
1998 		goto invalid_code;
1999 	      code = c & 0x7F;
2000 	    }
2001 	  else
2002 	    {
2003 	      if ((charset_ID = emacs_mule_charset[c]) < 0)
2004 		goto invalid_code;
2005 	      ONE_MORE_BYTE (c);
2006 	      if (c < 0xA0)
2007 		goto invalid_code;
2008 	      code = (c & 0x7F) << 8;
2009 	      ONE_MORE_BYTE (c);
2010 	      if (c < 0xA0)
2011 		goto invalid_code;
2012 	      code |= c & 0x7F;
2013 	    }
2014 	  break;
2015 
2016 	case 4:
2017 	  ONE_MORE_BYTE (c);
2018 	  if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2019 	    goto invalid_code;
2020 	  ONE_MORE_BYTE (c);
2021 	  if (c < 0xA0)
2022 	    goto invalid_code;
2023 	  code = (c & 0x7F) << 8;
2024 	  ONE_MORE_BYTE (c);
2025 	  if (c < 0xA0)
2026 	    goto invalid_code;
2027 	  code |= c & 0x7F;
2028 	  break;
2029 
2030 	case 1:
2031 	  code = c;
2032 	  charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2033 	  break;
2034 
2035 	default:
2036 	  emacs_abort ();
2037 	}
2038       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2039 			  CHARSET_FROM_ID (charset_ID), code, c);
2040       if (c < 0)
2041 	goto invalid_code;
2042     }
2043   *nbytes = src - src_base;
2044   *nchars = consumed_chars;
2045   if (id)
2046     *id = charset_ID;
2047   return (mseq_found ? -c : c);
2048 
2049  no_more_source:
2050   return -2;
2051 
2052  invalid_code:
2053   return -1;
2054 }
2055 
2056 
2057 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2058 
2059 /* Handle these composition sequence ('|': the end of header elements,
2060    BYTES and CHARS >= 0xA0):
2061 
2062    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2063    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2064    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2065 
   and these old forms:
2067 
2068    (4) relative composition: 0x80 | MSEQ ... MSEQ
2069    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2070 
2071    When the starter 0x80 and the following header elements are found,
2072    this annotation header is produced.
2073 
2074 	[ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2075 
2076    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2077    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2078 
2079    Then, upon reading the following elements, these codes are produced
2080    until the composition end is found:
2081 
2082    (1) CHAR ... CHAR
2083    (2) ALT ... ALT CHAR ... CHAR
2084    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2085    (4) CHAR ... CHAR
2086    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2087 
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2090 
2091    (1) LENGTH: unchanged, NCHARS: unchanged
2092    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2093    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2094    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2095    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2096 
2097    If an error is found while composing, the annotation header is
2098    changed to the original composition header (plus filler -1s) as
2099    below:
2100 
2101    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2103 
2104    and the sequence [ -2 DECODED-RULE ] is changed to the original
2105    byte sequence as below:
2106 	o the original byte sequence is B: [ B -1 ]
2107 	o the original byte sequence is B1 B2: [ B1 B2 ]
2108 
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2113 
/* Decode the composition rule held in C, a component of an Emacs 20
   style composition sequence, and store the result in RULE.  C is
   first reduced by 0xA0 (the argument itself is modified) and must
   then lie in 0..80; otherwise control jumps to the caller's
   `invalid_code' label.  The reduced value is split into a quotient
   and a remainder by 9, and a value of 4 in either is replaced by
   10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)		\
  do {								\
    int rule_quot, rule_rem;					\
								\
    c -= 0xA0;							\
    if (c < 0 || 80 < c)					\
      goto invalid_code;					\
    rule_quot = c / 9;						\
    rule_rem = c % 9;						\
    if (rule_quot == 4)						\
      rule_quot = 10;						\
    if (rule_rem == 4)						\
      rule_rem = 10;						\
    rule = COMPOSITION_ENCODE_RULE (rule_quot, rule_rem);	\
  } while (0)
2130 
2131 
/* Decode the composition rule made of C and the byte that follows it
   at SRC, a component of an Emacs 21 style composition sequence, and
   store the result in RULE.  Each of the two bytes, reduced by 0x20,
   must lie in 0..80; otherwise control jumps to the caller's
   `invalid_code' label.  The second byte is read with ONE_MORE_BYTE,
   which overwrites C.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)		\
  do {								\
    int rule_first = c - 0x20;					\
    int rule_second;						\
								\
    if (rule_first < 0 || 80 < rule_first)			\
      goto invalid_code;					\
    ONE_MORE_BYTE (c);						\
    rule_second = c - 0x20;					\
    if (rule_second < 0 || 80 < rule_second)			\
      goto invalid_code;					\
    rule = COMPOSITION_ENCODE_RULE (rule_first, rule_second);	\
  } while (0)
2149 
2150 
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES and CHARS, i.e. 0xA0 plus the byte length of this composition
   information and 0xA0 plus the number of characters composed by this
   composition.  Jump to `invalid_code' if either is out of range;
   otherwise initialize CMP_STATUS and emit the composition annotation
   header into CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()				\
  do {									\
    enum composition_method cmp_method = c - 0xF2;			\
    int cmp_nbytes, cmp_nchars;						\
									\
    ONE_MORE_BYTE (c);							\
    if (c < 0)								\
      goto invalid_code;						\
    cmp_nbytes = c - 0xA0;						\
    if (cmp_nbytes < 3)							\
      goto invalid_code;						\
    if (cmp_method == COMPOSITION_RELATIVE && cmp_nbytes != 4)		\
      goto invalid_code;						\
    ONE_MORE_BYTE (c);							\
    cmp_nchars = c - 0xA0;						\
    if (cmp_nchars <= 0							\
	|| MAX_COMPOSITION_COMPONENTS <= cmp_nchars)			\
      goto invalid_code;						\
    cmp_status->old_form = 0;						\
    cmp_status->method = cmp_method;					\
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE		\
			 ? COMPOSING_CHAR				\
			 : COMPOSING_COMPONENT_CHAR);			\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    cmp_status->nchars = cmp_nchars;					\
    cmp_status->ncomps = cmp_nbytes - 4;				\
    ADD_COMPOSITION_DATA (charbuf, cmp_nchars, cmp_nbytes, cmp_method);	\
  } while (0)
2182 
2183 
/* Start of Emacs 20 style format for relative composition.  Reset
   CMP_STATUS for an old-form relative composition and emit an
   annotation header whose NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()		\
  do {								\
    cmp_status->nchars = 0;					\
    cmp_status->ncomps = 0;					\
    cmp_status->length = MAX_ANNOTATION_LENGTH;			\
    cmp_status->state = COMPOSING_CHAR;				\
    cmp_status->method = COMPOSITION_RELATIVE;			\
    cmp_status->old_form = 1;					\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_RELATIVE);	\
  } while (0)
2195 
2196 
/* Start of Emacs 20 style format for rule-base composition.  Reset
   CMP_STATUS for an old-form composition with rules and emit an
   annotation header whose NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()			\
  do {									\
    cmp_status->nchars = 0;						\
    cmp_status->ncomps = 0;						\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    cmp_status->state = COMPOSING_CHAR;					\
    cmp_status->method = COMPOSITION_WITH_RULE;				\
    cmp_status->old_form = 1;						\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_WITH_RULE);	\
  } while (0)
2208 
2209 
/* Handle the byte that follows a composition starter 0x80 and begin
   the matching kind of composition:
     0xF2 + METHOD   Emacs 21 style composition,
     0xA0..0xBF      Emacs 20 style relative composition,
     0xFF            Emacs 20 style rule-base composition.
   Any other byte makes control jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()			\
  do {								\
    const unsigned char *src_after_starter = src;		\
								\
    ONE_MORE_BYTE (c);						\
    if (c < 0)							\
      goto invalid_code;					\
    if (c - 0xF2 >= COMPOSITION_RELATIVE			\
	&& c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)		\
      DECODE_EMACS_MULE_21_COMPOSITION ();			\
    else if (0xA0 <= c && c < 0xC0)				\
      {								\
	DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();		\
	/* C is itself the first composition component, so	\
	   rewind SRC to read it again as such.  */		\
	src = src_after_starter;				\
      }								\
    else if (c == 0xFF)						\
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();		\
    else							\
      goto invalid_code;					\
  } while (0)
2233 
/* Finish a composition that ended normally.  The annotation header
   begins CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of composed characters in the
   header's NCHARS slot; for a new-form composition other than a
   relative one, replace the header's LENGTH slot by NCHARS minus the
   whole length.  In every case leave the composing state.  */

#define EMACS_MULE_COMPOSITION_END()					\
  do {									\
    int *cmp_header = charbuf - cmp_status->length;			\
									\
    if (cmp_status->old_form)						\
      cmp_header[2] = cmp_status->nchars;				\
    else if (cmp_status->method > COMPOSITION_RELATIVE)			\
      cmp_header[0] = cmp_header[2] - cmp_status->length;		\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
2244 
2245 
2246 static int
emacs_mule_finish_composition(int * charbuf,struct composition_status * cmp_status)2247 emacs_mule_finish_composition (int *charbuf,
2248 			       struct composition_status *cmp_status)
2249 {
2250   int idx = - cmp_status->length;
2251   int new_chars;
2252 
2253   if (cmp_status->old_form && cmp_status->nchars > 0)
2254     {
2255       charbuf[idx + 2] = cmp_status->nchars;
2256       new_chars = 0;
2257       if (cmp_status->method == COMPOSITION_WITH_RULE
2258 	  && cmp_status->state == COMPOSING_CHAR)
2259 	{
2260 	  /* The last rule was invalid.  */
2261 	  int rule = charbuf[-1] + 0xA0;
2262 
2263 	  charbuf[-2] = BYTE8_TO_CHAR (rule);
2264 	  charbuf[-1] = -1;
2265 	  new_chars = 1;
2266 	}
2267     }
2268   else
2269     {
2270       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2271 
2272       if (cmp_status->method == COMPOSITION_WITH_RULE)
2273 	{
2274 	  charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2275 	  charbuf[idx++] = -3;
2276 	  charbuf[idx++] = 0;
2277 	  new_chars = 1;
2278 	}
2279       else
2280 	{
2281 	  int nchars = charbuf[idx + 1] + 0xA0;
2282 	  int nbytes = charbuf[idx + 2] + 0xA0;
2283 
2284 	  charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2285 	  charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2286 	  charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2287 	  charbuf[idx++] = -1;
2288 	  new_chars = 4;
2289 	}
2290     }
2291   cmp_status->state = COMPOSING_NO;
2292   return new_chars;
2293 }
2294 
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the number
   of characters that call reports.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()				\
  do {									\
    if (cmp_status->state != COMPOSING_NO)				\
      {									\
	int finished_chars						\
	  = emacs_mule_finish_composition (charbuf, cmp_status);	\
									\
	char_offset += finished_chars;					\
      }									\
  } while (0)
2300 
2301 
/* Decode the emacs-mule text of CODING->source, starting at offset
   CODING->consumed, and append the resulting characters and
   annotations to CODING->charbuf.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  A
   composition that is still open when the source runs out is saved in
   the carryover area of CODING's composition status, unless
   CODING->mode has CODING_MODE_LAST_BLOCK, in which case it is
   finished on the spot.  */

static void
decode_coding_emacs_mule (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce two annotations (charset and composition) in one
     loop and one more charset annotation at the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
      /* We can produce up to 2 characters in a loop.  */
      - 1;
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET the character offset at which that run started.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR, or -1 if there is none.  */
  int byte_after_cr = -1;
  struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;

  if (cmp_status->state != COMPOSING_NO)
    {
      /* A composition was left open by the previous call: put the
	 saved annotation header and components back into CHARBUF.  */
      int i;

      if (charbuf_end - charbuf < cmp_status->length)
	emacs_abort ();
      for (i = 0; i < cmp_status->length; i++)
	*charbuf++ = cmp_status->carryover[i];
      coding->annotated = 1;
    }

  while (1)
    {
      int c;
      int id UNINIT;

      /* Remember where this iteration started; these values are what
	 gets reported as consumed, and what `invalid_code' rewinds
	 to.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
	{
	  /* Out of room.  A byte read ahead after CR has not been
	     processed yet, so do not count it as consumed.  */
	  if (byte_after_cr >= 0)
	    src_base--;
	  break;
	}

      if (byte_after_cr >= 0)
	c = byte_after_cr, byte_after_cr = -1;
      else
	ONE_MORE_BYTE (c);

      /* A negative C is stored as the character -C; 0x80 starts a
	 new composition.  Either one ends a composition in
	 progress.  */
      if (c < 0 || c == 0x80)
	{
	  EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
	  if (c < 0)
	    {
	      *charbuf++ = -c;
	      char_offset++;
	    }
	  else
	    DECODE_EMACS_MULE_COMPOSITION_START ();
	  continue;
	}

      if (c < 0x80)
	{
	  /* With DOS end-of-line conversion, read ahead the byte
	     after a CR; it is processed by the next iteration.  */
	  if (eol_dos && c == '\r')
	    ONE_MORE_BYTE (byte_after_cr);
	  id = charset_ascii;
	  if (cmp_status->state != COMPOSING_NO)
	    {
	      /* A plain ASCII byte ends an old-form composition; in a
		 new-form one it counts as one byte of component
		 data.  */
	      if (cmp_status->old_form)
		EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
	      else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
		cmp_status->ncomps--;
	    }
	}
      else
	{
	  int nchars UNINIT, nbytes UNINIT;
	  /* emacs_mule_char can load a charset map from a file, which
	     allocates a large structure and might cause buffer text
	     to be relocated as result.  Thus, we need to remember the
	     original pointer to buffer text, and fix up all related
	     pointers after the call.  */
	  const unsigned char *orig = coding->source;
	  ptrdiff_t offset;

	  c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
			       cmp_status);
	  offset = coding->source - orig;
	  if (offset)
	    {
	      src += offset;
	      src_base += offset;
	      src_end += offset;
	    }
	  if (c < 0)
	    {
	      /* -1 reports an invalid byte and -2 a source that is
		 too short; any other negative value is an old-style
		 composition component and is handled below.  */
	      if (c == -1)
		goto invalid_code;
	      if (c == -2)
		break;
	    }
	  /* Reposition after the whole sequence parsed by
	     emacs_mule_char, which started over from SRC_BASE.  */
	  src = src_base + nbytes;
	  consumed_chars = consumed_chars_base + nchars;
	  if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
	    cmp_status->ncomps -= nchars;
	}

      /* Now if C >= 0, we found a normally encoded character, if C <
	 0, we found an old-style composition component character or
	 rule.  */

      if (cmp_status->state == COMPOSING_NO)
	{
	  /* Not composing.  When the charset changes, close the
	     previous run with a charset annotation (none is emitted
	     for an ASCII run).  */
	  if (last_id != id)
	    {
	      if (last_id != charset_ascii)
		ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
				  last_id);
	      last_id = id;
	      last_offset = char_offset;
	    }
	  *charbuf++ = c;
	  char_offset++;
	}
      else if (cmp_status->state == COMPOSING_CHAR)
	{
	  if (cmp_status->old_form)
	    {
	      if (c >= 0)
		{
		  /* A normally encoded character ends an old-form
		     composition.  */
		  EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
		  *charbuf++ = c;
		  char_offset++;
		}
	      else
		{
		  /* An MSEQ component: store it and, in a rule-base
		     composition, expect a rule next.  */
		  *charbuf++ = -c;
		  cmp_status->nchars++;
		  cmp_status->length++;
		  if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
		    EMACS_MULE_COMPOSITION_END ();
		  else if (cmp_status->method == COMPOSITION_WITH_RULE)
		    cmp_status->state = COMPOSING_RULE;
		}
	    }
	  else
	    {
	      /* New form: NCHARS counts down the characters that are
		 still to come.  */
	      *charbuf++ = c;
	      cmp_status->length++;
	      cmp_status->nchars--;
	      if (cmp_status->nchars == 0)
		EMACS_MULE_COMPOSITION_END ();
	    }
	}
      else if (cmp_status->state == COMPOSING_RULE)
	{
	  int rule;

	  if (c >= 0)
	    {
	      /* A normally encoded character where a rule was
		 expected ends the composition.  */
	      EMACS_MULE_COMPOSITION_END ();
	      *charbuf++ = c;
	      char_offset++;
	    }
	  else
	    {
	      c = -c;
	      DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
	      if (rule < 0)
		goto invalid_code;
	      /* A rule is stored as the pair [ -2 DECODED-RULE ].  */
	      *charbuf++ = -2;
	      *charbuf++ = rule;
	      cmp_status->length += 2;
	      cmp_status->state = COMPOSING_CHAR;
	    }
	}
      else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
	{
	  /* An alternate (component) character.  NCOMPS is the
	     amount of component data still to come; a negative value
	     means the data overran the announced length.  */
	  *charbuf++ = c;
	  cmp_status->length++;
	  if (cmp_status->ncomps == 0)
	    cmp_status->state = COMPOSING_CHAR;
	  else if (cmp_status->ncomps > 0)
	    {
	      if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
		cmp_status->state = COMPOSING_COMPONENT_RULE;
	    }
	  else
	    EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
	}
      else			/* COMPOSING_COMPONENT_RULE */
	{
	  int rule;

	  DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
	  if (rule < 0)
	    goto invalid_code;
	  *charbuf++ = -2;
	  *charbuf++ = rule;
	  cmp_status->length += 2;
	  cmp_status->ncomps--;
	  if (cmp_status->ncomps > 0)
	    cmp_status->state = COMPOSING_COMPONENT_CHAR;
	  else
	    EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
	}
      continue;

    invalid_code:
      /* Give up on the sequence that started at SRC_BASE: close any
	 composition, then emit just the first byte of the sequence,
	 as itself if it is ASCII and as a raw 8-bit character
	 otherwise.  Decoding resumes with the byte after it.  */
      EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  if (cmp_status->state != COMPOSING_NO)
    {
      if (coding->mode & CODING_MODE_LAST_BLOCK)
	EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
      else
	{
	  /* More source will follow: take the unfinished composition
	     out of CHARBUF and save it for the next call.  */
	  int i;

	  charbuf -= cmp_status->length;
	  for (i = 0; i < cmp_status->length; i++)
	    cmp_status->carryover[i] = charbuf[i];
	}
    }
  /* Close the final run of non-ASCII characters.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only the sequences that were completely processed count as
     consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
2545 
2546 
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset ID.  An ID below 0xA0 is its own single leading
   code, and CODES[1] is then set to 0.  A larger ID is stored in
   CODES[1], preceded in CODES[0] by the leading code that matches the
   ID's range: 0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for
   0xF0..0xF4 and 0x9D for anything above.

   ID is evaluated exactly once.  The definition deliberately does not
   end in a semicolon, so that the macro behaves like a single
   statement (for instance as the body of an `if' that has an
   `else').  */

#define EMACS_MULE_LEADING_CODES(id, codes)			\
  do {								\
    const int mule_leading_id = (id);				\
								\
    if (mule_leading_id < 0xA0)					\
      (codes)[0] = mule_leading_id, (codes)[1] = 0;		\
    else if (mule_leading_id < 0xE0)				\
      (codes)[0] = 0x9A, (codes)[1] = mule_leading_id;		\
    else if (mule_leading_id < 0xF0)				\
      (codes)[0] = 0x9B, (codes)[1] = mule_leading_id;		\
    else if (mule_leading_id < 0xF5)				\
      (codes)[0] = 0x9C, (codes)[1] = mule_leading_id;		\
    else							\
      (codes)[0] = 0x9D, (codes)[1] = mule_leading_id;		\
  } while (0)
2560 
2561 
2562 static bool
encode_coding_emacs_mule(struct coding_system * coding)2563 encode_coding_emacs_mule (struct coding_system *coding)
2564 {
2565   bool multibytep = coding->dst_multibyte;
2566   int *charbuf = coding->charbuf;
2567   int *charbuf_end = charbuf + coding->charbuf_used;
2568   unsigned char *dst = coding->destination + coding->produced;
2569   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2570   int safe_room = 8;
2571   ptrdiff_t produced_chars = 0;
2572   Lisp_Object attrs, charset_list;
2573   int c;
2574   int preferred_charset_id = -1;
2575 
2576   CODING_GET_INFO (coding, attrs, charset_list);
2577   if (! EQ (charset_list, Vemacs_mule_charset_list))
2578     {
2579       charset_list = Vemacs_mule_charset_list;
2580       ASET (attrs, coding_attr_charset_list, charset_list);
2581     }
2582 
2583   while (charbuf < charbuf_end)
2584     {
2585       ASSURE_DESTINATION (safe_room);
2586       c = *charbuf++;
2587 
2588       if (c < 0)
2589 	{
2590 	  /* Handle an annotation.  */
2591 	  switch (*charbuf)
2592 	    {
2593 	    case CODING_ANNOTATE_COMPOSITION_MASK:
2594 	      /* Not yet implemented.  */
2595 	      break;
2596 	    case CODING_ANNOTATE_CHARSET_MASK:
2597 	      preferred_charset_id = charbuf[3];
2598 	      if (preferred_charset_id >= 0
2599 		  && NILP (Fmemq (make_fixnum (preferred_charset_id),
2600 				  charset_list)))
2601 		preferred_charset_id = -1;
2602 	      break;
2603 	    default:
2604 	      emacs_abort ();
2605 	    }
2606 	  charbuf += -c - 1;
2607 	  continue;
2608 	}
2609 
2610       if (ASCII_CHAR_P (c))
2611 	EMIT_ONE_ASCII_BYTE (c);
2612       else if (CHAR_BYTE8_P (c))
2613 	{
2614 	  c = CHAR_TO_BYTE8 (c);
2615 	  EMIT_ONE_BYTE (c);
2616 	}
2617       else
2618 	{
2619 	  struct charset *charset;
2620 	  unsigned code;
2621 	  int dimension;
2622 	  int emacs_mule_id;
2623 	  unsigned char leading_codes[2];
2624 
2625 	  if (preferred_charset_id >= 0)
2626 	    {
2627 	      bool result;
2628 
2629 	      charset = CHARSET_FROM_ID (preferred_charset_id);
2630 	      CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2631 	      if (result)
2632 		code = ENCODE_CHAR (charset, c);
2633 	      else
2634 		CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2635 				     &code, charset);
2636 	    }
2637 	  else
2638 	    CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2639 				 &code, charset);
2640 	  if (! charset)
2641 	    {
2642 	      c = coding->default_char;
2643 	      if (ASCII_CHAR_P (c))
2644 		{
2645 		  EMIT_ONE_ASCII_BYTE (c);
2646 		  continue;
2647 		}
2648 	      CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2649 				   &code, charset);
2650 	    }
2651 	  dimension = CHARSET_DIMENSION (charset);
2652 	  emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2653 	  EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2654 	  EMIT_ONE_BYTE (leading_codes[0]);
2655 	  if (leading_codes[1])
2656 	    EMIT_ONE_BYTE (leading_codes[1]);
2657 	  if (dimension == 1)
2658 	    EMIT_ONE_BYTE (code | 0x80);
2659 	  else
2660 	    {
2661 	      code |= 0x8080;
2662 	      EMIT_ONE_BYTE (code >> 8);
2663 	      EMIT_ONE_BYTE (code & 0xFF);
2664 	    }
2665 	}
2666     }
2667   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2668   coding->produced_char += produced_chars;
2669   coding->produced = dst - coding->destination;
2670   return 0;
2671 }
2672 
2673 
2674 /*** 7. ISO2022 handlers ***/
2675 
2676 /* The following note describes the coding system ISO2022 briefly.
2677    Since the intention of this note is to help understand the
2678    functions in this file, some parts are NOT ACCURATE or are OVERLY
2679    SIMPLIFIED.  For thorough understanding, please refer to the
2680    original document of ISO2022.  This is equivalent to the standard
2681    ECMA-35, obtainable from <URL:https://www.ecma.ch/> (*).
2682 
2683    ISO2022 provides many mechanisms to encode several character sets
2684    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2685    is encoded using bytes less than 128.  This may make the encoded
2686    text a little bit longer, but the text passes more easily through
2687    several types of gateway, some of which strip off the MSB (Most
2688    Significant Bit).
2689 
2690    There are two kinds of character sets: control character sets and
2691    graphic character sets.  The former contain control characters such
2692    as `newline' and `escape' to provide control functions (control
2693    functions are also provided by escape sequences).  The latter
2694    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2695    two control character sets and many graphic character sets.
2696 
2697    Graphic character sets are classified into one of the following
2698    four classes, according to the number of bytes (DIMENSION) and
2699    number of characters in one dimension (CHARS) of the set:
2700    - DIMENSION1_CHARS94
2701    - DIMENSION1_CHARS96
2702    - DIMENSION2_CHARS94
2703    - DIMENSION2_CHARS96
2704 
2705    In addition, each character set is assigned an identification tag,
2706    unique for each set, called the "final character" (denoted as <F>
2707    hereafter).  The <F> of each character set is decided by ECMA(*)
2708    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2709    (0x30..0x3F are for private use only).
2710 
2711    Note (*): ECMA = European Computer Manufacturers Association
2712 
2713    Here are examples of graphic character sets [NAME(<F>)]:
2714 	o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2715 	o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2716 	o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2717 	o DIMENSION2_CHARS96 -- none for the moment
2718 
2719    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2720 	C0 [0x00..0x1F] -- control character plane 0
2721 	GL [0x20..0x7F] -- graphic character plane 0
2722 	C1 [0x80..0x9F] -- control character plane 1
2723 	GR [0xA0..0xFF] -- graphic character plane 1
2724 
2725    A control character set is directly designated and invoked to C0 or
2726    C1 by an escape sequence.  The most common case is that:
2727    - ISO646's  control character set is designated/invoked to C0, and
2728    - ISO6429's control character set is designated/invoked to C1,
2729    and usually these designations/invocations are omitted in encoded
2730    text.  In a 7-bit environment, only C0 can be used, and a control
2731    character for C1 is encoded by an appropriate escape sequence to
2732    fit into the environment.  All control characters for C1 are
2733    defined to have corresponding escape sequences.
2734 
2735    A graphic character set is at first designated to one of four
2736    graphic registers (G0 through G3), then these graphic registers are
2737    invoked to GL or GR.  These designations and invocations can be
2738    done independently.  The most common case is that G0 is invoked to
2739    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2740    these invocations and designations are omitted in encoded text.
2741    In a 7-bit environment, only GL can be used.
2742 
2743    When a graphic character set of CHARS94 is invoked to GL, codes
2744    0x20 and 0x7F of the GL area work as control characters SPACE and
2745    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2746    be used.
2747 
2748    There are two ways of invocation: locking-shift and single-shift.
2749    With locking-shift, the invocation lasts until the next different
2750    invocation, whereas with single-shift, the invocation affects the
2751    following character only and doesn't affect the locking-shift
2752    state.  Invocations are done by the following control characters or
2753    escape sequences:
2754 
2755    ----------------------------------------------------------------------
2756    abbrev  function	             cntrl escape seq	description
2757    ----------------------------------------------------------------------
2758    SI/LS0  (shift-in)		     0x0F  none		invoke G0 into GL
2759    SO/LS1  (shift-out)		     0x0E  none		invoke G1 into GL
2760    LS2     (locking-shift-2)	     none  ESC 'n'	invoke G2 into GL
2761    LS3     (locking-shift-3)	     none  ESC 'o'	invoke G3 into GL
2762    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2763    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2764    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2765    SS2     (single-shift-2)	     0x8E  ESC 'N'	invoke G2 for one char
2766    SS3     (single-shift-3)	     0x8F  ESC 'O'	invoke G3 for one char
2767    ----------------------------------------------------------------------
2768    (*) These are not used by any known coding system.
2769 
2770    Control characters for these functions are defined by macros
2771    ISO_CODE_XXX in `coding.h'.
2772 
2773    Designations are done by the following escape sequences:
2774    ----------------------------------------------------------------------
2775    escape sequence	description
2776    ----------------------------------------------------------------------
2777    ESC '(' <F>		designate DIMENSION1_CHARS94<F> to G0
2778    ESC ')' <F>		designate DIMENSION1_CHARS94<F> to G1
2779    ESC '*' <F>		designate DIMENSION1_CHARS94<F> to G2
2780    ESC '+' <F>		designate DIMENSION1_CHARS94<F> to G3
2781    ESC ',' <F>		designate DIMENSION1_CHARS96<F> to G0 (*)
2782    ESC '-' <F>		designate DIMENSION1_CHARS96<F> to G1
2783    ESC '.' <F>		designate DIMENSION1_CHARS96<F> to G2
2784    ESC '/' <F>		designate DIMENSION1_CHARS96<F> to G3
2785    ESC '$' '(' <F>	designate DIMENSION2_CHARS94<F> to G0 (**)
2786    ESC '$' ')' <F>	designate DIMENSION2_CHARS94<F> to G1
2787    ESC '$' '*' <F>	designate DIMENSION2_CHARS94<F> to G2
2788    ESC '$' '+' <F>	designate DIMENSION2_CHARS94<F> to G3
2789    ESC '$' ',' <F>	designate DIMENSION2_CHARS96<F> to G0 (*)
2790    ESC '$' '-' <F>	designate DIMENSION2_CHARS96<F> to G1
2791    ESC '$' '.' <F>	designate DIMENSION2_CHARS96<F> to G2
2792    ESC '$' '/' <F>	designate DIMENSION2_CHARS96<F> to G3
2793    ----------------------------------------------------------------------
2794 
2795    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2796    of dimension 1, chars 94, and final character <F>, etc...
2797 
2798    Note (*): Although these designations are not allowed in ISO2022,
2799    Emacs accepts them on decoding, and produces them on encoding
2800    CHARS96 character sets in a coding system which is characterized as
2801    7-bit environment, non-locking-shift, and non-single-shift.
2802 
2803    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2804    '(' must be omitted.  We refer to this as "short-form" hereafter.
2805 
2806    Now you may notice that there are a lot of ways of encoding the
2807    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2810    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2811    localized platforms), and all of these are variants of ISO2022.
2812 
2813    In addition to the above, Emacs handles two more kinds of escape
2814    sequences: ISO6429's direction specification and Emacs' private
2815    sequence for specifying character composition.
2816 
2817    ISO6429's direction specification takes the following form:
2818 	o CSI ']'      -- end of the current direction
2819 	o CSI '0' ']'  -- end of the current direction
2820 	o CSI '1' ']'  -- start of left-to-right text
2821 	o CSI '2' ']'  -- start of right-to-left text
2822    The control character CSI (0x9B: control sequence introducer) is
2823    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2824 
2825    Character composition specification takes the following form:
2826 	o ESC '0' -- start relative composition
2827 	o ESC '1' -- end composition
2828 	o ESC '2' -- start rule-base composition (*)
2829 	o ESC '3' -- start relative composition with alternate chars  (**)
2830 	o ESC '4' -- start rule-base composition with alternate chars  (**)
2831   Since these are not standard escape sequences of any ISO standard,
2832   the use of them with these meanings is restricted to Emacs only.
2833 
2834   (*) This form is used only in Emacs 20.7 and older versions,
2835   but newer versions can safely decode it.
2836   (**) This form is used only in Emacs 21.1 and newer versions,
2837   and older versions can't decode it.
2838 
2839   Here's a list of example usages of these composition escape
2840   sequences (categorized by `enum composition_method').
2841 
2842   COMPOSITION_RELATIVE:
2843 	ESC 0 CHAR [ CHAR ] ESC 1
2844   COMPOSITION_WITH_RULE:
2845 	ESC 2 CHAR [ RULE CHAR ] ESC 1
2846   COMPOSITION_WITH_ALTCHARS:
2847 	ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2848   COMPOSITION_WITH_RULE_ALTCHARS:
2849 	ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2850 
/* Table that classifies every byte value (0..255) into an ISO 2022
   code class.  decode_coding_iso_2022 dispatches on
   iso_code_class[BYTE] for each byte it handles.  */
static enum iso_code_class_type iso_code_class[256];
2852 
/* Return true if the charset whose ID is ID may be used with CODING,
   i.e. ID is a valid index into CODING->safe_charsets and the entry
   there is not 255 (the value setup_iso_safe_charsets leaves for
   charsets that are not in the coding system's charset list).

   A negative ID (the "no such charset" value stored in
   iso_charset_table) is never safe; checking it here keeps callers
   that forgot to test for it from reading before the start of the
   table.

   Note that ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)	\
  (0 <= (id)				\
   && (id) <= (coding)->max_charset_id	\
   && (coding)->safe_charsets[id] != 255)
2856 
2857 static void
setup_iso_safe_charsets(Lisp_Object attrs)2858 setup_iso_safe_charsets (Lisp_Object attrs)
2859 {
2860   Lisp_Object charset_list, safe_charsets;
2861   Lisp_Object request;
2862   Lisp_Object reg_usage;
2863   Lisp_Object tail;
2864   EMACS_INT reg94, reg96;
2865   int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
2866   int max_charset_id;
2867 
2868   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2869   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2870       && ! EQ (charset_list, Viso_2022_charset_list))
2871     {
2872       charset_list = Viso_2022_charset_list;
2873       ASET (attrs, coding_attr_charset_list, charset_list);
2874       ASET (attrs, coding_attr_safe_charsets, Qnil);
2875     }
2876 
2877   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2878     return;
2879 
2880   max_charset_id = 0;
2881   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2882     {
2883       int id = XFIXNUM (XCAR (tail));
2884       if (max_charset_id < id)
2885 	max_charset_id = id;
2886     }
2887 
2888   safe_charsets = make_uninit_string (max_charset_id + 1);
2889   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2890   request = AREF (attrs, coding_attr_iso_request);
2891   reg_usage = AREF (attrs, coding_attr_iso_usage);
2892   reg94 = XFIXNUM (XCAR (reg_usage));
2893   reg96 = XFIXNUM (XCDR (reg_usage));
2894 
2895   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2896     {
2897       Lisp_Object id;
2898       Lisp_Object reg;
2899       struct charset *charset;
2900 
2901       id = XCAR (tail);
2902       charset = CHARSET_FROM_ID (XFIXNUM (id));
2903       reg = Fcdr (Fassq (id, request));
2904       if (! NILP (reg))
2905 	SSET (safe_charsets, XFIXNUM (id), XFIXNUM (reg));
2906       else if (charset->iso_chars_96)
2907 	{
2908 	  if (reg96 < 4)
2909 	    SSET (safe_charsets, XFIXNUM (id), reg96);
2910 	}
2911       else
2912 	{
2913 	  if (reg94 < 4)
2914 	    SSET (safe_charsets, XFIXNUM (id), reg94);
2915 	}
2916     }
2917   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2918 }
2919 
2920 
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of ISO-2022 based coding
   systems.

   Scan the source of CODING byte by byte and accumulate two masks of
   ISO category bits: `found' (categories the text gives evidence
   for) and `rejected' (categories the text rules out).

   If control reaches the label `no_more_source' (which is not
   jumped to explicitly below; presumably ONE_MORE_BYTE, defined
   elsewhere, goes there when the source is exhausted), add
   `rejected' to DETECT_INFO->rejected, add the found categories that
   were not rejected to DETECT_INFO->found, and return true.

   If `rejected' becomes equal to CATEGORY_MASK_ISO before that, add
   CATEGORY_MASK_ISO to DETECT_INFO->rejected and return false.

   In both cases CATEGORY_MASK_ISO is added to DETECT_INFO->checked.  */

static bool
detect_coding_iso_2022 (struct coding_system *coding,
			struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base = src;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Not referenced directly below; presumably used by ONE_MORE_BYTE.  */
  bool multibytep = coding->src_multibyte;
  /* True right after a single-shift code (ESC N, ESC O, or SS2/SS3
     when a single-shift category accepts them).  */
  bool single_shifting = 0;
  int id;
  int c, c1;
  /* Not referenced directly below; presumably used by ONE_MORE_BYTE.  */
  ptrdiff_t consumed_chars = 0;
  int i;
  /* Masks of the ISO categories ruled out / supported so far.  */
  int rejected = 0;
  int found = 0;
  /* -1 while outside of a composition; otherwise a running count that
     is started at a composition start sequence and compared with
     MAX_COMPOSITION_COMPONENTS at the end sequence.  */
  int composition_count = -1;

  detect_info->checked |= CATEGORY_MASK_ISO;

  /* Cache the safe-charsets table of every defined ISO category so
     that SAFE_CHARSET_P can be applied to it below.  */
  for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
    {
      struct coding_system *this = &(coding_categories[i]);
      Lisp_Object attrs, val;

      /* Skip a category that has no coding system assigned.  */
      if (this->id < 0)
	continue;
      attrs = CODING_ID_ATTRS (this->id);
      if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
	  && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
	setup_iso_safe_charsets (attrs);
      val = CODING_ATTR_SAFE_CHARSETS (attrs);
      this->max_charset_id = SCHARS (val) - 1;
      this->safe_charsets = SDATA (val);
    }

  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Keep scanning until every ISO category has been rejected.  */
  while (rejected != CATEGORY_MASK_ISO)
    {
      src_base = src;
      ONE_MORE_BYTE (c);
      switch (c)
	{
	case ISO_CODE_ESC:
	  if (inhibit_iso_escape_detection)
	    break;
	  single_shifting = 0;
	  ONE_MORE_BYTE (c);
	  if (c == 'N' || c == 'O')
	    {
	      /* ESC <Fe> for SS2 or SS3.  */
	      single_shifting = 1;
	      rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
	    }
	  else if (c == '1')
	    {
	      /* End of composition.  */
	      if (composition_count < 0
		  || composition_count > MAX_COMPOSITION_COMPONENTS)
		/* Invalid */
		break;
	      composition_count = -1;
	      found |= CATEGORY_MASK_ISO;
	    }
	  else if (c >= '0' && c <= '4')
	    {
	      /* ESC <Fp> for start of composition ('0', '2', '3', or
		 '4'; the end sequence '1' was handled above).  */
	      composition_count = 0;
	    }
	  else
	    {
	      if (c >= '(' && c <= '/')
		{
		  /* Designation sequence for a charset of dimension 1.  */
		  ONE_MORE_BYTE (c1);
		  if (c1 < ' ' || c1 >= 0x80
		      || (id = iso_charset_table[0][c >= ','][c1]) < 0)
		    {
		      /* Invalid designation sequence.  Just ignore.  */
		      if (c1 >= 0x80)
			rejected |= (CATEGORY_MASK_ISO_7BIT
				     | CATEGORY_MASK_ISO_7_ELSE);
		      break;
		    }
		}
	      else if (c == '$')
		{
		  /* Designation sequence for a charset of dimension 2.  */
		  ONE_MORE_BYTE (c);
		  if (c >= '@' && c <= 'B')
		    /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
		    /* NOTE(review): unlike the long forms below, ID is
		       not checked for being negative here before it
		       is passed to SAFE_CHARSET_P.  */
		    id = iso_charset_table[1][0][c];
		  else if (c >= '(' && c <= '/')
		    {
		      ONE_MORE_BYTE (c1);
		      if (c1 < ' ' || c1 >= 0x80
			  || (id = iso_charset_table[1][c >= ','][c1]) < 0)
			{
			  /* Invalid designation sequence.  Just ignore.  */
			  if (c1 >= 0x80)
			    rejected |= (CATEGORY_MASK_ISO_7BIT
					 | CATEGORY_MASK_ISO_7_ELSE);
			  break;
			}
		    }
		  else
		    {
		      /* Invalid designation sequence.  Just ignore it.  */
		      if (c >= 0x80)
			rejected |= (CATEGORY_MASK_ISO_7BIT
				     | CATEGORY_MASK_ISO_7_ELSE);
		      break;
		    }
		}
	      else
		{
		  /* Invalid escape sequence.  Just ignore it.  */
		  if (c >= 0x80)
		    rejected |= (CATEGORY_MASK_ISO_7BIT
				 | CATEGORY_MASK_ISO_7_ELSE);
		  break;
		}

	      /* We found a valid designation sequence for CHARSET.  */
	      /* The categories in CATEGORY_MASK_ISO_8BIT are rejected;
		 each of the four categories tested below is marked
		 found or rejected according to whether ID is one of
		 its safe charsets.  */
	      rejected |= CATEGORY_MASK_ISO_8BIT;
	      if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
				  id))
		found |= CATEGORY_MASK_ISO_7;
	      else
		rejected |= CATEGORY_MASK_ISO_7;
	      if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
				  id))
		found |= CATEGORY_MASK_ISO_7_TIGHT;
	      else
		rejected |= CATEGORY_MASK_ISO_7_TIGHT;
	      if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
				  id))
		found |= CATEGORY_MASK_ISO_7_ELSE;
	      else
		rejected |= CATEGORY_MASK_ISO_7_ELSE;
	      if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
				  id))
		found |= CATEGORY_MASK_ISO_8_ELSE;
	      else
		rejected |= CATEGORY_MASK_ISO_8_ELSE;
	    }
	  break;

	case ISO_CODE_SO:
	case ISO_CODE_SI:
	  /* Locking shift out/in.  */
	  if (inhibit_iso_escape_detection)
	    break;
	  single_shifting = 0;
	  rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
	  break;

	case ISO_CODE_CSI:
	  /* Control sequence introducer.  */
	  single_shifting = 0;
	  rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
	  found |= CATEGORY_MASK_ISO_8_ELSE;
	  goto check_extra_latin;

	case ISO_CODE_SS2:
	case ISO_CODE_SS3:
	  /* Single shift.   */
	  if (inhibit_iso_escape_detection)
	    break;
	  single_shifting = 0;
	  rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
	  if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
	      & CODING_ISO_FLAG_SINGLE_SHIFT)
	    {
	      found |= CATEGORY_MASK_ISO_8_1;
	      single_shifting = 1;
	    }
	  if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
	      & CODING_ISO_FLAG_SINGLE_SHIFT)
	    {
	      found |= CATEGORY_MASK_ISO_8_2;
	      single_shifting = 1;
	    }
	  if (single_shifting)
	    break;
	  /* Neither 8-bit category uses single shift, so treat the
	     byte like any other code in 0x80..0x9F.  */
	  goto check_extra_latin;

	default:
	  if (c < 0)
	    continue;
	  if (c < 0x80)
	    {
	      if (composition_count >= 0)
		composition_count++;
	      single_shifting = 0;
	      break;
	    }
	  /* An 8-bit code rules out the 7-bit categories.  */
	  rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
	  if (c >= 0xA0)
	    {
	      found |= CATEGORY_MASK_ISO_8_1;
	      /* Check the length of succeeding codes of the range
                 0xA0..0xFF.  If the byte length is even, we include
                 CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
                 only when we are not single shifting.  */
	      if (! single_shifting
		  && ! (rejected & CATEGORY_MASK_ISO_8_2))
		{
		  ptrdiff_t len = 1;
		  while (src < src_end)
		    {
		      src_base = src;
		      ONE_MORE_BYTE (c);
		      if (c < 0xA0)
			{
			  /* Not part of the run; leave it for the
			     outer loop to read again.  */
			  src = src_base;
			  break;
			}
		      len++;
		    }

		  /* An odd-length run that stopped before the end of
		     the source can't consist of two-byte codes.  */
		  if (len & 1 && src < src_end)
		    {
		      rejected |= CATEGORY_MASK_ISO_8_2;
		      if (composition_count >= 0)
			composition_count += len;
		    }
		  else
		    {
		      found |= CATEGORY_MASK_ISO_8_2;
		      if (composition_count >= 0)
			composition_count += len / 2;
		    }
		}
	      break;
	    }
	  /* Here C is in 0x80..0x9F, or is a CSI, SS2 or SS3 that
	     jumped here from above.  Such a code is acceptable only
	     if Vlatin_extra_code_table is a vector with a non-nil
	     element for it; otherwise all ISO categories are
	     rejected.  */
	check_extra_latin:
	  if (! VECTORP (Vlatin_extra_code_table)
	      || NILP (AREF (Vlatin_extra_code_table, c)))
	    {
	      rejected = CATEGORY_MASK_ISO;
	      break;
	    }
	  if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
	      & CODING_ISO_FLAG_LATIN_EXTRA)
	    found |= CATEGORY_MASK_ISO_8_1;
	  else
	    rejected |= CATEGORY_MASK_ISO_8_1;
	  rejected |= CATEGORY_MASK_ISO_8_2;
	  break;
	}
    }
  /* Every ISO category was rejected before the end of the source.  */
  detect_info->rejected |= CATEGORY_MASK_ISO;
  return 0;

  /* The scan ended with at least one ISO category not rejected.  */
 no_more_source:
  detect_info->rejected |= rejected;
  detect_info->found |= (found & ~rejected);
  return 1;
}
3185 
3186 
/* Record in CODING that the charset selected by DIM, CHARS_96 and the
   final byte FINAL is designated to graphic register REG.

   If FINAL is out of range, selects no charset, or selects a charset
   that is not safe for CODING, the designation of REG is set to -2
   instead and CHARS_96 is set to -1, which tells the caller that the
   escape sequence should be kept.  CHARS_96 is also set to -1 when
   ASCII gets designated to a register whose previous designation was
   such an invalid one.

   JISX0201-Roman is replaced by ASCII under CODING_ISO_FLAG_USE_ROMAN
   and JISX0208.1978 by JISX0208 under CODING_ISO_FLAG_USE_OLDJIS.

   CHARS_96 must be an lvalue.  The macro uses the variable `coding'
   of the expansion site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)			\
  do {									\
    int designated_ = -1;						\
    int previous_;							\
									\
    /* Consult the table only for a final byte that is in range.  */	\
    if (final >= '0' && final < 128)					\
      designated_ = ISO_CHARSET_TABLE (dim, chars_96, final);		\
    if (designated_ < 0 || ! SAFE_CHARSET_P (coding, designated_))	\
      {									\
	CODING_ISO_DESIGNATION (coding, reg) = -2;			\
	chars_96 = -1;							\
	break;								\
      }									\
									\
    previous_ = CODING_ISO_DESIGNATION (coding, reg);			\
    if (designated_ == charset_jisx0201_roman)				\
      designated_							\
	= ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)	\
	   ? charset_ascii : designated_);				\
    else if (designated_ == charset_jisx0208_1978)			\
      designated_							\
	= ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)	\
	   ? charset_jisx0208 : designated_);				\
    CODING_ISO_DESIGNATION (coding, reg) = designated_;			\
									\
    /* Designating ASCII to a register that held an invalid		\
       designation: the sequence itself must be kept.  */		\
    if (designated_ == charset_ascii && previous_ == -2)		\
      chars_96 = -1;							\
  } while (0)
3219 
3220 
/* Handle these composition sequences (ALT: alternate char):
3222 
3223    (1) relative composition: ESC 0 CHAR ... ESC 1
3224    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3225    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3226    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3227 
3228    When the start sequence (ESC 0/2/3/4) is found, this annotation
3229    header is produced.
3230 
3231 	[ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3232 
3233    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3234    produced until the end sequence (ESC 1) is found:
3235 
3236    (1) CHAR ... CHAR
3237    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3238    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3239    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3240 
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3243 
3244    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3245    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3246    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3247    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3248 
3249    If an error is found while composing, the annotation header is
3250    changed to:
3251 
	[ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3253 
3254    and the sequence [ -2 DECODED-RULE ] is changed to the original
3255    byte sequence as below:
3256 	o the original byte sequence is B: [ B -1 ]
3257 	o the original byte sequence is B1 B2: [ B1 B2 ]
3258    and the sequence [ -1 -1 ] is changed to the original byte
3259    sequence:
3260 	[ ESC '0' ]
3261 */
3262 
/* Decode the composition rule that starts with the byte in C1 and
   store the result in RULE (an int lvalue).

   C1 - 32 below 81 is the old one-byte format: it packs a global and
   a new reference point in base 9, where the value 4 stands for
   reference point 10.  Otherwise the rule is in the new two-byte
   format: one more byte is read with ONE_MORE_BYTE, the pair is
   validated, and 0x100 is added to the encoded rule to tell it apart
   from an old-format one.

   Jump to `invalid_code' if C1 is below 32 or the two-byte rule is
   not valid.  Uses the variable `c1' of the expansion site.  */

#define DECODE_COMPOSITION_RULE(rule)					\
  do {									\
    rule = c1 - 32;							\
    if (rule < 0)							\
      goto invalid_code;						\
    if (rule >= 81)		/* new format (after ver.21) */		\
      {									\
	int second_;							\
									\
	ONE_MORE_BYTE (second_);					\
	if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second_ - 32))	\
	  goto invalid_code;						\
	/* 0x100 distinguishes it from the old format.  */		\
	rule = 0x100 + COMPOSITION_ENCODE_RULE (rule - 81, second_ - 32); \
      }									\
    else			/* old format (before ver.21) */	\
      {									\
	int gref = (rule / 9 == 4) ? 10 : rule / 9;			\
	int nref = (rule % 9 == 4) ? 10 : rule % 9;			\
									\
	rule = COMPOSITION_ENCODE_RULE (gref, nref);			\
      }									\
  } while (0)
3291 
/* Turn a decoded composition rule back into the codes it was decoded
   from.  The two charbuf slots starting at charbuf[idx] are
   overwritten and the number of codes produced is added to new_chars:

     RULE below 0x100 (old one-byte format):  [ B -1 ],   one code
     otherwise (new two-byte format):         [ B1 B2 ],  two codes

   The macro uses the variables `charbuf', `idx' and `new_chars' of
   the expansion site.  RULE may be `charbuf[idx + 1]' itself: it is
   read completely before any slot is overwritten.  */
#define ENCODE_COMPOSITION_RULE(rule)				\
  do {								\
    int encoded_ = (rule);					\
    int packed_ = encoded_ % 0x100;				\
    int gref = packed_ / 12;					\
    int nref = packed_ % 12;					\
								\
    if (encoded_ >= 0x100)	/* new format */		\
      {								\
	charbuf[idx] = 32 + 81 + gref;				\
	charbuf[idx + 1] = 32 + nref;				\
	new_chars += 2;						\
      }								\
    else			/* old format */		\
      {								\
	/* Reference point 10 is written as 4 in base 9.  */	\
	int old_gref_ = (gref == 10) ? 4 : gref;		\
	int old_nref_ = (nref == 10) ? 4 : nref;		\
								\
	charbuf[idx] = 32 + old_gref_ * 9 + old_nref_;		\
	charbuf[idx + 1] = -1;					\
	new_chars++;						\
      }								\
  } while (0)
3311 
3312 /* Finish the current composition as invalid.  */
3313 
3314 static int
finish_composition(int * charbuf,struct composition_status * cmp_status)3315 finish_composition (int *charbuf, struct composition_status *cmp_status)
3316 {
3317   int idx = - cmp_status->length;
3318   int new_chars;
3319 
3320   /* Recover the original ESC sequence */
3321   charbuf[idx++] = ISO_CODE_ESC;
3322   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3323 		    : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3324 		    : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3325 		    /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3326 		    : '4');
3327   charbuf[idx++] = -2;
3328   charbuf[idx++] = 0;
3329   charbuf[idx++] = -1;
3330   new_chars = cmp_status->nchars;
3331   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3332     for (; idx < 0; idx++)
3333       {
3334 	int elt = charbuf[idx];
3335 
3336 	if (elt == -2)
3337 	  {
3338 	    ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3339 	    idx++;
3340 	  }
3341 	else if (elt == -1)
3342 	  {
3343 	    charbuf[idx++] = ISO_CODE_ESC;
3344 	    charbuf[idx] = '0';
3345 	    new_chars += 2;
3346 	  }
3347       }
3348   cmp_status->state = COMPOSING_NO;
3349   return new_chars;
3350 }
3351 
/* Do nothing unless a composition is in progress, i.e.
   cmp_status->state is not COMPOSING_NO.  Otherwise finish that
   composition with finish_composition and add the count it returns
   to char_offset.  Uses the variables `cmp_status', `charbuf' and
   `char_offset' of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()				\
  do {								\
    if (cmp_status->state == COMPOSING_NO)			\
      break;							\
    char_offset += finish_composition (charbuf, cmp_status);	\
  } while (0)
3358 
/* Handle the composition start sequence ESC C1, where C1 is '0',
   '2', '3', or '4':

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only ends that part: [ -1 -1 ] is
   stored and the composed chars follow.

   In every other case any composition in progress is finished as
   invalid and a new one is started: ADD_COMPOSITION_DATA produces
   the annotation header (with zero counts and the new method), and
   cmp_status->length restarts at MAX_ANNOTATION_LENGTH.

   Uses the variables `cmp_status', `charbuf' and `coding' of the
   expansion site.  */

#define DECODE_COMPOSITION_START(c1)					   \
  do {									   \
    int ends_altchars_ = 0;						   \
									   \
    if (c1 == '0')							   \
      {									   \
	if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)		   \
	  ends_altchars_ = cmp_status->state == COMPOSING_COMPONENT_CHAR;  \
	else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)	   \
	  ends_altchars_ = cmp_status->state == COMPOSING_COMPONENT_RULE;  \
      }									   \
									   \
    if (ends_altchars_)							   \
      {									   \
	charbuf[0] = charbuf[1] = -1;					   \
	charbuf += 2;							   \
	cmp_status->length += 2;					   \
	cmp_status->state = COMPOSING_CHAR;				   \
      }									   \
    else								   \
      {									   \
	MAYBE_FINISH_COMPOSITION ();					   \
	switch (c1)							   \
	  {								   \
	  case '0':							   \
	    cmp_status->method = COMPOSITION_RELATIVE;			   \
	    break;							   \
	  case '2':							   \
	    cmp_status->method = COMPOSITION_WITH_RULE;			   \
	    break;							   \
	  case '3':							   \
	    cmp_status->method = COMPOSITION_WITH_ALTCHARS;		   \
	    break;							   \
	  default:							   \
	    cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;	   \
	    break;							   \
	  }								   \
	/* ESC 3 and ESC 4 begin with the alternate chars.  */		   \
	if (c1 <= '2')							   \
	  cmp_status->state = COMPOSING_CHAR;				   \
	else								   \
	  cmp_status->state = COMPOSING_COMPONENT_CHAR;			   \
	ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);	   \
	cmp_status->length = MAX_ANNOTATION_LENGTH;			   \
	cmp_status->nchars = 0;						   \
	cmp_status->ncomps = 0;						   \
	coding->annotated = 1;						   \
      }									   \
  } while (0)
3399 
3400 
/* Handle the composition end sequence ESC 1.

   The sequence is invalid if no composed char has been stored, or if
   the decoder state does not fit the method: a rulebase composition
   must not end while a char is expected, and any other composition
   must end while a char is expected.  In that case finish the
   composition as invalid and jump to `invalid_code'.

   Otherwise complete the annotation header, which starts
   cmp_status->length slots before charbuf: slot 0 holds -LENGTH and
   is lowered by the number of slots the alternate chars took
   (ncomps + 2 for altchar, ncomps * 3 for alt&rule compositions),
   and slot 2 receives the number of composed chars.  That number is
   also added to char_offset, and the state becomes COMPOSING_NO.

   Uses the variables `cmp_status', `charbuf' and `char_offset' of
   the expansion site.  */

#define DECODE_COMPOSITION_END()					\
  do {									\
    int expecting_char_ = cmp_status->state == COMPOSING_CHAR;		\
    int rule_based_ = cmp_status->method == COMPOSITION_WITH_RULE;	\
    int extra_slots_;							\
									\
    if (cmp_status->nchars == 0 || expecting_char_ == rule_based_)	\
      {									\
	MAYBE_FINISH_COMPOSITION ();					\
	goto invalid_code;						\
      }									\
    extra_slots_							\
      = (cmp_status->method == COMPOSITION_WITH_ALTCHARS		\
	 ? cmp_status->ncomps + 2					\
	 : cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS		\
	 ? cmp_status->ncomps * 3					\
	 : 0);								\
    charbuf[- cmp_status->length] -= extra_slots_;			\
    charbuf[2 - cmp_status->length] = cmp_status->nchars;		\
    char_offset += cmp_status->nchars;					\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
3420 
/* Append the pair [ -2 RULE ] to charbuf for the composition rule
   RULE, add the two slots to cmp_status->length, and step
   cmp_status->state back by one.  Uses the variables `charbuf' and
   `cmp_status' of the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)	\
  do {					\
    charbuf[0] = -2;			\
    charbuf[1] = rule;			\
    charbuf += 2;			\
    cmp_status->length += 2;		\
    --cmp_status->state;		\
  } while (0)
3430 
/* Append C, a composed char or a component (alternate) char, to
   charbuf and account for it in cmp_status: the length grows by one,
   and C is counted in nchars when the state is COMPOSING_CHAR and in
   ncomps otherwise.

   The state then advances by one if the method is
   COMPOSITION_WITH_RULE, or if it is COMPOSITION_WITH_RULE_ALTCHARS
   and a component char was just stored.  Uses the variables
   `charbuf' and `cmp_status' of the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)					\
  do {									\
    *charbuf = (c);							\
    charbuf++;								\
    cmp_status->length++;						\
    if (cmp_status->state != COMPOSING_CHAR)				\
      cmp_status->ncomps++;						\
    else								\
      cmp_status->nchars++;						\
    if (cmp_status->method == COMPOSITION_WITH_RULE			\
	|| (cmp_status->state == COMPOSING_COMPONENT_CHAR		\
	    && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))	\
      cmp_status->state++;						\
  } while (0)
3447 
3448 
3449 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3450 
3451 static void
decode_coding_iso_2022(struct coding_system * coding)3452 decode_coding_iso_2022 (struct coding_system *coding)
3453 {
3454   const unsigned char *src = coding->source + coding->consumed;
3455   const unsigned char *src_end = coding->source + coding->src_bytes;
3456   const unsigned char *src_base;
3457   int *charbuf = coding->charbuf + coding->charbuf_used;
3458   /* We may produce two annotations (charset and composition) in one
3459      loop and one more charset annotation at the end.  */
3460   int *charbuf_end
3461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3462   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3463   bool multibytep = coding->src_multibyte;
3464   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3465   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3466   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3467   int charset_id_2, charset_id_3;
3468   struct charset *charset;
3469   int c;
3470   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3471   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3472   ptrdiff_t char_offset = coding->produced_char;
3473   ptrdiff_t last_offset = char_offset;
3474   int last_id = charset_ascii;
3475   bool eol_dos
3476     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3477   int byte_after_cr = -1;
3478   int i;
3479 
3480   setup_iso_safe_charsets (attrs);
3481   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3482 
3483   if (cmp_status->state != COMPOSING_NO)
3484     {
3485       if (charbuf_end - charbuf < cmp_status->length)
3486 	emacs_abort ();
3487       for (i = 0; i < cmp_status->length; i++)
3488 	*charbuf++ = cmp_status->carryover[i];
3489       coding->annotated = 1;
3490     }
3491 
3492   while (1)
3493     {
3494       int c1, c2, c3;
3495 
3496       src_base = src;
3497       consumed_chars_base = consumed_chars;
3498 
3499       if (charbuf >= charbuf_end)
3500 	{
3501 	  if (byte_after_cr >= 0)
3502 	    src_base--;
3503 	  break;
3504 	}
3505 
3506       if (byte_after_cr >= 0)
3507 	c1 = byte_after_cr, byte_after_cr = -1;
3508       else
3509 	ONE_MORE_BYTE (c1);
3510       if (c1 < 0)
3511 	goto invalid_code;
3512 
3513       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3514 	{
3515 	  *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3516 	  char_offset++;
3517 	  CODING_ISO_EXTSEGMENT_LEN (coding)--;
3518 	  continue;
3519 	}
3520 
3521       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3522 	{
3523 	  if (c1 == ISO_CODE_ESC)
3524 	    {
3525 	      if (src + 1 >= src_end)
3526 		goto no_more_source;
3527 	      *charbuf++ = ISO_CODE_ESC;
3528 	      char_offset++;
3529 	      if (src[0] == '%' && src[1] == '@')
3530 		{
3531 		  src += 2;
3532 		  consumed_chars += 2;
3533 		  char_offset += 2;
3534 		  /* We are sure charbuf can contain two more chars. */
3535 		  *charbuf++ = '%';
3536 		  *charbuf++ = '@';
3537 		  CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3538 		}
3539 	    }
3540 	  else
3541 	    {
3542 	      *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3543 	      char_offset++;
3544 	    }
3545 	  continue;
3546 	}
3547 
3548       if ((cmp_status->state == COMPOSING_RULE
3549 	   || cmp_status->state == COMPOSING_COMPONENT_RULE)
3550 	  && c1 != ISO_CODE_ESC)
3551 	{
3552 	  int rule;
3553 
3554 	  DECODE_COMPOSITION_RULE (rule);
3555 	  STORE_COMPOSITION_RULE (rule);
3556 	  continue;
3557 	}
3558 
3559       /* We produce at most one character.  */
3560       switch (iso_code_class [c1])
3561 	{
3562 	case ISO_0x20_or_0x7F:
3563 	  if (charset_id_0 < 0
3564 	      || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3565 	    /* This is SPACE or DEL.  */
3566 	    charset = CHARSET_FROM_ID (charset_ascii);
3567 	  else
3568 	    charset = CHARSET_FROM_ID (charset_id_0);
3569 	  break;
3570 
3571 	case ISO_graphic_plane_0:
3572 	  if (charset_id_0 < 0)
3573 	    charset = CHARSET_FROM_ID (charset_ascii);
3574 	  else
3575 	    charset = CHARSET_FROM_ID (charset_id_0);
3576 	  break;
3577 
3578 	case ISO_0xA0_or_0xFF:
3579 	  if (charset_id_1 < 0
3580 	      || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3581 	      || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3582 	    goto invalid_code;
3583 	  /* This is a graphic character, we fall down ... */
3584 	  FALLTHROUGH;
3585 	case ISO_graphic_plane_1:
3586 	  if (charset_id_1 < 0)
3587 	    goto invalid_code;
3588 	  charset = CHARSET_FROM_ID (charset_id_1);
3589 	  break;
3590 
3591 	case ISO_control_0:
3592 	  if (eol_dos && c1 == '\r')
3593 	    ONE_MORE_BYTE (byte_after_cr);
3594 	  MAYBE_FINISH_COMPOSITION ();
3595 	  charset = CHARSET_FROM_ID (charset_ascii);
3596 	  break;
3597 
3598 	case ISO_control_1:
3599 	  goto invalid_code;
3600 
3601 	case ISO_shift_out:
3602 	  if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3603 	      || CODING_ISO_DESIGNATION (coding, 1) < 0)
3604 	    goto invalid_code;
3605 	  CODING_ISO_INVOCATION (coding, 0) = 1;
3606 	  charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3607 	  continue;
3608 
3609 	case ISO_shift_in:
3610 	  if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3611 	    goto invalid_code;
3612 	  CODING_ISO_INVOCATION (coding, 0) = 0;
3613 	  charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3614 	  continue;
3615 
3616 	case ISO_single_shift_2_7:
3617 	  if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3618 	    goto invalid_code;
3619 	  FALLTHROUGH;
3620 	case ISO_single_shift_2:
3621 	  if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3622 	    goto invalid_code;
3623 	  /* SS2 is handled as an escape sequence of ESC 'N' */
3624 	  c1 = 'N';
3625 	  goto label_escape_sequence;
3626 
3627 	case ISO_single_shift_3:
3628 	  if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3629 	    goto invalid_code;
	  /* SS3 is handled as an escape sequence of ESC 'O' */
3631 	  c1 = 'O';
3632 	  goto label_escape_sequence;
3633 
3634 	case ISO_control_sequence_introducer:
3635 	  /* CSI is handled as an escape sequence of ESC '[' ...  */
3636 	  c1 = '[';
3637 	  goto label_escape_sequence;
3638 
3639 	case ISO_escape:
3640 	  ONE_MORE_BYTE (c1);
3641 	label_escape_sequence:
3642 	  /* Escape sequences handled here are invocation,
3643 	     designation, direction specification, and character
3644 	     composition specification.  */
3645 	  switch (c1)
3646 	    {
3647 	    case '&':		/* revision of following character set */
3648 	      ONE_MORE_BYTE (c1);
3649 	      if (!(c1 >= '@' && c1 <= '~'))
3650 		goto invalid_code;
3651 	      ONE_MORE_BYTE (c1);
3652 	      if (c1 != ISO_CODE_ESC)
3653 		goto invalid_code;
3654 	      ONE_MORE_BYTE (c1);
3655 	      goto label_escape_sequence;
3656 
3657 	    case '$':		/* designation of 2-byte character set */
3658 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3659 		goto invalid_code;
3660 	      {
3661 		int reg, chars96;
3662 
3663 		ONE_MORE_BYTE (c1);
3664 		if (c1 >= '@' && c1 <= 'B')
3665 		  {	/* designation of JISX0208.1978, GB2312.1980,
3666 			   or JISX0208.1980 */
3667 		    reg = 0, chars96 = 0;
3668 		  }
3669 		else if (c1 >= 0x28 && c1 <= 0x2B)
3670 		  { /* designation of DIMENSION2_CHARS94 character set */
3671 		    reg = c1 - 0x28, chars96 = 0;
3672 		    ONE_MORE_BYTE (c1);
3673 		  }
3674 		else if (c1 >= 0x2C && c1 <= 0x2F)
3675 		  { /* designation of DIMENSION2_CHARS96 character set */
3676 		    reg = c1 - 0x2C, chars96 = 1;
3677 		    ONE_MORE_BYTE (c1);
3678 		  }
3679 		else
3680 		  goto invalid_code;
3681 		DECODE_DESIGNATION (reg, 2, chars96, c1);
3682 		/* We must update these variables now.  */
3683 		if (reg == 0)
3684 		  charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3685 		else if (reg == 1)
3686 		  charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3687 		if (chars96 < 0)
3688 		  goto invalid_code;
3689 	      }
3690 	      continue;
3691 
3692 	    case 'n':		/* invocation of locking-shift-2 */
3693 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3694 		  || CODING_ISO_DESIGNATION (coding, 2) < 0)
3695 		goto invalid_code;
3696 	      CODING_ISO_INVOCATION (coding, 0) = 2;
3697 	      charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3698 	      continue;
3699 
3700 	    case 'o':		/* invocation of locking-shift-3 */
3701 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3702 		  || CODING_ISO_DESIGNATION (coding, 3) < 0)
3703 		goto invalid_code;
3704 	      CODING_ISO_INVOCATION (coding, 0) = 3;
3705 	      charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3706 	      continue;
3707 
3708 	    case 'N':		/* invocation of single-shift-2 */
3709 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3710 		  || CODING_ISO_DESIGNATION (coding, 2) < 0)
3711 		goto invalid_code;
3712 	      charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3713 	      if (charset_id_2 < 0)
3714 		charset = CHARSET_FROM_ID (charset_ascii);
3715 	      else
3716 		charset = CHARSET_FROM_ID (charset_id_2);
3717 	      ONE_MORE_BYTE (c1);
3718 	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3719 		  || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3720 		      && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3721 			  ? c1 >= 0x80 : c1 < 0x80)))
3722 		goto invalid_code;
3723 	      break;
3724 
3725 	    case 'O':		/* invocation of single-shift-3 */
3726 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3727 		  || CODING_ISO_DESIGNATION (coding, 3) < 0)
3728 		goto invalid_code;
3729 	      charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3730 	      if (charset_id_3 < 0)
3731 		charset = CHARSET_FROM_ID (charset_ascii);
3732 	      else
3733 		charset = CHARSET_FROM_ID (charset_id_3);
3734 	      ONE_MORE_BYTE (c1);
3735 	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3736 		  || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3737 		      && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3738 			  ? c1 >= 0x80 : c1 < 0x80)))
3739 		goto invalid_code;
3740 	      break;
3741 
3742 	    case '0': case '2':	case '3': case '4': /* start composition */
3743 	      if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3744 		goto invalid_code;
3745 	      if (last_id != charset_ascii)
3746 		{
3747 		  ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3748 		  last_id = charset_ascii;
3749 		  last_offset = char_offset;
3750 		}
3751 	      DECODE_COMPOSITION_START (c1);
3752 	      continue;
3753 
3754 	    case '1':		/* end composition */
3755 	      if (cmp_status->state == COMPOSING_NO)
3756 		goto invalid_code;
3757 	      DECODE_COMPOSITION_END ();
3758 	      continue;
3759 
3760 	    case '[':		/* specification of direction */
3761 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3762 		goto invalid_code;
3763 	      /* For the moment, nested direction is not supported.
3764 		 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3765 		 left-to-right, and nonzero means right-to-left.  */
3766 	      ONE_MORE_BYTE (c1);
3767 	      switch (c1)
3768 		{
3769 		case ']':	/* end of the current direction */
3770 		  coding->mode &= ~CODING_MODE_DIRECTION;
3771 		  break;
3772 
3773 		case '0':	/* end of the current direction */
3774 		case '1':	/* start of left-to-right direction */
3775 		  ONE_MORE_BYTE (c1);
3776 		  if (c1 == ']')
3777 		    coding->mode &= ~CODING_MODE_DIRECTION;
3778 		  else
3779 		    goto invalid_code;
3780 		  break;
3781 
3782 		case '2':	/* start of right-to-left direction */
3783 		  ONE_MORE_BYTE (c1);
3784 		  if (c1 == ']')
3785 		    coding->mode |= CODING_MODE_DIRECTION;
3786 		  else
3787 		    goto invalid_code;
3788 		  break;
3789 
3790 		default:
3791 		  goto invalid_code;
3792 		}
3793 	      continue;
3794 
3795 	    case '%':
3796 	      ONE_MORE_BYTE (c1);
3797 	      if (c1 == '/')
3798 		{
3799 		  /* CTEXT extended segment:
3800 		     ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3801 		     We keep these bytes as is for the moment.
3802 		     They may be decoded by post-read-conversion.  */
3803 		  int dim, M, L;
3804 		  int size;
3805 
3806 		  ONE_MORE_BYTE (dim);
3807 		  if (dim < '0' || dim > '4')
3808 		    goto invalid_code;
3809 		  ONE_MORE_BYTE (M);
3810 		  if (M < 128)
3811 		    goto invalid_code;
3812 		  ONE_MORE_BYTE (L);
3813 		  if (L < 128)
3814 		    goto invalid_code;
3815 		  size = ((M - 128) * 128) + (L - 128);
3816 		  if (charbuf + 6 > charbuf_end)
3817 		    goto break_loop;
3818 		  *charbuf++ = ISO_CODE_ESC;
3819 		  *charbuf++ = '%';
3820 		  *charbuf++ = '/';
3821 		  *charbuf++ = dim;
3822 		  *charbuf++ = BYTE8_TO_CHAR (M);
3823 		  *charbuf++ = BYTE8_TO_CHAR (L);
3824 		  CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3825 		}
3826 	      else if (c1 == 'G')
3827 		{
3828 		  /* XFree86 extension for embedding UTF-8 in CTEXT:
3829 		     ESC % G --UTF-8-BYTES-- ESC % @
3830 		     We keep these bytes as is for the moment.
3831 		     They may be decoded by post-read-conversion.  */
3832 		  if (charbuf + 3 > charbuf_end)
3833 		    goto break_loop;
3834 		  *charbuf++ = ISO_CODE_ESC;
3835 		  *charbuf++ = '%';
3836 		  *charbuf++ = 'G';
3837 		  CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3838 		}
3839 	      else
3840 		goto invalid_code;
3841 	      continue;
3842 	      break;
3843 
3844 	    default:
3845 	      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3846 		goto invalid_code;
3847 	      {
3848 		int reg, chars96;
3849 
3850 		if (c1 >= 0x28 && c1 <= 0x2B)
3851 		  { /* designation of DIMENSION1_CHARS94 character set */
3852 		    reg = c1 - 0x28, chars96 = 0;
3853 		    ONE_MORE_BYTE (c1);
3854 		  }
3855 		else if (c1 >= 0x2C && c1 <= 0x2F)
3856 		  { /* designation of DIMENSION1_CHARS96 character set */
3857 		    reg = c1 - 0x2C, chars96 = 1;
3858 		    ONE_MORE_BYTE (c1);
3859 		  }
3860 		else
3861 		  goto invalid_code;
3862 		DECODE_DESIGNATION (reg, 1, chars96, c1);
3863 		/* We must update these variables now.  */
3864 		if (reg == 0)
3865 		  charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3866 		else if (reg == 1)
3867 		  charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3868 		if (chars96 < 0)
3869 		  goto invalid_code;
3870 	      }
3871 	      continue;
3872 	    }
3873 	  break;
3874 
3875 	default:
3876 	  emacs_abort ();
3877 	}
3878 
3879       if (cmp_status->state == COMPOSING_NO
3880 	  && charset->id != charset_ascii
3881 	  && last_id != charset->id)
3882 	{
3883 	  if (last_id != charset_ascii)
3884 	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3885 	  last_id = charset->id;
3886 	  last_offset = char_offset;
3887 	}
3888 
3889       /* Now we know CHARSET and 1st position code C1 of a character.
3890          Produce a decoded character while getting 2nd and 3rd
3891          position codes C2, C3 if necessary.  */
3892       if (CHARSET_DIMENSION (charset) > 1)
3893 	{
3894 	  ONE_MORE_BYTE (c2);
3895 	  if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3896 	      || ((c1 & 0x80) != (c2 & 0x80)))
3897 	    /* C2 is not in a valid range.  */
3898 	    goto invalid_code;
3899 	  if (CHARSET_DIMENSION (charset) == 2)
3900 	    c1 = (c1 << 8) | c2;
3901 	  else
3902 	    {
3903 	      ONE_MORE_BYTE (c3);
3904 	      if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3905 		  || ((c1 & 0x80) != (c3 & 0x80)))
3906 		/* C3 is not in a valid range.  */
3907 		goto invalid_code;
3908 	      c1 = (c1 << 16) | (c2 << 8) | c2;
3909 	    }
3910 	}
3911       c1 &= 0x7F7F7F;
3912       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3913       if (c < 0)
3914 	{
3915 	  MAYBE_FINISH_COMPOSITION ();
3916 	  for (; src_base < src; src_base++, char_offset++)
3917 	    {
3918 	      if (ASCII_CHAR_P (*src_base))
3919 		*charbuf++ = *src_base;
3920 	      else
3921 		*charbuf++ = BYTE8_TO_CHAR (*src_base);
3922 	    }
3923 	}
3924       else if (cmp_status->state == COMPOSING_NO)
3925 	{
3926 	  *charbuf++ = c;
3927 	  char_offset++;
3928 	}
3929       else if ((cmp_status->state == COMPOSING_CHAR
3930 		? cmp_status->nchars
3931 		: cmp_status->ncomps)
3932 	       >= MAX_COMPOSITION_COMPONENTS)
3933 	{
3934 	  /* Too long composition.  */
3935 	  MAYBE_FINISH_COMPOSITION ();
3936 	  *charbuf++ = c;
3937 	  char_offset++;
3938 	}
3939       else
3940 	STORE_COMPOSITION_CHAR (c);
3941       continue;
3942 
3943     invalid_code:
3944       MAYBE_FINISH_COMPOSITION ();
3945       src = src_base;
3946       consumed_chars = consumed_chars_base;
3947       ONE_MORE_BYTE (c);
3948       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3949       char_offset++;
3950       /* Reset the invocation and designation status to the safest
3951 	 one; i.e. designate ASCII to the graphic register 0, and
3952 	 invoke that register to the graphic plane 0.  This typically
3953 	 helps the case that a designation sequence for ASCII "ESC (
3954 	 B" is somehow broken (e.g. broken by a newline).  */
3955       CODING_ISO_INVOCATION (coding, 0) = 0;
3956       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3957       charset_id_0 = charset_ascii;
3958       continue;
3959 
3960     break_loop:
3961       break;
3962     }
3963 
3964  no_more_source:
3965   if (cmp_status->state != COMPOSING_NO)
3966     {
3967       if (coding->mode & CODING_MODE_LAST_BLOCK)
3968 	MAYBE_FINISH_COMPOSITION ();
3969       else
3970 	{
3971 	  charbuf -= cmp_status->length;
3972 	  for (i = 0; i < cmp_status->length; i++)
3973 	    cmp_status->carryover[i] = charbuf[i];
3974 	}
3975     }
3976   else if (last_id != charset_ascii)
3977     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3978   coding->consumed_char += consumed_chars_base;
3979   coding->consumed = src_base - coding->source;
3980   coding->charbuf_used = charbuf - coding->charbuf;
3981 }
3982 
3983 
3984 /* ISO2022 encoding stuff.  */
3985 
3986 /*
3987    It is not enough to say just "ISO2022" on encoding, we have to
3988    specify more details.  In Emacs, each coding system of ISO2022
3989    variant has the following specifications:
3990 	1. Initial designation to G0 thru G3.
3991 	2. Allows short-form designation?
3992 	3. ASCII should be designated to G0 before control characters?
3993 	4. ASCII should be designated to G0 at end of line?
3994 	5. 7-bit environment or 8-bit environment?
3995 	6. Use locking-shift?
3996 	7. Use Single-shift?
3997    And the following two are only for Japanese:
3998 	8. Use ASCII in place of JIS0201-1976-Roman?
3999 	9. Use JISX0208-1983 in place of JISX0208-1978?
4000    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4001    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4002    details.
4003 */
4004 
/* Emit at DST (advancing DST) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   A revision prefix "ESC & <'@' + revision>" is emitted first when
   CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative ISO revision.  For a multi-byte 94-charset whose
   <final-char> is '@', 'A', or 'B', the short form (no intermediate
   character) is used when REG is 0 and CODING does not have
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    const char *intermediate_char_94 = "()*+";				\
    const char *intermediate_char_96 = ",-./";				\
    unsigned char final_char = CHARSET_ISO_FINAL (charset);		\
    int revision							\
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)		\
	 ? CHARSET_ISO_REVISION (charset) : -1);			\
									\
    /* Optional revision prefix.  */					\
    if (revision >= 0)							\
      {									\
	EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');			\
	EMIT_ONE_BYTE ('@' + revision);					\
      }									\
									\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);					\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	/* Multi-byte set: ESC '$' [<intermediate>] <final>.  */	\
	EMIT_ONE_ASCII_BYTE ('$');					\
	if (CHARSET_ISO_CHARS_96 (charset))				\
	  EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);		\
	else if (reg != 0						\
		 || final_char < '@' || final_char > 'B'		\
		 || (CODING_ISO_FLAGS (coding)				\
		     & CODING_ISO_FLAG_LONG_FORM))			\
	  EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);		\
      }									\
    else								\
      {									\
	/* Single-byte set: ESC <intermediate> <final>.  */		\
	EMIT_ONE_ASCII_BYTE (CHARSET_ISO_CHARS_96 (charset)		\
			     ? intermediate_char_96[reg]		\
			     : intermediate_char_94[reg]);		\
      }									\
    EMIT_ONE_ASCII_BYTE (final_char);					\
									\
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);	\
  } while (0)
4052 
4053 
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3): the escape sequence form when
   CODING is a 7-bit coding system, the single control byte otherwise.
   Both also put CODING into the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2						\
  do {									\
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
      EMIT_ONE_BYTE (ISO_CODE_SS2);					\
    else								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');				\
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;				\
  } while (0)
4066 
4067 
/* Emit single-shift-3: ESC 'O' for a 7-bit coding system, otherwise
   the control byte ISO_CODE_SS3; then put CODING into the
   single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3						\
  do {									\
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
      EMIT_ONE_BYTE (ISO_CODE_SS3);					\
    else								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');				\
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;				\
  } while (0)
4076 
4077 
/* The following four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in CODING which
   graphic register is now invoked to graphic plane 0.  */

/* Shift-in: emit ISO_CODE_SI and note that register 0 is invoked.  */
#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_ISO_INVOCATION (coding, 0) = 0;		\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);			\
  } while (0)
4087 
4088 
/* Shift-out: emit ISO_CODE_SO and note that register 1 is invoked to
   graphic plane 0.  */
#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_ISO_INVOCATION (coding, 0) = 1;		\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);			\
  } while (0)
4094 
4095 
/* Locking-shift-2: emit ESC 'n' and note that register 2 is invoked
   to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_ISO_INVOCATION (coding, 0) = 2;		\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');		\
  } while (0)
4101 
4102 
/* Locking-shift-3: emit ESC 'o' and note that register 3 is invoked
   to graphic plane 0.  The final byte must be 'o'; ESC 'n' is
   locking-shift-2, and the ISO-2022 decoder in this file accepts
   ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
4108 
4109 
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by JISX0201-Roman.  C1 may be any integer expression; it is
   parenthesized on every use.  The macro uses the variables `coding',
   `dst' and `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    int id = CHARSET_ID (charset);					\
									\
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)		\
	&& id == charset_ascii)						\
      {									\
	id = charset_jisx0201_roman;					\
	charset = CHARSET_FROM_ID (id);					\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	/* CODING is in the single-shifting state: produce the		\
	   shifted character and leave that state.  */			\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
	  EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	else								\
	  EMIT_ONE_BYTE ((c1) | 0x80);					\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))		\
      {									\
	/* Invoked to graphic plane 0: high bit clear.  */		\
	EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	break;								\
      }									\
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))		\
      {									\
	/* Invoked to graphic plane 1: high bit set.  */		\
	EMIT_ONE_BYTE ((c1) | 0x80);					\
	break;								\
      }									\
    else								\
      /* Since CHARSET is not yet invoked to any graphic planes, we	\
	 must invoke it, or, at first, designate it to some graphic	\
	 register.  Then repeat the loop to actually produce the	\
	 character.  */							\
      dst = encode_invocation_designation (charset, coding, dst,	\
					   &produced_chars);		\
  } while (1)
4152 
4153 
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208-1978.  The macro uses the variables `coding',
   `dst' and `produced_chars' of the enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    int id = CHARSET_ID (charset);					\
									\
    if (id == charset_jisx0208						\
	&& (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))	\
      {									\
	id = charset_jisx0208_1978;					\
	charset = CHARSET_FROM_ID (id);					\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	/* CODING is in the single-shifting state: produce the		\
	   shifted character and leave that state.  */			\
	if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
	  EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	else								\
	  EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))			\
      {									\
	/* Invoked to graphic plane 0: high bits clear.  */		\
	EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))			\
      {									\
	/* Invoked to graphic plane 1: high bits set.  */		\
	EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	break;								\
      }									\
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it	\
       (designating it to a graphic register first if needed), then	\
       go around the loop again to actually produce the character.  */	\
    dst = encode_invocation_designation (charset, coding, dst,		\
					 &produced_chars);		\
  } while (1)
4196 
4197 
/* Encode character C of CHARSET: obtain its code in CHARSET with
   CODING_ENCODE_CHAR, then emit it with the DIMENSION1 macro when
   CHARSET has dimension 1, and otherwise with the DIMENSION2 macro,
   passing `code >> 8' and the low byte of the code as the two
   position-codes.  */
#define ENCODE_ISO_CHARACTER(charset, c)				   \
  do {									   \
    unsigned code;							   \
									   \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);	   \
    if (CHARSET_DIMENSION (charset) != 1)				   \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else								   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);		   \
  } while (0)
4208 
4209 
4210 /* Produce designation and invocation codes at a place pointed by DST
4211    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4212    Return new DST.  */
4213 
4214 static unsigned char *
encode_invocation_designation(struct charset * charset,struct coding_system * coding,unsigned char * dst,ptrdiff_t * p_nchars)4215 encode_invocation_designation (struct charset *charset,
4216 			       struct coding_system *coding,
4217 			       unsigned char *dst, ptrdiff_t *p_nchars)
4218 {
4219   bool multibytep = coding->dst_multibyte;
4220   ptrdiff_t produced_chars = *p_nchars;
4221   int reg;			/* graphic register number */
4222   int id = CHARSET_ID (charset);
4223 
4224   /* At first, check designations.  */
4225   for (reg = 0; reg < 4; reg++)
4226     if (id == CODING_ISO_DESIGNATION (coding, reg))
4227       break;
4228 
4229   if (reg >= 4)
4230     {
4231       /* CHARSET is not yet designated to any graphic registers.  */
4232       /* At first check the requested designation.  */
4233       reg = CODING_ISO_REQUEST (coding, id);
4234       if (reg < 0)
4235 	/* Since CHARSET requests no special designation, designate it
4236 	   to graphic register 0.  */
4237 	reg = 0;
4238 
4239       ENCODE_DESIGNATION (charset, reg, coding);
4240     }
4241 
4242   if (CODING_ISO_INVOCATION (coding, 0) != reg
4243       && CODING_ISO_INVOCATION (coding, 1) != reg)
4244     {
4245       /* Since the graphic register REG is not invoked to any graphic
4246 	 planes, invoke it to graphic plane 0.  */
4247       switch (reg)
4248 	{
4249 	case 0:			/* graphic register 0 */
4250 	  ENCODE_SHIFT_IN;
4251 	  break;
4252 
4253 	case 1:			/* graphic register 1 */
4254 	  ENCODE_SHIFT_OUT;
4255 	  break;
4256 
4257 	case 2:			/* graphic register 2 */
4258 	  if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4259 	    ENCODE_SINGLE_SHIFT_2;
4260 	  else
4261 	    ENCODE_LOCKING_SHIFT_2;
4262 	  break;
4263 
4264 	case 3:			/* graphic register 3 */
4265 	  if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4266 	    ENCODE_SINGLE_SHIFT_3;
4267 	  else
4268 	    ENCODE_LOCKING_SHIFT_3;
4269 	  break;
4270 
4271 	default:
4272 	  break;
4273 	}
4274     }
4275 
4276   *p_nchars = produced_chars;
4277   return dst;
4278 }
4279 
4280 
/* Produce the codes that reset the graphic planes and registers to
   their initial state: a shift-in when graphic plane 0 does not
   currently invoke register 0, then a designation sequence for every
   register whose initial charset is non-negative and differs from its
   current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()				\
  do {									\
    int reg;								\
    struct charset *charset;						\
									\
    if (CODING_ISO_INVOCATION (coding, 0) != 0)				\
      ENCODE_SHIFT_IN;							\
    for (reg = 0; reg < 4; reg++)					\
      {									\
	/* Nothing to do for a register with no initial charset or	\
	   one that already holds it.  */				\
	if (CODING_ISO_INITIAL (coding, reg) < 0			\
	    || (CODING_ISO_DESIGNATION (coding, reg)			\
		== CODING_ISO_INITIAL (coding, reg)))			\
	  continue;							\
	charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));	\
	ENCODE_DESIGNATION (charset, reg, coding);			\
      }									\
  } while (0)
4299 
4300 
4301 /* Produce designation sequences of charsets in the line started from
4302    CHARBUF to a place pointed by DST, and return the number of
4303    produced bytes.  DST should not directly point a buffer text area
4304    which may be relocated by char_charset call.
4305 
4306    If the current block ends before any end-of-line, we may fail to
4307    find all the necessary designations.  */
4308 
4309 static ptrdiff_t
encode_designation_at_bol(struct coding_system * coding,int * charbuf,int * charbuf_end,unsigned char * dst)4310 encode_designation_at_bol (struct coding_system *coding,
4311 			   int *charbuf, int *charbuf_end,
4312 			   unsigned char *dst)
4313 {
4314   unsigned char *orig = dst;
4315   struct charset *charset;
4316   /* Table of charsets to be designated to each graphic register.  */
4317   int r[4];
4318   int c, found = 0, reg;
4319   ptrdiff_t produced_chars = 0;
4320   bool multibytep = coding->dst_multibyte;
4321   Lisp_Object attrs;
4322   Lisp_Object charset_list;
4323 
4324   attrs = CODING_ID_ATTRS (coding->id);
4325   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4326   if (EQ (charset_list, Qiso_2022))
4327     charset_list = Viso_2022_charset_list;
4328 
4329   for (reg = 0; reg < 4; reg++)
4330     r[reg] = -1;
4331 
4332   while (charbuf < charbuf_end && found < 4)
4333     {
4334       int id;
4335 
4336       c = *charbuf++;
4337       if (c == '\n')
4338 	break;
4339       charset = char_charset (c, charset_list, NULL);
4340       id = CHARSET_ID (charset);
4341       reg = CODING_ISO_REQUEST (coding, id);
4342       if (reg >= 0 && r[reg] < 0)
4343 	{
4344 	  found++;
4345 	  r[reg] = id;
4346 	}
4347     }
4348 
4349   if (found)
4350     {
4351       for (reg = 0; reg < 4; reg++)
4352 	if (r[reg] >= 0
4353 	    && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4354 	  ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4355     }
4356 
4357   return dst - orig;
4358 }
4359 
4360 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4361 
4362 static bool
encode_coding_iso_2022(struct coding_system * coding)4363 encode_coding_iso_2022 (struct coding_system *coding)
4364 {
4365   bool multibytep = coding->dst_multibyte;
4366   int *charbuf = coding->charbuf;
4367   int *charbuf_end = charbuf + coding->charbuf_used;
4368   unsigned char *dst = coding->destination + coding->produced;
4369   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4370   int safe_room = 16;
4371   bool bol_designation
4372     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4373        && CODING_ISO_BOL (coding));
4374   ptrdiff_t produced_chars = 0;
4375   Lisp_Object attrs, eol_type, charset_list;
4376   bool ascii_compatible;
4377   int c;
4378   int preferred_charset_id = -1;
4379 
4380   CODING_GET_INFO (coding, attrs, charset_list);
4381   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4382   if (VECTORP (eol_type))
4383     eol_type = Qunix;
4384 
4385   setup_iso_safe_charsets (attrs);
4386   /* Charset list may have been changed.  */
4387   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4388   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4389 
4390   ascii_compatible
4391     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4392        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4393 					  | CODING_ISO_FLAG_LOCKING_SHIFT)));
4394 
4395   while (charbuf < charbuf_end)
4396     {
4397       ASSURE_DESTINATION (safe_room);
4398 
4399       if (bol_designation)
4400 	{
4401 	  /* We have to produce designation sequences if any now.  */
4402 	  unsigned char desig_buf[16];
4403 	  ptrdiff_t nbytes;
4404 	  ptrdiff_t offset;
4405 
4406 	  charset_map_loaded = 0;
4407 	  nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4408 					      desig_buf);
4409 	  if (charset_map_loaded
4410 	      && (offset = coding_change_destination (coding)))
4411 	    {
4412 	      dst += offset;
4413 	      dst_end += offset;
4414 	    }
4415 	  memcpy (dst, desig_buf, nbytes);
4416 	  dst += nbytes;
4417 	  /* We are sure that designation sequences are all ASCII bytes.  */
4418 	  produced_chars += nbytes;
4419 	  bol_designation = 0;
4420 	  ASSURE_DESTINATION (safe_room);
4421 	}
4422 
4423       c = *charbuf++;
4424 
4425       if (c < 0)
4426 	{
4427 	  /* Handle an annotation.  */
4428 	  switch (*charbuf)
4429 	    {
4430 	    case CODING_ANNOTATE_COMPOSITION_MASK:
4431 	      /* Not yet implemented.  */
4432 	      break;
4433 	    case CODING_ANNOTATE_CHARSET_MASK:
4434 	      preferred_charset_id = charbuf[2];
4435 	      if (preferred_charset_id >= 0
4436 		  && NILP (Fmemq (make_fixnum (preferred_charset_id),
4437 				  charset_list)))
4438 		preferred_charset_id = -1;
4439 	      break;
4440 	    default:
4441 	      emacs_abort ();
4442 	    }
4443 	  charbuf += -c - 1;
4444 	  continue;
4445 	}
4446 
4447       /* Now encode the character C.  */
4448       if (c < 0x20 || c == 0x7F)
4449 	{
4450 	  if (c == '\n'
4451 	      || (c == '\r' && EQ (eol_type, Qmac)))
4452 	    {
4453 	      if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4454 		ENCODE_RESET_PLANE_AND_REGISTER ();
4455 	      if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4456 		{
4457 		  int i;
4458 
4459 		  for (i = 0; i < 4; i++)
4460 		    CODING_ISO_DESIGNATION (coding, i)
4461 		      = CODING_ISO_INITIAL (coding, i);
4462 		}
4463 	      bol_designation = ((CODING_ISO_FLAGS (coding)
4464 				  & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4465 				 != 0);
4466 	    }
4467 	  else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4468 	    ENCODE_RESET_PLANE_AND_REGISTER ();
4469 	  EMIT_ONE_ASCII_BYTE (c);
4470 	}
4471       else if (ASCII_CHAR_P (c))
4472 	{
4473 	  if (ascii_compatible)
4474 	    EMIT_ONE_ASCII_BYTE (c);
4475 	  else
4476 	    {
4477 	      struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4478 	      ENCODE_ISO_CHARACTER (charset, c);
4479 	    }
4480 	}
4481       else if (CHAR_BYTE8_P (c))
4482 	{
4483 	  c = CHAR_TO_BYTE8 (c);
4484 	  EMIT_ONE_BYTE (c);
4485 	}
4486       else
4487 	{
4488 	  struct charset *charset;
4489 
4490 	  if (preferred_charset_id >= 0)
4491 	    {
4492 	      bool result;
4493 
4494 	      charset = CHARSET_FROM_ID (preferred_charset_id);
4495 	      CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4496 	      if (! result)
4497 		CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4498 				     NULL, charset);
4499 	    }
4500 	  else
4501 	    CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4502 				 NULL, charset);
4503 	  if (!charset)
4504 	    {
4505 	      if (coding->mode & CODING_MODE_SAFE_ENCODING)
4506 		{
4507 		  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4508 		  charset = CHARSET_FROM_ID (charset_ascii);
4509 		}
4510 	      else
4511 		{
4512 		  c = coding->default_char;
4513 		  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4514 				       charset_list, NULL, charset);
4515 		}
4516 	    }
4517 	  ENCODE_ISO_CHARACTER (charset, c);
4518 	}
4519     }
4520 
4521   if (coding->mode & CODING_MODE_LAST_BLOCK
4522       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4523     {
4524       ASSURE_DESTINATION (safe_room);
4525       ENCODE_RESET_PLANE_AND_REGISTER ();
4526     }
4527   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4528   CODING_ISO_BOL (coding) = bol_designation;
4529   coding->produced_char += produced_chars;
4530   coding->produced = dst - coding->destination;
4531   return 0;
4532 }
4533 
4534 
/*** 8. SJIS and BIG5 handlers ***/
4536 
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
4540 
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4547 
4548    --- CODE RANGE of SJIS ---
4549    (character set)	(range)
4550    ASCII		0x00 .. 0x7F
4551    KATAKANA-JISX0201	0xA0 .. 0xDF
4552    JISX0208 (1st byte)	0x81 .. 0x9F and 0xE0 .. 0xEF
4553 	    (2nd byte)	0x40 .. 0x7E and 0x80 .. 0xFC
4554    -------------------------------
4555 
4556 */
4557 
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4561 
4562    --- CODE RANGE of BIG5 ---
4563    (character set)	(range)
4564    ASCII		0x00 .. 0x7F
4565    Big5 (1st byte)	0xA1 .. 0xFE
4566 	(2nd byte)	0x40 .. 0x7E and 0xA1 .. 0xFE
4567    --------------------------
4568 
4569   */
4570 
4571 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4572    Return true if a text is encoded in SJIS.  */
4573 
4574 static bool
detect_coding_sjis(struct coding_system * coding,struct coding_detection_info * detect_info)4575 detect_coding_sjis (struct coding_system *coding,
4576 		    struct coding_detection_info *detect_info)
4577 {
4578   const unsigned char *src = coding->source, *src_base;
4579   const unsigned char *src_end = coding->source + coding->src_bytes;
4580   bool multibytep = coding->src_multibyte;
4581   ptrdiff_t consumed_chars = 0;
4582   int found = 0;
4583   int c;
4584   Lisp_Object attrs, charset_list;
4585   int max_first_byte_of_2_byte_code;
4586 
4587   CODING_GET_INFO (coding, attrs, charset_list);
4588   max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
4589 
4590   detect_info->checked |= CATEGORY_MASK_SJIS;
4591   /* A coding system of this category is always ASCII compatible.  */
4592   src += coding->head_ascii;
4593 
4594   while (1)
4595     {
4596       src_base = src;
4597       ONE_MORE_BYTE (c);
4598       if (c < 0x80)
4599 	continue;
4600       if ((c >= 0x81 && c <= 0x9F)
4601 	  || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4602 	{
4603 	  ONE_MORE_BYTE (c);
4604 	  if (c < 0x40 || c == 0x7F || c > 0xFC)
4605 	    break;
4606 	  found = CATEGORY_MASK_SJIS;
4607 	}
4608       else if (c >= 0xA0 && c < 0xE0)
4609 	found = CATEGORY_MASK_SJIS;
4610       else
4611 	break;
4612     }
4613   detect_info->rejected |= CATEGORY_MASK_SJIS;
4614   return 0;
4615 
4616  no_more_source:
4617   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4618     {
4619       detect_info->rejected |= CATEGORY_MASK_SJIS;
4620       return 0;
4621     }
4622   detect_info->found |= found;
4623   return 1;
4624 }
4625 
4626 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4627    Return true if a text is encoded in BIG5.  */
4628 
4629 static bool
detect_coding_big5(struct coding_system * coding,struct coding_detection_info * detect_info)4630 detect_coding_big5 (struct coding_system *coding,
4631 		    struct coding_detection_info *detect_info)
4632 {
4633   const unsigned char *src = coding->source, *src_base;
4634   const unsigned char *src_end = coding->source + coding->src_bytes;
4635   bool multibytep = coding->src_multibyte;
4636   ptrdiff_t consumed_chars = 0;
4637   int found = 0;
4638   int c;
4639 
4640   detect_info->checked |= CATEGORY_MASK_BIG5;
4641   /* A coding system of this category is always ASCII compatible.  */
4642   src += coding->head_ascii;
4643 
4644   while (1)
4645     {
4646       src_base = src;
4647       ONE_MORE_BYTE (c);
4648       if (c < 0x80)
4649 	continue;
4650       if (c >= 0xA1)
4651 	{
4652 	  ONE_MORE_BYTE (c);
4653 	  if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4654 	    return 0;
4655 	  found = CATEGORY_MASK_BIG5;
4656 	}
4657       else
4658 	break;
4659     }
4660   detect_info->rejected |= CATEGORY_MASK_BIG5;
4661   return 0;
4662 
4663  no_more_source:
4664   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4665     {
4666       detect_info->rejected |= CATEGORY_MASK_BIG5;
4667       return 0;
4668     }
4669   detect_info->found |= found;
4670   return 1;
4671 }
4672 
4673 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4674 
4675 static void
decode_coding_sjis(struct coding_system * coding)4676 decode_coding_sjis (struct coding_system *coding)
4677 {
4678   const unsigned char *src = coding->source + coding->consumed;
4679   const unsigned char *src_end = coding->source + coding->src_bytes;
4680   const unsigned char *src_base;
4681   int *charbuf = coding->charbuf + coding->charbuf_used;
4682   /* We may produce one charset annotation in one loop and one more at
4683      the end.  */
4684   int *charbuf_end
4685     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4686   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4687   bool multibytep = coding->src_multibyte;
4688   struct charset *charset_roman, *charset_kanji, *charset_kana;
4689   struct charset *charset_kanji2;
4690   Lisp_Object attrs, charset_list, val;
4691   ptrdiff_t char_offset = coding->produced_char;
4692   ptrdiff_t last_offset = char_offset;
4693   int last_id = charset_ascii;
4694   bool eol_dos
4695     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4696   int byte_after_cr = -1;
4697 
4698   CODING_GET_INFO (coding, attrs, charset_list);
4699 
4700   val = charset_list;
4701   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4702   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4703   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4704   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
4705 
4706   while (1)
4707     {
4708       int c, c1;
4709       struct charset *charset;
4710 
4711       src_base = src;
4712       consumed_chars_base = consumed_chars;
4713 
4714       if (charbuf >= charbuf_end)
4715 	{
4716 	  if (byte_after_cr >= 0)
4717 	    src_base--;
4718 	  break;
4719 	}
4720 
4721       if (byte_after_cr >= 0)
4722 	c = byte_after_cr, byte_after_cr = -1;
4723       else
4724 	ONE_MORE_BYTE (c);
4725       if (c < 0)
4726 	goto invalid_code;
4727       if (c < 0x80)
4728 	{
4729 	  if (eol_dos && c == '\r')
4730 	    ONE_MORE_BYTE (byte_after_cr);
4731 	  charset = charset_roman;
4732 	}
4733       else if (c == 0x80 || c == 0xA0)
4734 	goto invalid_code;
4735       else if (c >= 0xA1 && c <= 0xDF)
4736 	{
4737 	  /* SJIS -> JISX0201-Kana */
4738 	  c &= 0x7F;
4739 	  charset = charset_kana;
4740 	}
4741       else if (c <= 0xEF)
4742 	{
4743 	  /* SJIS -> JISX0208 */
4744 	  ONE_MORE_BYTE (c1);
4745 	  if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4746 	    goto invalid_code;
4747 	  c = (c << 8) | c1;
4748 	  SJIS_TO_JIS (c);
4749 	  charset = charset_kanji;
4750 	}
4751       else if (c <= 0xFC && charset_kanji2)
4752 	{
4753 	  /* SJIS -> JISX0213-2 */
4754 	  ONE_MORE_BYTE (c1);
4755 	  if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4756 	    goto invalid_code;
4757 	  c = (c << 8) | c1;
4758 	  SJIS_TO_JIS2 (c);
4759 	  charset = charset_kanji2;
4760 	}
4761       else
4762 	goto invalid_code;
4763       if (charset->id != charset_ascii
4764 	  && last_id != charset->id)
4765 	{
4766 	  if (last_id != charset_ascii)
4767 	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4768 	  last_id = charset->id;
4769 	  last_offset = char_offset;
4770 	}
4771       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4772       *charbuf++ = c;
4773       char_offset++;
4774       continue;
4775 
4776     invalid_code:
4777       src = src_base;
4778       consumed_chars = consumed_chars_base;
4779       ONE_MORE_BYTE (c);
4780       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4781       char_offset++;
4782     }
4783 
4784  no_more_source:
4785   if (last_id != charset_ascii)
4786     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4787   coding->consumed_char += consumed_chars_base;
4788   coding->consumed = src_base - coding->source;
4789   coding->charbuf_used = charbuf - coding->charbuf;
4790 }
4791 
4792 static void
decode_coding_big5(struct coding_system * coding)4793 decode_coding_big5 (struct coding_system *coding)
4794 {
4795   const unsigned char *src = coding->source + coding->consumed;
4796   const unsigned char *src_end = coding->source + coding->src_bytes;
4797   const unsigned char *src_base;
4798   int *charbuf = coding->charbuf + coding->charbuf_used;
4799   /* We may produce one charset annotation in one loop and one more at
4800      the end.  */
4801   int *charbuf_end
4802     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4803   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4804   bool multibytep = coding->src_multibyte;
4805   struct charset *charset_roman, *charset_big5;
4806   Lisp_Object attrs, charset_list, val;
4807   ptrdiff_t char_offset = coding->produced_char;
4808   ptrdiff_t last_offset = char_offset;
4809   int last_id = charset_ascii;
4810   bool eol_dos
4811     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4812   int byte_after_cr = -1;
4813 
4814   CODING_GET_INFO (coding, attrs, charset_list);
4815   val = charset_list;
4816   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4817   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
4818 
4819   while (1)
4820     {
4821       int c, c1;
4822       struct charset *charset;
4823 
4824       src_base = src;
4825       consumed_chars_base = consumed_chars;
4826 
4827       if (charbuf >= charbuf_end)
4828 	{
4829 	  if (byte_after_cr >= 0)
4830 	    src_base--;
4831 	  break;
4832 	}
4833 
4834       if (byte_after_cr >= 0)
4835 	c = byte_after_cr, byte_after_cr = -1;
4836       else
4837 	ONE_MORE_BYTE (c);
4838 
4839       if (c < 0)
4840 	goto invalid_code;
4841       if (c < 0x80)
4842 	{
4843 	  if (eol_dos && c == '\r')
4844 	    ONE_MORE_BYTE (byte_after_cr);
4845 	  charset = charset_roman;
4846 	}
4847       else
4848 	{
4849 	  /* BIG5 -> Big5 */
4850 	  if (c < 0xA1 || c > 0xFE)
4851 	    goto invalid_code;
4852 	  ONE_MORE_BYTE (c1);
4853 	  if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4854 	    goto invalid_code;
4855 	  c = c << 8 | c1;
4856 	  charset = charset_big5;
4857 	}
4858       if (charset->id != charset_ascii
4859 	  && last_id != charset->id)
4860 	{
4861 	  if (last_id != charset_ascii)
4862 	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4863 	  last_id = charset->id;
4864 	  last_offset = char_offset;
4865 	}
4866       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4867       *charbuf++ = c;
4868       char_offset++;
4869       continue;
4870 
4871     invalid_code:
4872       src = src_base;
4873       consumed_chars = consumed_chars_base;
4874       ONE_MORE_BYTE (c);
4875       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4876       char_offset++;
4877     }
4878 
4879  no_more_source:
4880   if (last_id != charset_ascii)
4881     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4882   coding->consumed_char += consumed_chars_base;
4883   coding->consumed = src_base - coding->source;
4884   coding->charbuf_used = charbuf - coding->charbuf;
4885 }
4886 
4887 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4888    This function can encode charsets `ascii', `katakana-jisx0201',
4889    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4890    are sure that all these charsets are registered as official charset
4891    (i.e. do not have extended leading-codes).  Characters of other
4892    charsets are produced without any encoding.  */
4893 
4894 static bool
encode_coding_sjis(struct coding_system * coding)4895 encode_coding_sjis (struct coding_system *coding)
4896 {
4897   bool multibytep = coding->dst_multibyte;
4898   int *charbuf = coding->charbuf;
4899   int *charbuf_end = charbuf + coding->charbuf_used;
4900   unsigned char *dst = coding->destination + coding->produced;
4901   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4902   int safe_room = 4;
4903   ptrdiff_t produced_chars = 0;
4904   Lisp_Object attrs, charset_list, val;
4905   bool ascii_compatible;
4906   struct charset *charset_kanji, *charset_kana;
4907   struct charset *charset_kanji2;
4908   int c;
4909 
4910   CODING_GET_INFO (coding, attrs, charset_list);
4911   val = XCDR (charset_list);
4912   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4913   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
4914   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
4915 
4916   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4917 
4918   while (charbuf < charbuf_end)
4919     {
4920       ASSURE_DESTINATION (safe_room);
4921       c = *charbuf++;
4922       /* Now encode the character C.  */
4923       if (ASCII_CHAR_P (c) && ascii_compatible)
4924 	EMIT_ONE_ASCII_BYTE (c);
4925       else if (CHAR_BYTE8_P (c))
4926 	{
4927 	  c = CHAR_TO_BYTE8 (c);
4928 	  EMIT_ONE_BYTE (c);
4929 	}
4930       else
4931 	{
4932 	  unsigned code;
4933 	  struct charset *charset;
4934 	  CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4935 			       &code, charset);
4936 
4937 	  if (!charset)
4938 	    {
4939 	      if (coding->mode & CODING_MODE_SAFE_ENCODING)
4940 		{
4941 		  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4942 		  charset = CHARSET_FROM_ID (charset_ascii);
4943 		}
4944 	      else
4945 		{
4946 		  c = coding->default_char;
4947 		  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4948 				       charset_list, &code, charset);
4949 		}
4950 	    }
4951 	  if (code == CHARSET_INVALID_CODE (charset))
4952 	    emacs_abort ();
4953 	  if (charset == charset_kanji)
4954 	    {
4955 	      int c1, c2;
4956 	      JIS_TO_SJIS (code);
4957 	      c1 = code >> 8, c2 = code & 0xFF;
4958 	      EMIT_TWO_BYTES (c1, c2);
4959 	    }
4960 	  else if (charset == charset_kana)
4961 	    EMIT_ONE_BYTE (code | 0x80);
4962 	  else if (charset_kanji2 && charset == charset_kanji2)
4963 	    {
4964 	      int c1, c2;
4965 
4966 	      c1 = code >> 8;
4967 	      if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4968 		  || c1 == 0x28
4969 		  || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4970 		{
4971 		  JIS_TO_SJIS2 (code);
4972 		  c1 = code >> 8, c2 = code & 0xFF;
4973 		  EMIT_TWO_BYTES (c1, c2);
4974 		}
4975 	      else
4976 		EMIT_ONE_ASCII_BYTE (code & 0x7F);
4977 	    }
4978 	  else
4979 	    EMIT_ONE_ASCII_BYTE (code & 0x7F);
4980 	}
4981     }
4982   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4983   coding->produced_char += produced_chars;
4984   coding->produced = dst - coding->destination;
4985   return 0;
4986 }
4987 
4988 static bool
encode_coding_big5(struct coding_system * coding)4989 encode_coding_big5 (struct coding_system *coding)
4990 {
4991   bool multibytep = coding->dst_multibyte;
4992   int *charbuf = coding->charbuf;
4993   int *charbuf_end = charbuf + coding->charbuf_used;
4994   unsigned char *dst = coding->destination + coding->produced;
4995   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4996   int safe_room = 4;
4997   ptrdiff_t produced_chars = 0;
4998   Lisp_Object attrs, charset_list, val;
4999   bool ascii_compatible;
5000   struct charset *charset_big5;
5001   int c;
5002 
5003   CODING_GET_INFO (coding, attrs, charset_list);
5004   val = XCDR (charset_list);
5005   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
5006   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5007 
5008   while (charbuf < charbuf_end)
5009     {
5010       ASSURE_DESTINATION (safe_room);
5011       c = *charbuf++;
5012       /* Now encode the character C.  */
5013       if (ASCII_CHAR_P (c) && ascii_compatible)
5014 	EMIT_ONE_ASCII_BYTE (c);
5015       else if (CHAR_BYTE8_P (c))
5016 	{
5017 	  c = CHAR_TO_BYTE8 (c);
5018 	  EMIT_ONE_BYTE (c);
5019 	}
5020       else
5021 	{
5022 	  unsigned code;
5023 	  struct charset *charset;
5024 	  CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5025 			       &code, charset);
5026 
5027 	  if (! charset)
5028 	    {
5029 	      if (coding->mode & CODING_MODE_SAFE_ENCODING)
5030 		{
5031 		  code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5032 		  charset = CHARSET_FROM_ID (charset_ascii);
5033 		}
5034 	      else
5035 		{
5036 		  c = coding->default_char;
5037 		  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5038 				       charset_list, &code, charset);
5039 		}
5040 	    }
5041 	  if (code == CHARSET_INVALID_CODE (charset))
5042 	    emacs_abort ();
5043 	  if (charset == charset_big5)
5044 	    {
5045 	      int c1, c2;
5046 
5047 	      c1 = code >> 8, c2 = code & 0xFF;
5048 	      EMIT_TWO_BYTES (c1, c2);
5049 	    }
5050 	  else
5051 	    EMIT_ONE_ASCII_BYTE (code & 0x7F);
5052 	}
5053     }
5054   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5055   coding->produced_char += produced_chars;
5056   coding->produced = dst - coding->destination;
5057   return 0;
5058 }
5059 
5060 
/*** 9. CCL handlers ***/
5062 
5063 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5064    Return true if a text is encoded in a coding system of which
5065    encoder/decoder are written in CCL program.  */
5066 
5067 static bool
detect_coding_ccl(struct coding_system * coding,struct coding_detection_info * detect_info)5068 detect_coding_ccl (struct coding_system *coding,
5069 		   struct coding_detection_info *detect_info)
5070 {
5071   const unsigned char *src = coding->source, *src_base;
5072   const unsigned char *src_end = coding->source + coding->src_bytes;
5073   bool multibytep = coding->src_multibyte;
5074   ptrdiff_t consumed_chars = 0;
5075   int found = 0;
5076   unsigned char *valids;
5077   ptrdiff_t head_ascii = coding->head_ascii;
5078   Lisp_Object attrs;
5079 
5080   detect_info->checked |= CATEGORY_MASK_CCL;
5081 
5082   coding = &coding_categories[coding_category_ccl];
5083   valids = CODING_CCL_VALIDS (coding);
5084   attrs = CODING_ID_ATTRS (coding->id);
5085   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5086     src += head_ascii;
5087 
5088   while (1)
5089     {
5090       int c;
5091 
5092       src_base = src;
5093       ONE_MORE_BYTE (c);
5094       if (c < 0 || ! valids[c])
5095 	break;
5096       if ((valids[c] > 1))
5097 	found = CATEGORY_MASK_CCL;
5098     }
5099   detect_info->rejected |= CATEGORY_MASK_CCL;
5100   return 0;
5101 
5102  no_more_source:
5103   detect_info->found |= found;
5104   return 1;
5105 }
5106 
5107 static void
decode_coding_ccl(struct coding_system * coding)5108 decode_coding_ccl (struct coding_system *coding)
5109 {
5110   const unsigned char *src = coding->source + coding->consumed;
5111   const unsigned char *src_end = coding->source + coding->src_bytes;
5112   int *charbuf = coding->charbuf + coding->charbuf_used;
5113   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5114   ptrdiff_t consumed_chars = 0;
5115   bool multibytep = coding->src_multibyte;
5116   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5117   int source_charbuf[1024];
5118   int source_byteidx[1025];
5119   Lisp_Object attrs, charset_list;
5120 
5121   CODING_GET_INFO (coding, attrs, charset_list);
5122 
5123   while (1)
5124     {
5125       const unsigned char *p = src;
5126       ptrdiff_t offset;
5127       int i = 0;
5128 
5129       if (multibytep)
5130 	{
5131 	  while (i < 1024 && p < src_end)
5132 	    {
5133 	      source_byteidx[i] = p - src;
5134 	      source_charbuf[i++] = string_char_advance (&p);
5135 	    }
5136 	  source_byteidx[i] = p - src;
5137 	}
5138       else
5139 	while (i < 1024 && p < src_end)
5140 	  source_charbuf[i++] = *p++;
5141 
5142       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5143 	ccl->last_block = true;
5144       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5145       charset_map_loaded = 0;
5146       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5147 		  charset_list);
5148       if (charset_map_loaded
5149 	  && (offset = coding_change_source (coding)))
5150 	{
5151 	  p += offset;
5152 	  src += offset;
5153 	  src_end += offset;
5154 	}
5155       charbuf += ccl->produced;
5156       if (multibytep)
5157 	src += source_byteidx[ccl->consumed];
5158       else
5159 	src += ccl->consumed;
5160       consumed_chars += ccl->consumed;
5161       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5162 	break;
5163     }
5164 
5165   switch (ccl->status)
5166     {
5167     case CCL_STAT_SUSPEND_BY_SRC:
5168       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5169       break;
5170     case CCL_STAT_SUSPEND_BY_DST:
5171       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5172       break;
5173     case CCL_STAT_QUIT:
5174     case CCL_STAT_INVALID_CMD:
5175       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5176       break;
5177     default:
5178       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5179       break;
5180     }
5181   coding->consumed_char += consumed_chars;
5182   coding->consumed = src - coding->source;
5183   coding->charbuf_used = charbuf - coding->charbuf;
5184 }
5185 
5186 static bool
encode_coding_ccl(struct coding_system * coding)5187 encode_coding_ccl (struct coding_system *coding)
5188 {
5189   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5190   bool multibytep = coding->dst_multibyte;
5191   int *charbuf = coding->charbuf;
5192   int *charbuf_end = charbuf + coding->charbuf_used;
5193   unsigned char *dst = coding->destination + coding->produced;
5194   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5195   int destination_charbuf[1024];
5196   ptrdiff_t produced_chars = 0;
5197   int i;
5198   Lisp_Object attrs, charset_list;
5199 
5200   CODING_GET_INFO (coding, attrs, charset_list);
5201   if (coding->consumed_char == coding->src_chars
5202       && coding->mode & CODING_MODE_LAST_BLOCK)
5203     ccl->last_block = true;
5204 
5205   do
5206     {
5207       ptrdiff_t offset;
5208 
5209       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5210       charset_map_loaded = 0;
5211       ccl_driver (ccl, charbuf, destination_charbuf,
5212 		  charbuf_end - charbuf, 1024, charset_list);
5213       if (charset_map_loaded
5214 	  && (offset = coding_change_destination (coding)))
5215 	dst += offset;
5216       if (multibytep)
5217 	{
5218 	  ASSURE_DESTINATION (ccl->produced * 2);
5219 	  for (i = 0; i < ccl->produced; i++)
5220 	    EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5221 	}
5222       else
5223 	{
5224 	  ASSURE_DESTINATION (ccl->produced);
5225 	  for (i = 0; i < ccl->produced; i++)
5226 	    *dst++ = destination_charbuf[i] & 0xFF;
5227 	  produced_chars += ccl->produced;
5228 	}
5229       charbuf += ccl->consumed;
5230       if (ccl->status == CCL_STAT_QUIT
5231 	  || ccl->status == CCL_STAT_INVALID_CMD)
5232 	break;
5233     }
5234   while (charbuf < charbuf_end);
5235 
5236   switch (ccl->status)
5237     {
5238     case CCL_STAT_SUSPEND_BY_SRC:
5239       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5240       break;
5241     case CCL_STAT_SUSPEND_BY_DST:
5242       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5243       break;
5244     case CCL_STAT_QUIT:
5245     case CCL_STAT_INVALID_CMD:
5246       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5247       break;
5248     default:
5249       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5250       break;
5251     }
5252 
5253   coding->produced_char += produced_chars;
5254   coding->produced = dst - coding->destination;
5255   return 0;
5256 }
5257 
5258 
5259 /*** 10, 11. no-conversion handlers ***/
5260 
5261 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5262 
5263 static void
decode_coding_raw_text(struct coding_system * coding)5264 decode_coding_raw_text (struct coding_system *coding)
5265 {
5266   bool eol_dos
5267     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5268 
5269   coding->chars_at_source = 1;
5270   coding->consumed_char = coding->src_chars;
5271   coding->consumed = coding->src_bytes;
5272   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5273     {
5274       coding->consumed_char--;
5275       coding->consumed--;
5276       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5277     }
5278   else
5279     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5280 }
5281 
5282 static bool
encode_coding_raw_text(struct coding_system * coding)5283 encode_coding_raw_text (struct coding_system *coding)
5284 {
5285   bool multibytep = coding->dst_multibyte;
5286   int *charbuf = coding->charbuf;
5287   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5288   unsigned char *dst = coding->destination + coding->produced;
5289   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5290   ptrdiff_t produced_chars = 0;
5291   int c;
5292 
5293   if (multibytep)
5294     {
5295       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5296 
5297       if (coding->src_multibyte)
5298 	while (charbuf < charbuf_end)
5299 	  {
5300 	    ASSURE_DESTINATION (safe_room);
5301 	    c = *charbuf++;
5302 	    if (ASCII_CHAR_P (c))
5303 	      EMIT_ONE_ASCII_BYTE (c);
5304 	    else if (CHAR_BYTE8_P (c))
5305 	      {
5306 		c = CHAR_TO_BYTE8 (c);
5307 		EMIT_ONE_BYTE (c);
5308 	      }
5309 	    else
5310 	      {
5311 		unsigned char str[MAX_MULTIBYTE_LENGTH];
5312 		int len = CHAR_STRING (c, str);
5313 		for (int i = 0; i < len; i++)
5314 		  EMIT_ONE_BYTE (str[i]);
5315 	      }
5316 	  }
5317       else
5318 	while (charbuf < charbuf_end)
5319 	  {
5320 	    ASSURE_DESTINATION (safe_room);
5321 	    c = *charbuf++;
5322 	    EMIT_ONE_BYTE (c);
5323 	  }
5324     }
5325   else
5326     {
5327       if (coding->src_multibyte)
5328 	{
5329 	  int safe_room = MAX_MULTIBYTE_LENGTH;
5330 
5331 	  while (charbuf < charbuf_end)
5332 	    {
5333 	      ASSURE_DESTINATION (safe_room);
5334 	      c = *charbuf++;
5335 	      if (ASCII_CHAR_P (c))
5336 		*dst++ = c;
5337 	      else if (CHAR_BYTE8_P (c))
5338 		*dst++ = CHAR_TO_BYTE8 (c);
5339 	      else
5340 		dst += CHAR_STRING (c, dst);
5341 	    }
5342 	}
5343       else
5344 	{
5345 	  ASSURE_DESTINATION (charbuf_end - charbuf);
5346 	  while (charbuf < charbuf_end && dst < dst_end)
5347 	    *dst++ = *charbuf++;
5348 	}
5349       produced_chars = dst - (coding->destination + coding->produced);
5350     }
5351   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5352   coding->produced_char += produced_chars;
5353   coding->produced = dst - coding->destination;
5354   return 0;
5355 }
5356 
5357 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5358    Return true if a text is encoded in a charset-based coding system.  */
5359 
5360 static bool
detect_coding_charset(struct coding_system * coding,struct coding_detection_info * detect_info)5361 detect_coding_charset (struct coding_system *coding,
5362 		       struct coding_detection_info *detect_info)
5363 {
5364   const unsigned char *src = coding->source, *src_base;
5365   const unsigned char *src_end = coding->source + coding->src_bytes;
5366   bool multibytep = coding->src_multibyte;
5367   ptrdiff_t consumed_chars = 0;
5368   Lisp_Object attrs, valids, name;
5369   int found = 0;
5370   ptrdiff_t head_ascii = coding->head_ascii;
5371   bool check_latin_extra = 0;
5372 
5373   detect_info->checked |= CATEGORY_MASK_CHARSET;
5374 
5375   coding = &coding_categories[coding_category_charset];
5376   attrs = CODING_ID_ATTRS (coding->id);
5377   valids = AREF (attrs, coding_attr_charset_valids);
5378   name = CODING_ID_NAME (coding->id);
5379   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5380 	       "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5381       || strncmp (SSDATA (SYMBOL_NAME (name)),
5382 		  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5383     check_latin_extra = 1;
5384 
5385   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5386     src += head_ascii;
5387 
5388   while (1)
5389     {
5390       int c;
5391       Lisp_Object val;
5392       struct charset *charset;
5393       int dim, idx;
5394 
5395       src_base = src;
5396       ONE_MORE_BYTE (c);
5397       if (c < 0)
5398 	continue;
5399       val = AREF (valids, c);
5400       if (NILP (val))
5401 	break;
5402       if (c >= 0x80)
5403 	{
5404 	  if (c < 0xA0
5405 	      && check_latin_extra
5406 	      && (!VECTORP (Vlatin_extra_code_table)
5407 		  || NILP (AREF (Vlatin_extra_code_table, c))))
5408 	    break;
5409 	  found = CATEGORY_MASK_CHARSET;
5410 	}
5411       if (FIXNUMP (val))
5412 	{
5413 	  charset = CHARSET_FROM_ID (XFIXNAT (val));
5414 	  dim = CHARSET_DIMENSION (charset);
5415 	  for (idx = 1; idx < dim; idx++)
5416 	    {
5417 	      if (src == src_end)
5418 		goto too_short;
5419 	      ONE_MORE_BYTE (c);
5420 	      if (c < charset->code_space[(dim - 1 - idx) * 4]
5421 		  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5422 		break;
5423 	    }
5424 	  if (idx < dim)
5425 	    break;
5426 	}
5427       else
5428 	{
5429 	  idx = 1;
5430 	  for (; CONSP (val); val = XCDR (val))
5431 	    {
5432 	      charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
5433 	      dim = CHARSET_DIMENSION (charset);
5434 	      while (idx < dim)
5435 		{
5436 		  if (src == src_end)
5437 		    goto too_short;
5438 		  ONE_MORE_BYTE (c);
5439 		  if (c < charset->code_space[(dim - 1 - idx) * 4]
5440 		      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5441 		    break;
5442 		  idx++;
5443 		}
5444 	      if (idx == dim)
5445 		{
5446 		  val = Qnil;
5447 		  break;
5448 		}
5449 	    }
5450 	  if (CONSP (val))
5451 	    break;
5452 	}
5453     }
5454  too_short:
5455   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5456   return 0;
5457 
5458  no_more_source:
5459   detect_info->found |= found;
5460   return 1;
5461 }
5462 
5463 static void
decode_coding_charset(struct coding_system * coding)5464 decode_coding_charset (struct coding_system *coding)
5465 {
5466   const unsigned char *src = coding->source + coding->consumed;
5467   const unsigned char *src_end = coding->source + coding->src_bytes;
5468   const unsigned char *src_base;
5469   int *charbuf = coding->charbuf + coding->charbuf_used;
5470   /* We may produce one charset annotation in one loop and one more at
5471      the end.  */
5472   int *charbuf_end
5473     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5474   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5475   bool multibytep = coding->src_multibyte;
5476   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5477   Lisp_Object valids;
5478   ptrdiff_t char_offset = coding->produced_char;
5479   ptrdiff_t last_offset = char_offset;
5480   int last_id = charset_ascii;
5481   bool eol_dos
5482     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5483   int byte_after_cr = -1;
5484 
5485   valids = AREF (attrs, coding_attr_charset_valids);
5486 
5487   while (1)
5488     {
5489       int c;
5490       Lisp_Object val;
5491       struct charset *charset;
5492       int dim;
5493       int len = 1;
5494       unsigned code;
5495 
5496       src_base = src;
5497       consumed_chars_base = consumed_chars;
5498 
5499       if (charbuf >= charbuf_end)
5500 	{
5501 	  if (byte_after_cr >= 0)
5502 	    src_base--;
5503 	  break;
5504 	}
5505 
5506       if (byte_after_cr >= 0)
5507 	{
5508 	  c = byte_after_cr;
5509 	  byte_after_cr = -1;
5510 	}
5511       else
5512 	{
5513 	  ONE_MORE_BYTE (c);
5514 	  if (eol_dos && c == '\r')
5515 	    ONE_MORE_BYTE (byte_after_cr);
5516 	}
5517       if (c < 0)
5518 	goto invalid_code;
5519       code = c;
5520 
5521       val = AREF (valids, c);
5522       if (! FIXNUMP (val) && ! CONSP (val))
5523 	goto invalid_code;
5524       if (FIXNUMP (val))
5525 	{
5526 	  charset = CHARSET_FROM_ID (XFIXNAT (val));
5527 	  dim = CHARSET_DIMENSION (charset);
5528 	  while (len < dim)
5529 	    {
5530 	      ONE_MORE_BYTE (c);
5531 	      code = (code << 8) | c;
5532 	      len++;
5533 	    }
5534 	  CODING_DECODE_CHAR (coding, src, src_base, src_end,
5535 			      charset, code, c);
5536 	}
5537       else
5538 	{
5539 	  /* VAL is a list of charset IDs.  It is assured that the
5540 	     list is sorted by charset dimensions (smaller one
5541 	     comes first).  */
5542 	  while (CONSP (val))
5543 	    {
5544 	      charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
5545 	      dim = CHARSET_DIMENSION (charset);
5546 	      while (len < dim)
5547 		{
5548 		  ONE_MORE_BYTE (c);
5549 		  code = (code << 8) | c;
5550 		  len++;
5551 		}
5552 	      CODING_DECODE_CHAR (coding, src, src_base,
5553 				  src_end, charset, code, c);
5554 	      if (c >= 0)
5555 		break;
5556 	      val = XCDR (val);
5557 	    }
5558 	}
5559       if (c < 0)
5560 	goto invalid_code;
5561       if (charset->id != charset_ascii
5562 	  && last_id != charset->id)
5563 	{
5564 	  if (last_id != charset_ascii)
5565 	    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5566 	  last_id = charset->id;
5567 	  last_offset = char_offset;
5568 	}
5569 
5570       *charbuf++ = c;
5571       char_offset++;
5572       continue;
5573 
5574     invalid_code:
5575       src = src_base;
5576       consumed_chars = consumed_chars_base;
5577       ONE_MORE_BYTE (c);
5578       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5579       char_offset++;
5580     }
5581 
5582  no_more_source:
5583   if (last_id != charset_ascii)
5584     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5585   coding->consumed_char += consumed_chars_base;
5586   coding->consumed = src_base - coding->source;
5587   coding->charbuf_used = charbuf - coding->charbuf;
5588 }
5589 
5590 static bool
encode_coding_charset(struct coding_system * coding)5591 encode_coding_charset (struct coding_system *coding)
5592 {
5593   bool multibytep = coding->dst_multibyte;
5594   int *charbuf = coding->charbuf;
5595   int *charbuf_end = charbuf + coding->charbuf_used;
5596   unsigned char *dst = coding->destination + coding->produced;
5597   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5598   int safe_room = MAX_MULTIBYTE_LENGTH;
5599   ptrdiff_t produced_chars = 0;
5600   Lisp_Object attrs, charset_list;
5601   bool ascii_compatible;
5602   int c;
5603 
5604   CODING_GET_INFO (coding, attrs, charset_list);
5605   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5606 
5607   while (charbuf < charbuf_end)
5608     {
5609       struct charset *charset;
5610       unsigned code;
5611 
5612       ASSURE_DESTINATION (safe_room);
5613       c = *charbuf++;
5614       if (ascii_compatible && ASCII_CHAR_P (c))
5615 	EMIT_ONE_ASCII_BYTE (c);
5616       else if (CHAR_BYTE8_P (c))
5617 	{
5618 	  c = CHAR_TO_BYTE8 (c);
5619 	  EMIT_ONE_BYTE (c);
5620 	}
5621       else
5622 	{
5623 	  CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5624 			       &code, charset);
5625 
5626 	  if (charset)
5627 	    {
5628 	      if (CHARSET_DIMENSION (charset) == 1)
5629 		EMIT_ONE_BYTE (code);
5630 	      else if (CHARSET_DIMENSION (charset) == 2)
5631 		EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5632 	      else if (CHARSET_DIMENSION (charset) == 3)
5633 		EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5634 	      else
5635 		EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5636 				 (code >> 8) & 0xFF, code & 0xFF);
5637 	    }
5638 	  else
5639 	    {
5640 	      if (coding->mode & CODING_MODE_SAFE_ENCODING)
5641 		c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5642 	      else
5643 		c = coding->default_char;
5644 	      EMIT_ONE_BYTE (c);
5645 	    }
5646 	}
5647     }
5648 
5649   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5650   coding->produced_char += produced_chars;
5651   coding->produced = dst - coding->destination;
5652   return 0;
5653 }
5654 
5655 
/*** 10. C library functions ***/
5657 
5658 /* Setup coding context CODING from information about CODING_SYSTEM.
5659    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5660    CODING_SYSTEM is invalid, signal an error.  */
5661 
5662 void
setup_coding_system(Lisp_Object coding_system,struct coding_system * coding)5663 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5664 {
5665   Lisp_Object attrs;
5666   Lisp_Object eol_type;
5667   Lisp_Object coding_type;
5668   Lisp_Object val;
5669 
5670   if (NILP (coding_system))
5671     coding_system = Qundecided;
5672 
5673   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5674 
5675   attrs = CODING_ID_ATTRS (coding->id);
5676   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5677 
5678   coding->mode = 0;
5679   if (VECTORP (eol_type))
5680     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5681 			    | CODING_REQUIRE_DETECTION_MASK);
5682   else if (! EQ (eol_type, Qunix))
5683     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5684 			    | CODING_REQUIRE_ENCODING_MASK);
5685   else
5686     coding->common_flags = 0;
5687   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5688     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5689   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5690     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5691   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5692     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5693 
5694   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5695   coding->max_charset_id = SCHARS (val) - 1;
5696   coding->safe_charsets = SDATA (val);
5697   coding->default_char = XFIXNUM (CODING_ATTR_DEFAULT_CHAR (attrs));
5698   coding->carryover_bytes = 0;
5699   coding->raw_destination = 0;
5700 
5701   coding_type = CODING_ATTR_TYPE (attrs);
5702   if (EQ (coding_type, Qundecided))
5703     {
5704       coding->detector = NULL;
5705       coding->decoder = decode_coding_raw_text;
5706       coding->encoder = encode_coding_raw_text;
5707       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5708       coding->spec.undecided.inhibit_nbd
5709 	= (encode_inhibit_flag
5710 	   (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5711       coding->spec.undecided.inhibit_ied
5712 	= (encode_inhibit_flag
5713 	   (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5714       coding->spec.undecided.prefer_utf_8
5715 	= ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5716     }
5717   else if (EQ (coding_type, Qiso_2022))
5718     {
5719       int i;
5720       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
5721 
5722       /* Invoke graphic register 0 to plane 0.  */
5723       CODING_ISO_INVOCATION (coding, 0) = 0;
5724       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5725       CODING_ISO_INVOCATION (coding, 1)
5726 	= (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5727       /* Setup the initial status of designation.  */
5728       for (i = 0; i < 4; i++)
5729 	CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5730       /* Not single shifting initially.  */
5731       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5732       /* Beginning of buffer should also be regarded as bol. */
5733       CODING_ISO_BOL (coding) = 1;
5734       coding->detector = detect_coding_iso_2022;
5735       coding->decoder = decode_coding_iso_2022;
5736       coding->encoder = encode_coding_iso_2022;
5737       if (flags & CODING_ISO_FLAG_SAFE)
5738 	coding->mode |= CODING_MODE_SAFE_ENCODING;
5739       coding->common_flags
5740 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5741 	    | CODING_REQUIRE_FLUSHING_MASK);
5742       if (flags & CODING_ISO_FLAG_COMPOSITION)
5743 	coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5744       if (flags & CODING_ISO_FLAG_DESIGNATION)
5745 	coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5746       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5747 	{
5748 	  setup_iso_safe_charsets (attrs);
5749 	  val = CODING_ATTR_SAFE_CHARSETS (attrs);
5750 	  coding->max_charset_id = SCHARS (val) - 1;
5751 	  coding->safe_charsets = SDATA (val);
5752 	}
5753       CODING_ISO_FLAGS (coding) = flags;
5754       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5755       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5756       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5757       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5758     }
5759   else if (EQ (coding_type, Qcharset))
5760     {
5761       coding->detector = detect_coding_charset;
5762       coding->decoder = decode_coding_charset;
5763       coding->encoder = encode_coding_charset;
5764       coding->common_flags
5765 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5766     }
5767   else if (EQ (coding_type, Qutf_8))
5768     {
5769       val = AREF (attrs, coding_attr_utf_bom);
5770       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5771 				   : EQ (val, Qt) ? utf_with_bom
5772 				   : utf_without_bom);
5773       coding->detector = detect_coding_utf_8;
5774       coding->decoder = decode_coding_utf_8;
5775       coding->encoder = encode_coding_utf_8;
5776       coding->common_flags
5777 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5778       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5779 	coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5780     }
5781   else if (EQ (coding_type, Qutf_16))
5782     {
5783       val = AREF (attrs, coding_attr_utf_bom);
5784       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5785 				    : EQ (val, Qt) ? utf_with_bom
5786 				    : utf_without_bom);
5787       val = AREF (attrs, coding_attr_utf_16_endian);
5788       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5789 				       : utf_16_little_endian);
5790       CODING_UTF_16_SURROGATE (coding) = 0;
5791       coding->detector = detect_coding_utf_16;
5792       coding->decoder = decode_coding_utf_16;
5793       coding->encoder = encode_coding_utf_16;
5794       coding->common_flags
5795 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5796       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5797 	coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5798     }
5799   else if (EQ (coding_type, Qccl))
5800     {
5801       coding->detector = detect_coding_ccl;
5802       coding->decoder = decode_coding_ccl;
5803       coding->encoder = encode_coding_ccl;
5804       coding->common_flags
5805 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5806 	    | CODING_REQUIRE_FLUSHING_MASK);
5807     }
5808   else if (EQ (coding_type, Qemacs_mule))
5809     {
5810       coding->detector = detect_coding_emacs_mule;
5811       coding->decoder = decode_coding_emacs_mule;
5812       coding->encoder = encode_coding_emacs_mule;
5813       coding->common_flags
5814 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5815       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5816 	  && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5817 	{
5818 	  Lisp_Object tail, safe_charsets;
5819 	  int max_charset_id = 0;
5820 
5821 	  for (tail = Vemacs_mule_charset_list; CONSP (tail);
5822 	       tail = XCDR (tail))
5823 	    if (max_charset_id < XFIXNAT (XCAR (tail)))
5824 	      max_charset_id = XFIXNAT (XCAR (tail));
5825 	  safe_charsets = make_uninit_string (max_charset_id + 1);
5826 	  memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5827 	  for (tail = Vemacs_mule_charset_list; CONSP (tail);
5828 	       tail = XCDR (tail))
5829 	    SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
5830 	  coding->max_charset_id = max_charset_id;
5831 	  coding->safe_charsets = SDATA (safe_charsets);
5832 	}
5833       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5834       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5835     }
5836   else if (EQ (coding_type, Qshift_jis))
5837     {
5838       coding->detector = detect_coding_sjis;
5839       coding->decoder = decode_coding_sjis;
5840       coding->encoder = encode_coding_sjis;
5841       coding->common_flags
5842 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5843     }
5844   else if (EQ (coding_type, Qbig5))
5845     {
5846       coding->detector = detect_coding_big5;
5847       coding->decoder = decode_coding_big5;
5848       coding->encoder = encode_coding_big5;
5849       coding->common_flags
5850 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851     }
5852   else				/* EQ (coding_type, Qraw_text) */
5853     {
5854       coding->detector = NULL;
5855       coding->decoder = decode_coding_raw_text;
5856       coding->encoder = encode_coding_raw_text;
5857       if (! EQ (eol_type, Qunix))
5858 	{
5859 	  coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5860 	  if (! VECTORP (eol_type))
5861 	    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5862 	}
5863 
5864     }
5865 
5866   return;
5867 }
5868 
5869 /* Return a list of charsets supported by CODING.  */
5870 
5871 Lisp_Object
coding_charset_list(struct coding_system * coding)5872 coding_charset_list (struct coding_system *coding)
5873 {
5874   Lisp_Object attrs, charset_list;
5875 
5876   CODING_GET_INFO (coding, attrs, charset_list);
5877   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5878     {
5879       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
5880 
5881       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5882 	charset_list = Viso_2022_charset_list;
5883     }
5884   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5885     {
5886       charset_list = Vemacs_mule_charset_list;
5887     }
5888   return charset_list;
5889 }
5890 
5891 
5892 /* Return a list of charsets supported by CODING-SYSTEM.  */
5893 
5894 Lisp_Object
coding_system_charset_list(Lisp_Object coding_system)5895 coding_system_charset_list (Lisp_Object coding_system)
5896 {
5897   ptrdiff_t id;
5898   Lisp_Object attrs, charset_list;
5899 
5900   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5901   attrs = CODING_ID_ATTRS (id);
5902 
5903   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5904     {
5905       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
5906 
5907       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5908 	charset_list = Viso_2022_charset_list;
5909       else
5910 	charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5911     }
5912   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5913     {
5914       charset_list = Vemacs_mule_charset_list;
5915     }
5916   else
5917     {
5918       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5919     }
5920   return charset_list;
5921 }
5922 
5923 
5924 /* Return raw-text or one of its subsidiaries that has the same
5925    eol_type as CODING-SYSTEM.  */
5926 
5927 Lisp_Object
raw_text_coding_system(Lisp_Object coding_system)5928 raw_text_coding_system (Lisp_Object coding_system)
5929 {
5930   Lisp_Object spec, attrs;
5931   Lisp_Object eol_type, raw_text_eol_type;
5932 
5933   if (NILP (coding_system))
5934     return Qraw_text;
5935   spec = CODING_SYSTEM_SPEC (coding_system);
5936   attrs = AREF (spec, 0);
5937 
5938   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5939     return coding_system;
5940 
5941   eol_type = AREF (spec, 2);
5942   if (VECTORP (eol_type))
5943     return Qraw_text;
5944   spec = CODING_SYSTEM_SPEC (Qraw_text);
5945   raw_text_eol_type = AREF (spec, 2);
5946   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5947 	  : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5948 	  : AREF (raw_text_eol_type, 2));
5949 }
5950 
5951 /* Return true if CODING corresponds to raw-text coding-system.  */
5952 
5953 bool
raw_text_coding_system_p(struct coding_system * coding)5954 raw_text_coding_system_p (struct coding_system *coding)
5955 {
5956   return (coding->decoder == decode_coding_raw_text
5957 	  && coding->encoder == encode_coding_raw_text) ? true : false;
5958 }
5959 
5960 
5961 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5962    the subsidiary that has the same eol-spec as PARENT (if it is not
5963    nil and specifies end-of-line format) or the system's setting.  */
5964 
5965 Lisp_Object
coding_inherit_eol_type(Lisp_Object coding_system,Lisp_Object parent)5966 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5967 {
5968   Lisp_Object spec, eol_type;
5969 
5970   if (NILP (coding_system))
5971     coding_system = Qraw_text;
5972   else
5973     CHECK_CODING_SYSTEM (coding_system);
5974   spec = CODING_SYSTEM_SPEC (coding_system);
5975   eol_type = AREF (spec, 2);
5976   if (VECTORP (eol_type))
5977     {
5978       /* Format of end-of-line decided by system.
5979 	 This is Qunix on Unix and Mac, Qdos on DOS/Windows.
5980 	 This has an effect only for external encoding (i.e., for output to
5981 	 file and process), not for in-buffer or Lisp string encoding.  */
5982       Lisp_Object system_eol_type = Qunix;
5983       #ifdef DOS_NT
5984        system_eol_type = Qdos;
5985       #endif
5986 
5987       Lisp_Object parent_eol_type = system_eol_type;
5988       if (! NILP (parent))
5989 	{
5990 	  CHECK_CODING_SYSTEM (parent);
5991 	  Lisp_Object parent_spec = CODING_SYSTEM_SPEC (parent);
5992 	  Lisp_Object pspec_type = AREF (parent_spec, 2);
5993 	  if (!VECTORP (pspec_type))
5994 	    parent_eol_type = pspec_type;
5995 	}
5996       if (EQ (parent_eol_type, Qunix))
5997 	coding_system = AREF (eol_type, 0);
5998       else if (EQ (parent_eol_type, Qdos))
5999 	coding_system = AREF (eol_type, 1);
6000       else if (EQ (parent_eol_type, Qmac))
6001 	coding_system = AREF (eol_type, 2);
6002     }
6003   return coding_system;
6004 }
6005 
6006 
6007 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6008    decided for writing to a process.  If not, complement them, and
6009    return a new coding system.  */
6010 
6011 Lisp_Object
complement_process_encoding_system(Lisp_Object coding_system)6012 complement_process_encoding_system (Lisp_Object coding_system)
6013 {
6014   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6015   Lisp_Object spec, attrs;
6016   int i;
6017 
6018   for (i = 0; i < 3; i++)
6019     {
6020       if (i == 1)
6021 	coding_system = CDR_SAFE (Vdefault_process_coding_system);
6022       else if (i == 2)
6023 	coding_system = preferred_coding_system ();
6024       spec = CODING_SYSTEM_SPEC (coding_system);
6025       if (NILP (spec))
6026 	continue;
6027       attrs = AREF (spec, 0);
6028       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6029 	coding_base = CODING_ATTR_BASE_NAME (attrs);
6030       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6031 	eol_base = coding_system;
6032       if (! NILP (coding_base) && ! NILP (eol_base))
6033 	break;
6034     }
6035 
6036   if (i > 0)
6037     /* The original CODING_SYSTEM didn't specify text-conversion or
6038        eol-conversion.  Be sure that we return a fully complemented
6039        coding system.  */
6040     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6041   return coding_system;
6042 }
6043 
6044 
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
6050 
6051    o coding-category-emacs-mule
6052 
6053    	The category for a coding system which has the same code range
6054 	as Emacs' internal format.  Assigned the coding-system (Lisp
6055 	symbol) `emacs-mule' by default.
6056 
6057    o coding-category-sjis
6058 
6059 	The category for a coding system which has the same code range
6060 	as SJIS.  Assigned the coding-system (Lisp
6061 	symbol) `japanese-shift-jis' by default.
6062 
6063    o coding-category-iso-7
6064 
6065    	The category for a coding system which has the same code range
6066 	as ISO2022 of 7-bit environment.  This doesn't use any locking
6067 	shift and single shift functions.  This can encode/decode all
6068 	charsets.  Assigned the coding-system (Lisp symbol)
6069 	`iso-2022-7bit' by default.
6070 
6071    o coding-category-iso-7-tight
6072 
6073 	Same as coding-category-iso-7 except that this can
6074 	encode/decode only the specified charsets.
6075 
6076    o coding-category-iso-8-1
6077 
6078    	The category for a coding system which has the same code range
6079 	as ISO2022 of 8-bit environment and graphic plane 1 used only
6080 	for DIMENSION1 charset.  This doesn't use any locking shift
6081 	and single shift functions.  Assigned the coding-system (Lisp
6082 	symbol) `iso-latin-1' by default.
6083 
6084    o coding-category-iso-8-2
6085 
6086    	The category for a coding system which has the same code range
6087 	as ISO2022 of 8-bit environment and graphic plane 1 used only
6088 	for DIMENSION2 charset.  This doesn't use any locking shift
6089 	and single shift functions.  Assigned the coding-system (Lisp
6090 	symbol) `japanese-iso-8bit' by default.
6091 
6092    o coding-category-iso-7-else
6093 
6094    	The category for a coding system which has the same code range
6095 	as ISO2022 of 7-bit environment but uses locking shift or
6096 	single shift functions.  Assigned the coding-system (Lisp
6097 	symbol) `iso-2022-7bit-lock' by default.
6098 
6099    o coding-category-iso-8-else
6100 
6101    	The category for a coding system which has the same code range
6102 	as ISO2022 of 8-bit environment but uses locking shift or
6103 	single shift functions.  Assigned the coding-system (Lisp
6104 	symbol) `iso-2022-8bit-ss2' by default.
6105 
6106    o coding-category-big5
6107 
6108    	The category for a coding system which has the same code range
6109 	as BIG5.  Assigned the coding-system (Lisp symbol)
6110 	`cn-big5' by default.
6111 
6112    o coding-category-utf-8
6113 
6114 	The category for a coding system which has the same code range
6115 	as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6116 	symbol) `utf-8' by default.
6117 
6118    o coding-category-utf-16-be
6119 
6120 	The category for a coding system in which a text has an
6121 	Unicode signature (cf. Unicode Standard) in the order of BIG
6122 	endian at the head.  Assigned the coding-system (Lisp symbol)
6123 	`utf-16-be' by default.
6124 
6125    o coding-category-utf-16-le
6126 
6127 	The category for a coding system in which a text has an
6128 	Unicode signature (cf. Unicode Standard) in the order of
6129 	LITTLE endian at the head.  Assigned the coding-system (Lisp
6130 	symbol) `utf-16-le' by default.
6131 
6132    o coding-category-ccl
6133 
6134 	The category for a coding system of which encoder/decoder is
6135 	written in CCL programs.  The default value is nil, i.e., no
6136 	coding system is assigned.
6137 
6138    o coding-category-binary
6139 
6140    	The category for a coding system not categorized in any of the
6141 	above.  Assigned the coding-system (Lisp symbol)
6142 	`no-conversion' by default.
6143 
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.
   What Emacs actually does is detect a category of coding system,
   and then use the `coding-system' assigned to that category.  If
   Emacs can't narrow the candidates down to a single category, it
   selects the category of the highest priority.  Priorities of
   categories are also specified by a user in the Lisp variable
   `coding-category-list'.
6151 
6152 */
6153 
6154 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6155 					   int eol_seen);
6156 
6157 
6158 /* Return the number of ASCII characters at the head of the source.
6159    By side effects, set coding->head_ascii and update
6160    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6161    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6162    reliable only when all the source bytes are ASCII.  */
6163 
6164 static ptrdiff_t
check_ascii(struct coding_system * coding)6165 check_ascii (struct coding_system *coding)
6166 {
6167   const unsigned char *src, *end;
6168   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6169   int eol_seen = coding->eol_seen;
6170 
6171   coding_set_source (coding);
6172   src = coding->source;
6173   end = src + coding->src_bytes;
6174 
6175   if (inhibit_eol_conversion
6176       || SYMBOLP (eol_type))
6177     {
6178       /* We don't have to check EOL format.  */
6179       while (src < end && !( *src & 0x80))
6180 	{
6181 	  if (*src++ == '\n')
6182 	    eol_seen |= EOL_SEEN_LF;
6183 	}
6184     }
6185   else
6186     {
6187       end--;		    /* We look ahead one byte for "CR LF".  */
6188       while (src < end)
6189 	{
6190 	  int c = *src;
6191 
6192 	  if (c & 0x80)
6193 	    break;
6194 	  src++;
6195 	  if (c == '\r')
6196 	    {
6197 	      if (*src == '\n')
6198 		{
6199 		  eol_seen |= EOL_SEEN_CRLF;
6200 		  src++;
6201 		}
6202 	      else
6203 		eol_seen |= EOL_SEEN_CR;
6204 	    }
6205 	  else if (c == '\n')
6206 	    eol_seen |= EOL_SEEN_LF;
6207 	}
6208       if (src == end)
6209 	{
6210 	  int c = *src;
6211 
6212 	  /* All bytes but the last one C are ASCII.  */
6213 	  if (! (c & 0x80))
6214 	    {
6215 	      if (c == '\r')
6216 		eol_seen |= EOL_SEEN_CR;
6217 	      else if (c  == '\n')
6218 		eol_seen |= EOL_SEEN_LF;
6219 	      src++;
6220 	    }
6221 	}
6222     }
6223   coding->head_ascii = src - coding->source;
6224   coding->eol_seen = eol_seen;
6225   return (coding->head_ascii);
6226 }
6227 
6228 
6229 /* Return the number of characters at the source if all the bytes are
6230    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6231    effects, update coding->eol_seen.  The value of coding->eol_seen is
6232    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6233    the value is reliable only when all the source bytes are valid
6234    UTF-8.  */
6235 
6236 static ptrdiff_t
check_utf_8(struct coding_system * coding)6237 check_utf_8 (struct coding_system *coding)
6238 {
6239   const unsigned char *src, *end;
6240   int eol_seen;
6241   ptrdiff_t nchars = coding->head_ascii;
6242 
6243   if (coding->head_ascii < 0)
6244     check_ascii (coding);
6245   else
6246     coding_set_source (coding);
6247   src = coding->source + coding->head_ascii;
6248   /* We look ahead one byte for CR LF.  */
6249   end = coding->source + coding->src_bytes - 1;
6250   eol_seen = coding->eol_seen;
6251   while (src < end)
6252     {
6253       int c = *src;
6254 
6255       if (UTF_8_1_OCTET_P (*src))
6256 	{
6257 	  src++;
6258 	  if (c < 0x20)
6259 	    {
6260 	      if (c == '\r')
6261 		{
6262 		  if (*src == '\n')
6263 		    {
6264 		      eol_seen |= EOL_SEEN_CRLF;
6265 		      src++;
6266 		      nchars++;
6267 		    }
6268 		  else
6269 		    eol_seen |= EOL_SEEN_CR;
6270 		}
6271 	      else if (c == '\n')
6272 		eol_seen |= EOL_SEEN_LF;
6273 	    }
6274 	}
6275       else if (UTF_8_2_OCTET_LEADING_P (c))
6276 	{
6277 	  if (c < 0xC2		/* overlong sequence */
6278 	      || src + 1 >= end
6279 	      || ! UTF_8_EXTRA_OCTET_P (src[1]))
6280 	    return -1;
6281 	  src += 2;
6282 	}
6283       else if (UTF_8_3_OCTET_LEADING_P (c))
6284 	{
6285 	  if (src + 2 >= end
6286 	      || ! (UTF_8_EXTRA_OCTET_P (src[1])
6287 		    && UTF_8_EXTRA_OCTET_P (src[2])))
6288 	    return -1;
6289 	  c = (((c & 0xF) << 12)
6290 	       | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6291 	  if (c < 0x800			      /* overlong sequence */
6292 	      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6293 	    return -1;
6294 	  src += 3;
6295 	}
6296       else if (UTF_8_4_OCTET_LEADING_P (c))
6297 	{
6298 	  if (src + 3 >= end
6299 	      || ! (UTF_8_EXTRA_OCTET_P (src[1])
6300 		    && UTF_8_EXTRA_OCTET_P (src[2])
6301 		    && UTF_8_EXTRA_OCTET_P (src[3])))
6302 	    return -1;
6303 	  c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6304 	       | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6305 	  if (c < 0x10000	/* overlong sequence */
6306 	      || c >= 0x110000)	/* non-Unicode character  */
6307 	    return -1;
6308 	  src += 4;
6309 	}
6310       else
6311 	return -1;
6312       nchars++;
6313     }
6314 
6315   if (src == end)
6316     {
6317       if (! UTF_8_1_OCTET_P (*src))
6318 	return -1;
6319       nchars++;
6320       if (*src == '\r')
6321 	eol_seen |= EOL_SEEN_CR;
6322       else if (*src  == '\n')
6323 	eol_seen |= EOL_SEEN_LF;
6324     }
6325   coding->eol_seen = eol_seen;
6326   return nchars;
6327 }
6328 
6329 
6330 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
6331    unibyte string.  */
6332 
6333 bool
utf8_string_p(Lisp_Object string)6334 utf8_string_p (Lisp_Object string)
6335 {
6336   eassert (!STRING_MULTIBYTE (string));
6337   struct coding_system coding;
6338   setup_coding_system (Qutf_8_unix, &coding);
6339   /* We initialize only the fields that check_utf_8 accesses.  */
6340   coding.head_ascii = -1;
6341   coding.src_pos = 0;
6342   coding.src_pos_byte = 0;
6343   coding.src_chars = SCHARS (string);
6344   coding.src_bytes = SBYTES (string);
6345   coding.src_object = string;
6346   coding.eol_seen = EOL_SEEN_NONE;
6347   return check_utf_8 (&coding) != -1;
6348 }
6349 
6350 /* Like make_string, but always returns a multibyte Lisp string, and
6351    avoids decoding if TEXT is encoded in UTF-8.  */
6352 Lisp_Object
make_string_from_utf8(const char * text,ptrdiff_t nbytes)6353 make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
6354 {
6355 #if 0
6356   /* This method is on average 2 times slower than if we use
6357      decode_string_utf_8.  However, please leave the slower
6358      implementation in the code for now, in case it needs to be reused
6359      in some situations.  */
6360   ptrdiff_t chars, bytes;
6361   parse_str_as_multibyte ((const unsigned char *) text, nbytes,
6362 			  &chars, &bytes);
6363   /* If TEXT is a valid UTF-8 string, we can convert it to a Lisp
6364      string directly.  Otherwise, we need to decode it.  */
6365   if (chars == nbytes || bytes == nbytes)
6366     return make_specified_string (text, chars, nbytes, true);
6367   else
6368     {
6369       struct coding_system coding;
6370       setup_coding_system (Qutf_8_unix, &coding);
6371       coding.mode |= CODING_MODE_LAST_BLOCK;
6372       coding.source = (const unsigned char *) text;
6373       decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
6374       return coding.dst_object;
6375     }
6376 #else
6377   return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
6378 #endif
6379 }
6380 
6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6382    SOURCE is encoded.  If CATEGORY is one of
6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6384    two-byte, else they are encoded by one-byte.
6385 
6386    Return one of EOL_SEEN_XXX.  */
6387 
6388 #define MAX_EOL_CHECK_COUNT 3
6389 
6390 static int
detect_eol(const unsigned char * source,ptrdiff_t src_bytes,enum coding_category category)6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6392 	    enum coding_category category)
6393 {
6394   const unsigned char *src = source, *src_end = src + src_bytes;
6395   unsigned char c;
6396   int total  = 0;
6397   int eol_seen = EOL_SEEN_NONE;
6398 
6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
6400     {
6401       bool msb = category == (coding_category_utf_16_le
6402 			      | coding_category_utf_16_le_nosig);
6403       bool lsb = !msb;
6404 
6405       while (src + 1 < src_end)
6406 	{
6407 	  c = src[lsb];
6408 	  if (src[msb] == 0 && (c == '\n' || c == '\r'))
6409 	    {
6410 	      int this_eol;
6411 
6412 	      if (c == '\n')
6413 		this_eol = EOL_SEEN_LF;
6414 	      else if (src + 3 >= src_end
6415 		       || src[msb + 2] != 0
6416 		       || src[lsb + 2] != '\n')
6417 		this_eol = EOL_SEEN_CR;
6418 	      else
6419 		{
6420 		  this_eol = EOL_SEEN_CRLF;
6421 		  src += 2;
6422 		}
6423 
6424 	      if (eol_seen == EOL_SEEN_NONE)
6425 		/* This is the first end-of-line.  */
6426 		eol_seen = this_eol;
6427 	      else if (eol_seen != this_eol)
6428 		{
6429 		  /* The found type is different from what found before.
6430 		     Allow for stray ^M characters in DOS EOL files.  */
6431 		  if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6432 		      || (eol_seen == EOL_SEEN_CRLF
6433 			  && this_eol == EOL_SEEN_CR))
6434 		    eol_seen = EOL_SEEN_CRLF;
6435 		  else
6436 		    {
6437 		      eol_seen = EOL_SEEN_LF;
6438 		      break;
6439 		    }
6440 		}
6441 	      if (++total == MAX_EOL_CHECK_COUNT)
6442 		break;
6443 	    }
6444 	  src += 2;
6445 	}
6446     }
6447   else
6448     while (src < src_end)
6449       {
6450 	c = *src++;
6451 	if (c == '\n' || c == '\r')
6452 	  {
6453 	    int this_eol;
6454 
6455 	    if (c == '\n')
6456 	      this_eol = EOL_SEEN_LF;
6457 	    else if (src >= src_end || *src != '\n')
6458 	      this_eol = EOL_SEEN_CR;
6459 	    else
6460 	      this_eol = EOL_SEEN_CRLF, src++;
6461 
6462 	    if (eol_seen == EOL_SEEN_NONE)
6463 	      /* This is the first end-of-line.  */
6464 	      eol_seen = this_eol;
6465 	    else if (eol_seen != this_eol)
6466 	      {
6467 		/* The found type is different from what found before.
6468 		   Allow for stray ^M characters in DOS EOL files.  */
6469 		if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6470 		    || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6471 		  eol_seen = EOL_SEEN_CRLF;
6472 		else
6473 		  {
6474 		    eol_seen = EOL_SEEN_LF;
6475 		    break;
6476 		  }
6477 	      }
6478 	    if (++total == MAX_EOL_CHECK_COUNT)
6479 	      break;
6480 	  }
6481       }
6482   return eol_seen;
6483 }
6484 
6485 
6486 static Lisp_Object
adjust_coding_eol_type(struct coding_system * coding,int eol_seen)6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6488 {
6489   Lisp_Object eol_type;
6490 
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (! VECTORP (eol_type))
6493     /* Already adjusted.  */
6494     return eol_type;
6495   if (eol_seen & EOL_SEEN_LF)
6496     {
6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6498       eol_type = Qunix;
6499     }
6500   else if (eol_seen & EOL_SEEN_CRLF)
6501     {
6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6503       eol_type = Qdos;
6504     }
6505   else if (eol_seen & EOL_SEEN_CR)
6506     {
6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6508       eol_type = Qmac;
6509     }
6510   return eol_type;
6511 }
6512 
6513 /* Detect how a text specified in CODING is encoded.  If a coding
6514    system is detected, update fields of CODING by the detected coding
6515    system.  */
6516 
6517 static void
detect_coding(struct coding_system * coding)6518 detect_coding (struct coding_system *coding)
6519 {
6520   const unsigned char *src, *src_end;
6521   unsigned int saved_mode = coding->mode;
6522   Lisp_Object found = Qnil;
6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6524 
6525   coding->consumed = coding->consumed_char = 0;
6526   coding->produced = coding->produced_char = 0;
6527   coding_set_source (coding);
6528 
6529   src_end = coding->source + coding->src_bytes;
6530 
6531   coding->eol_seen = EOL_SEEN_NONE;
6532   /* If we have not yet decided the text encoding type, detect it
6533      now.  */
6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6535     {
6536       int c, i;
6537       struct coding_detection_info detect_info;
6538       bool null_byte_found = 0, eight_bit_found = 0;
6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6540 				       inhibit_null_byte_detection);
6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6542 				       inhibit_iso_escape_detection);
6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6544 
6545       coding->head_ascii = 0;
6546       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6547       for (src = coding->source; src < src_end; src++)
6548 	{
6549 	  c = *src;
6550 	  if (c & 0x80)
6551 	    {
6552 	      eight_bit_found = 1;
6553 	      if (null_byte_found)
6554 		break;
6555 	    }
6556 	  else if (c < 0x20)
6557 	    {
6558 	      if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6559 		  && ! inhibit_ied
6560 		  && ! detect_info.checked)
6561 		{
6562 		  if (detect_coding_iso_2022 (coding, &detect_info))
6563 		    {
6564 		      /* We have scanned the whole data.  */
6565 		      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6566 			{
6567 			  /* We didn't find an 8-bit code.  We may
6568 			     have found a null-byte, but it's very
6569 			     rare that a binary file conforms to
6570 			     ISO-2022.  */
6571 			  src = src_end;
6572 			  coding->head_ascii = src - coding->source;
6573 			}
6574 		      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6575 		      break;
6576 		    }
6577 		}
6578 	      else if (! c && !inhibit_nbd)
6579 		{
6580 		  null_byte_found = 1;
6581 		  if (eight_bit_found)
6582 		    break;
6583 		}
6584 	      else if (! disable_ascii_optimization
6585 		       && ! inhibit_eol_conversion)
6586 		{
6587 		  if (c == '\r')
6588 		    {
6589 		      if (src < src_end && src[1] == '\n')
6590 			{
6591 			  coding->eol_seen |= EOL_SEEN_CRLF;
6592 			  src++;
6593 			  if (! eight_bit_found)
6594 			    coding->head_ascii++;
6595 			}
6596 		      else
6597 			coding->eol_seen |= EOL_SEEN_CR;
6598 		    }
6599 		  else if (c == '\n')
6600 		    {
6601 		      coding->eol_seen |= EOL_SEEN_LF;
6602 		    }
6603 		}
6604 
6605 	      if (! eight_bit_found)
6606 		coding->head_ascii++;
6607 	    }
6608 	  else if (! eight_bit_found)
6609 	    coding->head_ascii++;
6610 	}
6611 
6612       if (null_byte_found || eight_bit_found
6613 	  || coding->head_ascii < coding->src_bytes
6614 	  || detect_info.found)
6615 	{
6616 	  enum coding_category category;
6617 	  struct coding_system *this;
6618 
6619 	  if (coding->head_ascii == coding->src_bytes)
6620 	    /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6621 	    for (i = 0; i < coding_category_raw_text; i++)
6622 	      {
6623 		category = coding_priorities[i];
6624 		this = coding_categories + category;
6625 		if (detect_info.found & (1 << category))
6626 		  break;
6627 	      }
6628 	  else
6629 	    {
6630 	      if (null_byte_found)
6631 		{
6632 		  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6633 		  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6634 		}
6635 	      else if (prefer_utf_8
6636 		       && detect_coding_utf_8 (coding, &detect_info))
6637 		{
6638 		  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6639 		  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6640 		}
6641 	      for (i = 0; i < coding_category_raw_text; i++)
6642 		{
6643 		  category = coding_priorities[i];
6644 		  this = coding_categories + category;
6645 		  /* Some of this->detector (e.g. detect_coding_sjis)
6646 		     require this information.  */
6647 		  coding->id = this->id;
6648 		  if (this->id < 0)
6649 		    {
6650 		      /* No coding system of this category is defined.  */
6651 		      detect_info.rejected |= (1 << category);
6652 		    }
6653 		  else if (category >= coding_category_raw_text)
6654 		    continue;
6655 		  else if (detect_info.checked & (1 << category))
6656 		    {
6657 		      if (detect_info.found & (1 << category))
6658 			break;
6659 		    }
6660 		  else if ((*(this->detector)) (coding, &detect_info)
6661 			   && detect_info.found & (1 << category))
6662 		    break;
6663 		}
6664 	    }
6665 
6666 	  if (i < coding_category_raw_text)
6667 	    {
6668 	      if (category == coding_category_utf_8_auto)
6669 		{
6670 		  Lisp_Object coding_systems;
6671 
6672 		  coding_systems = AREF (CODING_ID_ATTRS (this->id),
6673 					 coding_attr_utf_bom);
6674 		  if (CONSP (coding_systems))
6675 		    {
6676 		      if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6677 			found = XCAR (coding_systems);
6678 		      else
6679 			found = XCDR (coding_systems);
6680 		    }
6681 		  else
6682 		    found = CODING_ID_NAME (this->id);
6683 		}
6684 	      else if (category == coding_category_utf_16_auto)
6685 		{
6686 		  Lisp_Object coding_systems;
6687 
6688 		  coding_systems = AREF (CODING_ID_ATTRS (this->id),
6689 					 coding_attr_utf_bom);
6690 		  if (CONSP (coding_systems))
6691 		    {
6692 		      if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6693 			found = XCAR (coding_systems);
6694 		      else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6695 			found = XCDR (coding_systems);
6696 		    }
6697 		  else
6698 		    found = CODING_ID_NAME (this->id);
6699 		}
6700 	      else
6701 		found = CODING_ID_NAME (this->id);
6702 	    }
6703 	  else if (null_byte_found)
6704 	    found = Qno_conversion;
6705 	  else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6706 		   == CATEGORY_MASK_ANY)
6707 	    found = Qraw_text;
6708 	  else if (detect_info.rejected)
6709 	    for (i = 0; i < coding_category_raw_text; i++)
6710 	      if (! (detect_info.rejected & (1 << coding_priorities[i])))
6711 		{
6712 		  this = coding_categories + coding_priorities[i];
6713 		  found = CODING_ID_NAME (this->id);
6714 		  break;
6715 		}
6716 	}
6717     }
6718   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6719 	   == coding_category_utf_8_auto)
6720     {
6721       Lisp_Object coding_systems;
6722       struct coding_detection_info detect_info;
6723 
6724       coding_systems
6725 	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6726       detect_info.found = detect_info.rejected = 0;
6727       if (check_ascii (coding) == coding->src_bytes)
6728 	{
6729 	  if (CONSP (coding_systems))
6730 	    found = XCDR (coding_systems);
6731 	}
6732       else
6733 	{
6734 	  if (CONSP (coding_systems)
6735 	      && detect_coding_utf_8 (coding, &detect_info))
6736 	    {
6737 	      if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6738 		found = XCAR (coding_systems);
6739 	      else
6740 		found = XCDR (coding_systems);
6741 	    }
6742 	}
6743     }
6744   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6745 	   == coding_category_utf_16_auto)
6746     {
6747       Lisp_Object coding_systems;
6748       struct coding_detection_info detect_info;
6749 
6750       coding_systems
6751 	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6752       detect_info.found = detect_info.rejected = 0;
6753       coding->head_ascii = 0;
6754       if (CONSP (coding_systems)
6755 	  && detect_coding_utf_16 (coding, &detect_info))
6756 	{
6757 	  if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6758 	    found = XCAR (coding_systems);
6759 	  else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6760 	    found = XCDR (coding_systems);
6761 	}
6762     }
6763 
6764   if (! NILP (found))
6765     {
6766       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6767 			   : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6768 			   : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6769 			   : EOL_SEEN_LF);
6770 
6771       setup_coding_system (found, coding);
6772       if (specified_eol != EOL_SEEN_NONE)
6773 	adjust_coding_eol_type (coding, specified_eol);
6774     }
6775 
6776   coding->mode = saved_mode;
6777 }
6778 
6779 
6780 static void
decode_eol(struct coding_system * coding)6781 decode_eol (struct coding_system *coding)
6782 {
6783   Lisp_Object eol_type;
6784   unsigned char *p, *pbeg, *pend;
6785 
6786   eol_type = CODING_ID_EOL_TYPE (coding->id);
6787   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6788     return;
6789 
6790   if (NILP (coding->dst_object))
6791     pbeg = coding->destination;
6792   else
6793     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6794   pend = pbeg + coding->produced;
6795 
6796   if (VECTORP (eol_type))
6797     {
6798       int eol_seen = EOL_SEEN_NONE;
6799 
6800       for (p = pbeg; p < pend; p++)
6801 	{
6802 	  if (*p == '\n')
6803 	    eol_seen |= EOL_SEEN_LF;
6804 	  else if (*p == '\r')
6805 	    {
6806 	      if (p + 1 < pend && *(p + 1) == '\n')
6807 		{
6808 		  eol_seen |= EOL_SEEN_CRLF;
6809 		  p++;
6810 		}
6811 	      else
6812 		eol_seen |= EOL_SEEN_CR;
6813 	    }
6814 	}
6815       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6816       if ((eol_seen & EOL_SEEN_CRLF) != 0
6817 	  && (eol_seen & EOL_SEEN_CR) != 0
6818 	  && (eol_seen & EOL_SEEN_LF) == 0)
6819 	eol_seen = EOL_SEEN_CRLF;
6820       else if (eol_seen != EOL_SEEN_NONE
6821 	  && eol_seen != EOL_SEEN_LF
6822 	  && eol_seen != EOL_SEEN_CRLF
6823 	  && eol_seen != EOL_SEEN_CR)
6824 	eol_seen = EOL_SEEN_LF;
6825       if (eol_seen != EOL_SEEN_NONE)
6826 	eol_type = adjust_coding_eol_type (coding, eol_seen);
6827     }
6828 
6829   if (EQ (eol_type, Qmac))
6830     {
6831       for (p = pbeg; p < pend; p++)
6832 	if (*p == '\r')
6833 	  *p = '\n';
6834     }
6835   else if (EQ (eol_type, Qdos))
6836     {
6837       ptrdiff_t n = 0;
6838       ptrdiff_t pos = coding->dst_pos;
6839       ptrdiff_t pos_byte = coding->dst_pos_byte;
6840       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6841 
6842       /* This assertion is here instead of code, now deleted, that
6843 	 handled the NILP case, which no longer happens with the
6844 	 current codebase.  */
6845       eassert (!NILP (coding->dst_object));
6846 
6847       while (pos_byte < pos_end)
6848 	{
6849 	  int incr;
6850 
6851 	  p = BYTE_POS_ADDR (pos_byte);
6852 	  if (coding->dst_multibyte)
6853 	    incr = BYTES_BY_CHAR_HEAD (*p);
6854 	  else
6855 	    incr = 1;
6856 
6857 	  if (*p == '\r' && p[1] == '\n')
6858 	    {
6859 	      del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6860 	      n++;
6861 	      pos_end--;
6862 	    }
6863 	  pos++;
6864 	  pos_byte += incr;
6865 	}
6866       coding->produced -= n;
6867       coding->produced_char -= n;
6868     }
6869 }
6870 
6871 
6872 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6873    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6874    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6875 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6876 
6877 /* Return a translation table (or list of them) from coding system
6878    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6879    not ENCODEP). */
6880 
6881 static Lisp_Object
get_translation_table(Lisp_Object attrs,bool encodep,int * max_lookup)6882 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6883 {
6884   Lisp_Object standard, translation_table;
6885   Lisp_Object val;
6886 
6887   if (NILP (Venable_character_translation))
6888     {
6889       if (max_lookup)
6890 	*max_lookup = 0;
6891       return Qnil;
6892     }
6893   if (encodep)
6894     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6895       standard = Vstandard_translation_table_for_encode;
6896   else
6897     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6898       standard = Vstandard_translation_table_for_decode;
6899   if (NILP (translation_table))
6900     translation_table = standard;
6901   else
6902     {
6903       if (SYMBOLP (translation_table))
6904 	translation_table = Fget (translation_table, Qtranslation_table);
6905       else if (CONSP (translation_table))
6906 	{
6907 	  translation_table = Fcopy_sequence (translation_table);
6908 	  for (val = translation_table; CONSP (val); val = XCDR (val))
6909 	    if (SYMBOLP (XCAR (val)))
6910 	      XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6911 	}
6912       if (CHAR_TABLE_P (standard))
6913 	{
6914 	  if (CONSP (translation_table))
6915 	    translation_table = nconc2 (translation_table, list1 (standard));
6916 	  else
6917 	    translation_table = list2 (translation_table, standard);
6918 	}
6919     }
6920 
6921   if (max_lookup)
6922     {
6923       *max_lookup = 1;
6924       if (CHAR_TABLE_P (translation_table)
6925 	  && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6926 	{
6927 	  val = XCHAR_TABLE (translation_table)->extras[1];
6928 	  if (FIXNATP (val) && *max_lookup < XFIXNAT (val))
6929 	    *max_lookup = min (XFIXNAT (val), MAX_LOOKUP_MAX);
6930 	}
6931       else if (CONSP (translation_table))
6932 	{
6933 	  Lisp_Object tail;
6934 
6935 	  for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6936 	    if (CHAR_TABLE_P (XCAR (tail))
6937 		&& CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6938 	      {
6939 		Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6940 		if (FIXNATP (tailval) && *max_lookup < XFIXNAT (tailval))
6941 		  *max_lookup = min (XFIXNAT (tailval), MAX_LOOKUP_MAX);
6942 	      }
6943 	}
6944     }
6945   return translation_table;
6946 }
6947 
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order (other elements
   are skipped).

   When a table maps C to a character, C is replaced by that
   character and TRANS is set to Qnil; in a list, the lookup then
   goes on in the following tables with the new C.  When a table
   yields a non-nil value that is not a character, TRANS is set to
   that value and, in a list, the lookup stops.  Otherwise TRANS
   ends up Qnil.

   C and TRANS must be lvalues.  TABLE is evaluated exactly once, and
   the local names end in an underscore so that they cannot capture
   an identifier used in the arguments.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)			\
  do {									\
    Lisp_Object lookup_table_ = (table);				\
									\
    (trans) = Qnil;							\
    if (CHAR_TABLE_P (lookup_table_))					\
      {									\
	(trans) = CHAR_TABLE_REF (lookup_table_, c);			\
	if (CHARACTERP (trans))						\
	  (c) = XFIXNAT (trans), (trans) = Qnil;			\
      }									\
    else if (CONSP (lookup_table_))					\
      {									\
	Lisp_Object lookup_tail_;					\
									\
	for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);	\
	     lookup_tail_ = XCDR (lookup_tail_))			\
	  if (CHAR_TABLE_P (XCAR (lookup_tail_)))			\
	    {								\
	      (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c);	\
	      if (CHARACTERP (trans))					\
		(c) = XFIXNAT (trans), (trans) = Qnil;			\
	      else if (! NILP (trans))					\
		break;							\
	    }								\
      }									\
  } while (0)
6972 
6973 
6974 /* Return a translation of character(s) at BUF according to TRANS.
6975    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6976    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6977    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6978    found, or Qt if BUF is too short to lookup characters in FROM.  As
6979    a side effect, if a translation is found, *NCHARS is set to the
6980    number of characters being translated.  */
6981 
6982 static Lisp_Object
get_translation(Lisp_Object trans,int * buf,int * buf_end,ptrdiff_t * nchars)6983 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6984 {
6985   if (FIXNUMP (trans) || VECTORP (trans))
6986     {
6987       *nchars = 1;
6988       return trans;
6989     }
6990   for (; CONSP (trans); trans = XCDR (trans))
6991     {
6992       Lisp_Object val = XCAR (trans);
6993       Lisp_Object from = XCAR (val);
6994       ptrdiff_t len = ASIZE (from);
6995       ptrdiff_t i;
6996 
6997       for (i = 0; i < len; i++)
6998 	{
6999 	  if (buf + i == buf_end)
7000 	    return Qt;
7001 	  if (XFIXNUM (AREF (from, i)) != buf[i])
7002 	    break;
7003 	}
7004       if (i == len)
7005 	{
7006 	  *nchars = len;
7007 	  return XCDR (val);
7008 	}
7009     }
7010   return Qnil;
7011 }
7012 
7013 
/* Store the characters decoded for CODING as bytes at
   CODING->destination + CODING->produced, then add the number of
   bytes and characters stored to CODING->produced and
   CODING->produced_char.  If CODING->dst_object is a buffer and at
   least one character was produced, insert_from_gap is called for
   the new bytes.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (negative elements start an annotation and are
   skipped) and are translated through TRANSLATION_TABLE.  Otherwise
   the first CODING->consumed bytes at CODING->source are copied,
   with a conversion when CODING->src_multibyte and
   CODING->dst_multibyte differ, and TRANSLATION_TABLE is not used.

   LAST_BLOCK matters only when a multi-character translation cannot
   be decided because CODING->charbuf ends too early: processing
   stops there unless LAST_BLOCK is true.

   Return the number of elements at the end of CODING->charbuf that
   were not processed (always 0 when CODING->chars_at_source).

   NOTE(review): the locals src, src_end, src_base, multibytep,
   consumed_chars, dst and produced_chars, as well as the label
   no_more_source, are presumably used implicitly by ONE_MORE_BYTE
   and EMIT_ONE_BYTE, which are defined elsewhere in this file; check
   those macros before renaming or removing any of them.  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
	       bool last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object)
	  && ! NILP (coding->dst_object))
	{
	  /* Source and destination are the same object: the
	     destination may grow only up to the end of the source
	     bytes consumed so far.  */
	  eassert (growable_destination (coding));
	  coding_set_source (coding);
	  dst_end = ((unsigned char *) coding->source) + coding->consumed;
	}

      while (buf < buf_end)
	{
	  int c = *buf;
	  ptrdiff_t i;

	  if (c >= 0)
	    {
	      /* FROM_NCHARS source characters are replaced by
		 TO_NCHARS translated ones.  */
	      ptrdiff_t from_nchars = 1, to_nchars = 1;
	      Lisp_Object trans = Qnil;

	      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
	      if (! NILP (trans))
		{
		  trans = get_translation (trans, buf, buf_end, &from_nchars);
		  if (FIXNUMP (trans))
		    c = XFIXNUM (trans);
		  else if (VECTORP (trans))
		    {
		      to_nchars = ASIZE (trans);
		      c = XFIXNUM (AREF (trans, 0));
		    }
		  else if (EQ (trans, Qt) && ! last_block)
		    /* Not enough characters to decide; leave the rest
		       as carryover.  */
		    break;
		}

	      /* Make sure there is room for TO_NCHARS characters of
		 the maximum length.  */
	      if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
		{
		  eassert (growable_destination (coding));
		  ptrdiff_t dst_size;
		  if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
					  &dst_size)
		      || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
		    memory_full (SIZE_MAX);
		  dst = alloc_destination (coding, dst_size, dst);
		  /* NOTE(review): unlike the test at the top of this
		     branch, this one does not exclude the case where
		     both objects are nil -- confirm that is
		     intended.  */
		  if (EQ (coding->src_object, coding->dst_object))
		    {
		      coding_set_source (coding);
		      dst_end = (((unsigned char *) coding->source)
				 + coding->consumed);
		    }
		  else
		    dst_end = coding->destination + coding->dst_bytes;
		}

	      for (i = 0; i < to_nchars; i++)
		{
		  /* The first character is already in C; the others
		     come from the translation vector.  */
		  if (i > 0)
		    c = XFIXNUM (AREF (trans, i));
		  if (coding->dst_multibyte
		      || ! CHAR_BYTE8_P (c))
		    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
		  else
		    *dst++ = CHAR_TO_BYTE8 (c);
		}
	      produced_chars += to_nchars;
	      buf += from_nchars;
	    }
	  else
	    /* This is an annotation datum.  (-C) is the length.  */
	    buf += -c;
	}
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
	{
	  /* The destination must not grow past the source bytes that
	     are still to be read.  */
	  eassert (growable_destination (coding));
	  dst_end = (unsigned char *) src;
	}
      if (coding->src_multibyte != coding->dst_multibyte)
	{
	  if (coding->src_multibyte)
	    {
	      /* Multibyte source, unibyte destination: one byte is
		 stored per value read by ONE_MORE_BYTE.  */
	      bool multibytep = 1;
	      ptrdiff_t consumed_chars = 0;

	      while (1)
		{
		  const unsigned char *src_base = src;
		  int c;

		  ONE_MORE_BYTE (c);
		  if (dst == dst_end)
		    {
		      eassert (growable_destination (coding));
		      /* Reading has advanced SRC, which may already
			 have freed some room.  */
		      if (EQ (coding->src_object, coding->dst_object))
			dst_end = (unsigned char *) src;
		      if (dst == dst_end)
			{
			  /* Keep SRC as an offset while the
			     destination is reallocated, then
			     recompute the pointers.  */
			  ptrdiff_t offset = src - coding->source;

			  dst = alloc_destination (coding, src_end - src + 1,
						   dst);
			  dst_end = coding->destination + coding->dst_bytes;
			  coding_set_source (coding);
			  src = coding->source + offset;
			  src_end = coding->source + coding->consumed;
			  if (EQ (coding->src_object, coding->dst_object))
			    dst_end = (unsigned char *) src;
			}
		    }
		  *dst++ = c;
		  produced_chars++;
		}
	    no_more_source:
	      ;
	    }
	  else
	    /* Unibyte source, multibyte destination.  */
	    while (src < src_end)
	      {
		bool multibytep = 1;
		int c = *src++;

		/* Require room for two bytes.  */
		if (dst >= dst_end - 1)
		  {
		    eassert (growable_destination (coding));
		    if (EQ (coding->src_object, coding->dst_object))
		      dst_end = (unsigned char *) src;
		    if (dst >= dst_end - 1)
		      {
			/* Keep SRC as an offset while the destination
			   is reallocated, then recompute the
			   pointers.  */
			ptrdiff_t offset = src - coding->source;
			ptrdiff_t more_bytes;

			if (EQ (coding->src_object, coding->dst_object))
			  more_bytes = ((src_end - src) / 2) + 2;
			else
			  more_bytes = src_end - src + 2;
			dst = alloc_destination (coding, more_bytes, dst);
			dst_end = coding->destination + coding->dst_bytes;
			coding_set_source (coding);
			src = coding->source + offset;
			src_end = coding->source + coding->consumed;
			if (EQ (coding->src_object, coding->dst_object))
			  dst_end = (unsigned char *) src;
		      }
		  }
		EMIT_ONE_BYTE (c);
	      }
	}
      else
	{
	  /* Same multibyteness on both sides: a plain byte copy.  */
	  if (!EQ (coding->src_object, coding->dst_object))
	    {
	      ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

	      if (require > 0)
		{
		  ptrdiff_t offset = src - coding->source;

		  dst = alloc_destination (coding, require, dst);
		  coding_set_source (coding);
		  src = coding->source + offset;
		  src_end = coding->source + coding->consumed;
		}
	    }
	  produced_chars = coding->consumed_char;
	  while (src < src_end)
	    *dst++ = *src++;
	}
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced, 0);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
7211 
7212 /* Compose text in CODING->object according to the annotation data at
7213    CHARBUF.  CHARBUF is an array:
7214      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7215  */
7216 
7217 static void
produce_composition(struct coding_system * coding,int * charbuf,ptrdiff_t pos)7218 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7219 {
7220   int len;
7221   ptrdiff_t to;
7222   enum composition_method method;
7223   Lisp_Object components;
7224 
7225   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7226   to = pos + charbuf[2];
7227   method = (enum composition_method) (charbuf[4]);
7228 
7229   if (method == COMPOSITION_RELATIVE)
7230     components = Qnil;
7231   else
7232     {
7233       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7234       int i, j;
7235 
7236       if (method == COMPOSITION_WITH_RULE)
7237 	len = charbuf[2] * 3 - 2;
7238       charbuf += MAX_ANNOTATION_LENGTH;
7239       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7240       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7241 	{
7242 	  if (charbuf[i] >= 0)
7243 	    args[j] = make_fixnum (charbuf[i]);
7244 	  else
7245 	    {
7246 	      i++;
7247 	      args[j] = make_fixnum (charbuf[i] % 0x100);
7248 	    }
7249 	}
7250       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7251     }
7252   compose_text (pos, to, components, Qnil, coding->dst_object);
7253 }
7254 
7255 
7256 /* Put `charset' property on text in CODING->object according to
7257    the annotation data at CHARBUF.  CHARBUF is an array:
7258      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7259  */
7260 
7261 static void
produce_charset(struct coding_system * coding,int * charbuf,ptrdiff_t pos)7262 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7263 {
7264   ptrdiff_t from = pos - charbuf[2];
7265   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7266 
7267   Fput_text_property (make_fixnum (from), make_fixnum (pos),
7268 		      Qcharset, CHARSET_NAME (charset),
7269 		      coding->dst_object);
7270 }
7271 
/* Upper bound of the number of units allocated for coding->charbuf
   by ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many extra units decoding functions expect in coding->charbuf
   at most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area CODING->charbuf with SAFE_ALLOCA and record
   its size, in units of int, in CODING->charbuf_size.  The size is
   SIZE + MAX_CHARBUF_EXTRA_SIZE units, but at most MAX_CHARBUF_SIZE.
   CODING is parenthesized so that any pointer expression may be
   passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)		\
  do {								\
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,	\
			   MAX_CHARBUF_SIZE);			\
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));	\
    (coding)->charbuf_size = units;				\
  } while (0)
7285 
7286 static void
produce_annotation(struct coding_system * coding,ptrdiff_t pos)7287 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7288 {
7289   int *charbuf = coding->charbuf;
7290   int *charbuf_end = charbuf + coding->charbuf_used;
7291 
7292   if (NILP (coding->dst_object))
7293     return;
7294 
7295   while (charbuf < charbuf_end)
7296     {
7297       if (*charbuf >= 0)
7298 	pos++, charbuf++;
7299       else
7300 	{
7301 	  int len = -*charbuf;
7302 
7303 	  if (len > 2)
7304 	    switch (charbuf[1])
7305 	      {
7306 	      case CODING_ANNOTATE_COMPOSITION_MASK:
7307 		produce_composition (coding, charbuf, pos);
7308 		break;
7309 	      case CODING_ANNOTATE_CHARSET_MASK:
7310 		produce_charset (coding, charbuf, pos);
7311 		break;
7312 	      default:
7313 		break;
7314 	      }
7315 	  charbuf += len;
7316 	}
7317     }
7318 }
7319 
7320 /* Decode the data at CODING->src_object into CODING->dst_object.
7321    CODING->src_object is a buffer, a string, or nil.
7322    CODING->dst_object is a buffer.
7323 
7324    If CODING->src_object is a buffer, it must be the current buffer.
7325    In this case, if CODING->src_pos is positive, it is a position of
7326    the source text in the buffer, otherwise, the source text is in the
7327    gap area of the buffer, and CODING->src_pos specifies the offset of
7328    the text from the end of the gap (and GPT must be equal to PT).
7329 
7330    When the text is taken from the gap, it can't be at the beginning
7331    of the gap because the new decoded text is progressively accumulated
7332    at the beginning of the gap before it gets inserted at PT (this way,
7333    as the output grows, the input shrinks, so we only need to allocate
7334    enough space for `max(IN, OUT)` instead of `IN + OUT`).
7335 
7336    If CODING->src_object is a string, CODING->src_pos is an index to
7337    that string.
7338 
7339    If CODING->src_object is nil, CODING->source must already point to
7340    the non-relocatable memory area.  In this case, CODING->src_pos is
7341    an offset from CODING->source.
7342 
7343    The decoded data is inserted at the current point of the buffer
7344    CODING->dst_object.
7345 */
7346 
7347 static void
decode_coding(struct coding_system * coding)7348 decode_coding (struct coding_system *coding)
7349 {
7350   Lisp_Object attrs;
7351   Lisp_Object undo_list;
7352   Lisp_Object translation_table;
7353   struct ccl_spec cclspec;
7354   int carryover;
7355   int i;
7356 
7357   USE_SAFE_ALLOCA;
7358 
7359   if (BUFFERP (coding->src_object)
7360       && coding->src_pos > 0
7361       && coding->src_pos < GPT
7362       && coding->src_pos + coding->src_chars > GPT)
7363     move_gap_both (coding->src_pos, coding->src_pos_byte);
7364 
7365   undo_list = Qt;
7366   if (BUFFERP (coding->dst_object))
7367     {
7368       set_buffer_internal (XBUFFER (coding->dst_object));
7369       if (GPT != PT)
7370 	move_gap_both (PT, PT_BYTE);
7371 
7372       /* We must disable undo_list in order to record the whole insert
7373 	 transaction via record_insert at the end.  But doing so also
7374 	 disables the recording of the first change to the undo_list.
7375 	 Therefore we check for first change here and record it via
7376 	 record_first_change if needed.  */
7377       if (MODIFF <= SAVE_MODIFF)
7378 	record_first_change ();
7379 
7380       undo_list = BVAR (current_buffer, undo_list);
7381       bset_undo_list (current_buffer, Qt);
7382     }
7383 
7384   coding->consumed = coding->consumed_char = 0;
7385   coding->produced = coding->produced_char = 0;
7386   coding->chars_at_source = 0;
7387   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7388 
7389   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7390 
7391   attrs = CODING_ID_ATTRS (coding->id);
7392   translation_table = get_translation_table (attrs, 0, NULL);
7393 
7394   carryover = 0;
7395   if (coding->decoder == decode_coding_ccl)
7396     {
7397       coding->spec.ccl = &cclspec;
7398       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7399     }
7400   do
7401     {
7402       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7403 
7404       coding_set_source (coding);
7405       coding->annotated = 0;
7406       coding->charbuf_used = carryover;
7407       (*(coding->decoder)) (coding);
7408       coding_set_destination (coding);
7409       carryover = produce_chars (coding, translation_table, 0);
7410       if (coding->annotated)
7411 	produce_annotation (coding, pos);
7412       for (i = 0; i < carryover; i++)
7413 	coding->charbuf[i]
7414 	  = coding->charbuf[coding->charbuf_used - carryover + i];
7415     }
7416   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7417 	 || (coding->consumed < coding->src_bytes
7418 	     && (coding->result == CODING_RESULT_SUCCESS
7419 		 || coding->result == CODING_RESULT_INVALID_SRC)));
7420 
7421   if (carryover > 0)
7422     {
7423       coding_set_destination (coding);
7424       coding->charbuf_used = carryover;
7425       produce_chars (coding, translation_table, 1);
7426     }
7427 
7428   coding->carryover_bytes = 0;
7429   if (coding->consumed < coding->src_bytes)
7430     {
7431       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7432       const unsigned char *src;
7433 
7434       coding_set_source (coding);
7435       coding_set_destination (coding);
7436       src = coding->source + coding->consumed;
7437 
7438       if (coding->mode & CODING_MODE_LAST_BLOCK)
7439 	{
7440 	  /* Flush out unprocessed data as binary chars.  We are sure
7441 	     that the number of data is less than the size of
7442 	     coding->charbuf.  */
7443 	  coding->charbuf_used = 0;
7444 	  coding->chars_at_source = 0;
7445 
7446 	  while (nbytes-- > 0)
7447 	    {
7448 	      int c;
7449 
7450 	      /* Copy raw bytes in their 2-byte forms from multibyte
7451 		 text as single characters.  */
7452 	      if (coding->src_multibyte
7453 		  && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
7454 		{
7455 		  c = string_char_advance (&src);
7456 		  nbytes--;
7457 		}
7458 	      else
7459 		{
7460 		  c = *src++;
7461 
7462 		  if (c & 0x80)
7463 		    c = BYTE8_TO_CHAR (c);
7464 		}
7465 	      coding->charbuf[coding->charbuf_used++] = c;
7466 	    }
7467 	  produce_chars (coding, Qnil, 1);
7468 	}
7469       else
7470 	{
7471 	  /* Record unprocessed bytes in coding->carryover.  We are
7472 	     sure that the number of data is less than the size of
7473 	     coding->carryover.  */
7474 	  unsigned char *p = coding->carryover;
7475 
7476 	  if (nbytes > sizeof coding->carryover)
7477 	    nbytes = sizeof coding->carryover;
7478 	  coding->carryover_bytes = nbytes;
7479 	  while (nbytes-- > 0)
7480 	    *p++ = *src++;
7481 	}
7482       coding->consumed = coding->src_bytes;
7483     }
7484 
7485   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7486       && !inhibit_eol_conversion)
7487     decode_eol (coding);
7488   if (BUFFERP (coding->dst_object))
7489     {
7490       bset_undo_list (current_buffer, undo_list);
7491       record_insert (coding->dst_pos, coding->produced_char);
7492     }
7493 
7494   SAFE_FREE ();
7495 }
7496 
7497 
7498 /* Extract an annotation datum from a composition starting at POS and
7499    ending before LIMIT of CODING->src_object (buffer or string), store
7500    the data in BUF, set *STOP to a starting position of the next
7501    composition (if any) or to LIMIT, and return the address of the
7502    next element of BUF.
7503 
7504    If such an annotation is not found, set *STOP to a starting
7505    position of a composition after POS (if any) or to LIMIT, and
7506    return BUF.  */
7507 
7508 static int *
handle_composition_annotation(ptrdiff_t pos,ptrdiff_t limit,struct coding_system * coding,int * buf,ptrdiff_t * stop)7509 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7510 			       struct coding_system *coding, int *buf,
7511 			       ptrdiff_t *stop)
7512 {
7513   ptrdiff_t start, end;
7514   Lisp_Object prop;
7515 
7516   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7517       || end > limit)
7518     *stop = limit;
7519   else if (start > pos)
7520     *stop = start;
7521   else
7522     {
7523       if (start == pos)
7524 	{
7525 	  /* We found a composition.  Store the corresponding
7526 	     annotation data in BUF.  */
7527 	  int *head = buf;
7528 	  enum composition_method method = composition_method (prop);
7529 	  int nchars = COMPOSITION_LENGTH (prop);
7530 
7531 	  ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7532 	  if (method != COMPOSITION_RELATIVE)
7533 	    {
7534 	      Lisp_Object components;
7535 	      ptrdiff_t i, len, i_byte;
7536 
7537 	      components = COMPOSITION_COMPONENTS (prop);
7538 	      if (VECTORP (components))
7539 		{
7540 		  len = ASIZE (components);
7541 		  for (i = 0; i < len; i++)
7542 		    *buf++ = XFIXNUM (AREF (components, i));
7543 		}
7544 	      else if (STRINGP (components))
7545 		{
7546 		  len = SCHARS (components);
7547 		  i = i_byte = 0;
7548 		  while (i < len)
7549 		    *buf++ = fetch_string_char_advance (components,
7550 							&i, &i_byte);
7551 		}
7552 	      else if (FIXNUMP (components))
7553 		{
7554 		  len = 1;
7555 		  *buf++ = XFIXNUM (components);
7556 		}
7557 	      else if (CONSP (components))
7558 		{
7559 		  for (len = 0; CONSP (components);
7560 		       len++, components = XCDR (components))
7561 		    *buf++ = XFIXNUM (XCAR (components));
7562 		}
7563 	      else
7564 		emacs_abort ();
7565 	      *head -= len;
7566 	    }
7567 	}
7568 
7569       if (find_composition (end, limit, &start, &end, &prop,
7570 			    coding->src_object)
7571 	  && end <= limit)
7572 	*stop = start;
7573       else
7574 	*stop = limit;
7575     }
7576   return buf;
7577 }
7578 
7579 
7580 /* Extract an annotation datum from a text property `charset' at POS of
7581    CODING->src_object (buffer of string), store the data in BUF, set
7582    *STOP to the position where the value of `charset' property changes
7583    (limiting by LIMIT), and return the address of the next element of
7584    BUF.
7585 
7586    If the property value is nil, set *STOP to the position where the
7587    property value is non-nil (limiting by LIMIT), and return BUF.  */
7588 
7589 static int *
handle_charset_annotation(ptrdiff_t pos,ptrdiff_t limit,struct coding_system * coding,int * buf,ptrdiff_t * stop)7590 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7591 			   struct coding_system *coding, int *buf,
7592 			   ptrdiff_t *stop)
7593 {
7594   Lisp_Object val, next;
7595   int id;
7596 
7597   val = Fget_text_property (make_fixnum (pos), Qcharset, coding->src_object);
7598   if (! NILP (val) && CHARSETP (val))
7599     id = XFIXNUM (CHARSET_SYMBOL_ID (val));
7600   else
7601     id = -1;
7602   ADD_CHARSET_DATA (buf, 0, id);
7603   next = Fnext_single_property_change (make_fixnum (pos), Qcharset,
7604 				       coding->src_object,
7605 				       make_fixnum (limit));
7606   *stop = XFIXNUM (next);
7607   return buf;
7608 }
7609 
7610 
7611 static void
consume_chars(struct coding_system * coding,Lisp_Object translation_table,int max_lookup)7612 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7613 	       int max_lookup)
7614 {
7615   int *buf = coding->charbuf;
7616   int *buf_end = coding->charbuf + coding->charbuf_size;
7617   const unsigned char *src = coding->source + coding->consumed;
7618   const unsigned char *src_end = coding->source + coding->src_bytes;
7619   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7620   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7621   bool multibytep = coding->src_multibyte;
7622   Lisp_Object eol_type;
7623   int c;
7624   ptrdiff_t stop, stop_composition, stop_charset;
7625   int *lookup_buf = NULL;
7626 
7627   if (! NILP (translation_table))
7628     lookup_buf = alloca (sizeof (int) * max_lookup);
7629 
7630   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7631   if (VECTORP (eol_type))
7632     eol_type = Qunix;
7633 
7634   /* Note: composition handling is not yet implemented.  */
7635   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7636 
7637   if (NILP (coding->src_object))
7638     stop = stop_composition = stop_charset = end_pos;
7639   else
7640     {
7641       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7642 	stop = stop_composition = pos;
7643       else
7644 	stop = stop_composition = end_pos;
7645       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7646 	stop = stop_charset = pos;
7647       else
7648 	stop_charset = end_pos;
7649     }
7650 
7651   /* Compensate for CRLF and conversion.  */
7652   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7653   while (buf < buf_end)
7654     {
7655       Lisp_Object trans;
7656 
7657       if (pos == stop)
7658 	{
7659 	  if (pos == end_pos)
7660 	    break;
7661 	  if (pos == stop_composition)
7662 	    buf = handle_composition_annotation (pos, end_pos, coding,
7663 						 buf, &stop_composition);
7664 	  if (pos == stop_charset)
7665 	    buf = handle_charset_annotation (pos, end_pos, coding,
7666 					     buf, &stop_charset);
7667 	  stop = (stop_composition < stop_charset
7668 		  ? stop_composition : stop_charset);
7669 	}
7670 
7671       if (! multibytep)
7672 	{
7673 	  if (coding->encoder == encode_coding_raw_text
7674 	      || coding->encoder == encode_coding_ccl)
7675 	    c = *src++, pos++;
7676 	  else
7677 	    {
7678 	      int bytes = multibyte_length (src, src_end, true, true);
7679 	      if (0 < bytes)
7680 		c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7681 	      else
7682 		c = BYTE8_TO_CHAR (*src), src++, pos++;
7683 	    }
7684 	}
7685       else
7686 	c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7687       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7688 	c = '\n';
7689       if (! EQ (eol_type, Qunix))
7690 	{
7691 	  if (c == '\n')
7692 	    {
7693 	      if (EQ (eol_type, Qdos))
7694 		*buf++ = '\r';
7695 	      else
7696 		c = '\r';
7697 	    }
7698 	}
7699 
7700       trans = Qnil;
7701       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7702       if (NILP (trans))
7703 	*buf++ = c;
7704       else
7705 	{
7706 	  ptrdiff_t from_nchars = 1, to_nchars = 1;
7707 	  int *lookup_buf_end;
7708 	  const unsigned char *p = src;
7709 	  int i;
7710 
7711 	  lookup_buf[0] = c;
7712 	  for (i = 1; i < max_lookup && p < src_end; i++)
7713 	    lookup_buf[i] = string_char_advance (&p);
7714 	  lookup_buf_end = lookup_buf + i;
7715 	  trans = get_translation (trans, lookup_buf, lookup_buf_end,
7716 				   &from_nchars);
7717 	  if (FIXNUMP (trans))
7718 	    c = XFIXNUM (trans);
7719 	  else if (VECTORP (trans))
7720 	    {
7721 	      to_nchars = ASIZE (trans);
7722 	      if (buf_end - buf < to_nchars)
7723 		break;
7724 	      c = XFIXNUM (AREF (trans, 0));
7725 	    }
7726 	  else
7727 	    break;
7728 	  *buf++ = c;
7729 	  for (i = 1; i < to_nchars; i++)
7730 	    *buf++ = XFIXNUM (AREF (trans, i));
7731 	  for (i = 1; i < from_nchars; i++, pos++)
7732 	    src += multibyte_length (src, NULL, false, true);
7733 	}
7734     }
7735 
7736   coding->consumed = src - coding->source;
7737   coding->consumed_char = pos - coding->src_pos;
7738   coding->charbuf_used = buf - coding->charbuf;
7739   coding->chars_at_source = 0;
7740 }
7741 
7742 
7743 /* Encode the text at CODING->src_object into CODING->dst_object.
7744    CODING->src_object is a buffer or a string.
7745    CODING->dst_object is a buffer or nil.
7746 
7747    If CODING->src_object is a buffer, it must be the current buffer.
7748    In this case, if CODING->src_pos is positive, it is a position of
7749    the source text in the buffer, otherwise. the source text is in the
7750    gap area of the buffer, and coding->src_pos specifies the offset of
7751    the text from GPT (which must be the same as PT).  If this is the
7752    same buffer as CODING->dst_object, CODING->src_pos must be
7753    negative and CODING should not have `pre-write-conversion'.
7754 
7755    If CODING->src_object is a string, CODING should not have
7756    `pre-write-conversion'.
7757 
7758    If CODING->dst_object is a buffer, the encoded data is inserted at
7759    the current point of that buffer.
7760 
7761    If CODING->dst_object is nil, the encoded data is placed at the
7762    memory area specified by CODING->destination.  */
7763 
7764 static void
encode_coding(struct coding_system * coding)7765 encode_coding (struct coding_system *coding)
7766 {
7767   Lisp_Object attrs;
7768   Lisp_Object translation_table;
7769   int max_lookup;
7770   struct ccl_spec cclspec;
7771 
7772   USE_SAFE_ALLOCA;
7773 
7774   attrs = CODING_ID_ATTRS (coding->id);
7775   if (coding->encoder == encode_coding_raw_text)
7776     translation_table = Qnil, max_lookup = 0;
7777   else
7778     translation_table = get_translation_table (attrs, 1, &max_lookup);
7779 
7780   if (BUFFERP (coding->dst_object))
7781     {
7782       set_buffer_internal (XBUFFER (coding->dst_object));
7783       coding->dst_multibyte
7784 	= ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7785     }
7786 
7787   coding->consumed = coding->consumed_char = 0;
7788   coding->produced = coding->produced_char = 0;
7789   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7790 
7791   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7792 
7793   if (coding->encoder == encode_coding_ccl)
7794     {
7795       coding->spec.ccl = &cclspec;
7796       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7797     }
7798   do {
7799     coding_set_source (coding);
7800     consume_chars (coding, translation_table, max_lookup);
7801     coding_set_destination (coding);
7802     /* The CODING_MODE_LAST_BLOCK flag should be set only for the last
7803        iteration of the encoding.  */
7804     unsigned saved_mode = coding->mode;
7805     if (coding->consumed_char < coding->src_chars)
7806       coding->mode &= ~CODING_MODE_LAST_BLOCK;
7807     (*(coding->encoder)) (coding);
7808     coding->mode = saved_mode;
7809   } while (coding->consumed_char < coding->src_chars);
7810 
7811   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7812     insert_from_gap (coding->produced_char, coding->produced, 0);
7813 
7814   SAFE_FREE ();
7815 }
7816 
/* Code-conversion operations use internal buffers.  There's a single
   reusable buffer, which is created the first time it is needed, and
   then never killed.  When this reusable buffer is being used, the
   reused_workbuf_in_use flag is set.  If we need another conversion
   buffer while the reusable one is in use (e.g., if code-conversion
   is reentered when another code-conversion is in progress), we
   create temporary buffers using the name of the reusable buffer as
   the base name, see code_conversion_save below.  These temporary
   buffers are killed when the code-conversion operations that use
   them return, see code_conversion_restore below.  */

/* A string that serves as name of the reusable work buffer, and as base
   name of temporary work buffers used for code-conversion operations.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* The reusable working buffer, created once and never killed.  If it
   is found not to be live, code_conversion_save creates it anew.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   code_conversion_save when it hands out the reusable buffer, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7837 
7838 static void
code_conversion_restore(Lisp_Object arg)7839 code_conversion_restore (Lisp_Object arg)
7840 {
7841   Lisp_Object current, workbuf;
7842 
7843   current = XCAR (arg);
7844   workbuf = XCDR (arg);
7845   if (! NILP (workbuf))
7846     {
7847       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7848 	reused_workbuf_in_use = false;
7849       else
7850 	Fkill_buffer (workbuf);
7851     }
7852   set_buffer_internal (XBUFFER (current));
7853 }
7854 
7855 Lisp_Object
code_conversion_save(bool with_work_buf,bool multibyte)7856 code_conversion_save (bool with_work_buf, bool multibyte)
7857 {
7858   Lisp_Object workbuf = Qnil;
7859 
7860   if (with_work_buf)
7861     {
7862       if (reused_workbuf_in_use)
7863 	{
7864 	  Lisp_Object name
7865 	    = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7866 	  workbuf = Fget_buffer_create (name, Qt);
7867 	}
7868       else
7869 	{
7870 	  if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7871 	    Vcode_conversion_reused_workbuf
7872 	      = Fget_buffer_create (Vcode_conversion_workbuf_name, Qt);
7873 	  workbuf = Vcode_conversion_reused_workbuf;
7874 	}
7875     }
7876   record_unwind_protect (code_conversion_restore,
7877 			 Fcons (Fcurrent_buffer (), workbuf));
7878   if (!NILP (workbuf))
7879     {
7880       struct buffer *current = current_buffer;
7881       set_buffer_internal (XBUFFER (workbuf));
7882       /* We can't allow modification hooks to run in the work buffer.  For
7883 	 instance, directory_files_internal assumes that file decoding
7884 	 doesn't compile new regexps.  */
7885       Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7886       Ferase_buffer ();
7887       bset_undo_list (current_buffer, Qt);
7888       bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7889       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7890 	reused_workbuf_in_use = true;
7891       set_buffer_internal (current);
7892     }
7893 
7894   return workbuf;
7895 }
7896 
7897 static void
coding_restore_undo_list(Lisp_Object arg)7898 coding_restore_undo_list (Lisp_Object arg)
7899 {
7900   Lisp_Object undo_list = XCAR (arg);
7901   struct buffer *buf = XBUFFER (XCDR (arg));
7902 
7903   bset_undo_list (buf, undo_list);
7904 }
7905 
/* Decode the *last* BYTES of the gap and insert them at point.

   The current buffer is used as both the source and the destination
   of CODING: the source fields are set to cover BYTES bytes at
   gap-relative (negative) positions, and the destination position is
   set to point.  The gap must start at point (asserted below).

   If the bytes can be accepted without running a decoder (see the
   fast path below), they are inserted directly after an in-place
   end-of-line conversion.  Otherwise decode_coding is called, and the
   `post-read-conversion' function of the coding system, if any, is
   applied to the result.  In both cases CODING->produced and
   CODING->produced_char end up holding the size of the inserted
   text.  */
void
decode_coding_gap (struct coding_system *coding, ptrdiff_t bytes)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  Lisp_Object attrs;

  eassert (GPT_BYTE == PT_BYTE);

  /* The source is unibyte, so characters and bytes coincide.  */
  coding->src_object = Fcurrent_buffer ();
  coding->src_chars = bytes;
  coding->src_bytes = bytes;
  coding->src_pos = -bytes;
  coding->src_pos_byte = -bytes;
  coding->src_multibyte = false;
  coding->dst_object = coding->src_object;
  coding->dst_pos = PT;
  coding->dst_pos_byte = PT_BYTE;
  eassert (coding->dst_multibyte
           == !NILP (BVAR (current_buffer, enable_multibyte_characters)));

  /* -1 means "not determined yet"; detect_coding may fill these in.  */
  coding->head_ascii = -1;
  coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
  coding->eol_seen = EOL_SEEN_NONE;
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  /* Fast path: for an ASCII-compatible coding system without
     post-read function and without translation table, text that is
     pure ASCII (or, for a utf-8 coding system, fully detected as
     UTF-8) needs no decoding.  CHARS is left negative when the fast
     path does not apply.  */
  if (! disable_ascii_optimization
      && ! coding->src_multibyte
      && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
      && NILP (CODING_ATTR_POST_READ (attrs))
      && NILP (get_translation_table (attrs, 0, NULL)))
    {
      ptrdiff_t chars = coding->head_ascii;
      if (chars < 0)
	chars = check_ascii (coding);
      if (chars != bytes)
	{
	  /* There exists a non-ASCII byte.  */
	  if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
	      && coding->detected_utf8_bytes == coding->src_bytes)
	    {
	      if (coding->detected_utf8_chars >= 0)
		chars = coding->detected_utf8_chars;
	      else
		chars = check_utf_8 (coding);
	      /* Drop a leading byte order mark: three bytes, one
		 character.  */
	      if (CODING_UTF_8_BOM (coding) != utf_without_bom
		  && coding->head_ascii == 0
		  && coding->source[0] == UTF_8_BOM_1
		  && coding->source[1] == UTF_8_BOM_2
		  && coding->source[2] == UTF_8_BOM_3)
		{
		  chars--;
		  bytes -= 3;
		  coding->src_bytes -= 3;
		}
	    }
	  else
	    chars = -1;
	}
      if (chars >= 0)
	{
	  Lisp_Object eol_type;

	  eol_type = CODING_ID_EOL_TYPE (coding->id);
	  if (VECTORP (eol_type))
	    {
	      if (coding->eol_seen != EOL_SEEN_NONE)
		eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
	    }
	  if (EQ (eol_type, Qmac))
	    {
	      /* Replace every CR by LF in place.  */
	      unsigned char *src_end = GAP_END_ADDR;
	      unsigned char *src = src_end - coding->src_bytes;

	      while (src < src_end)
		{
		  if (*src++ == '\r')
		    src[-1] = '\n';
		}
	    }
	  else if (EQ (eol_type, Qdos))
	    {
	      /* Copy the text backward from the end of the gap,
		 skipping the CR of each CR LF pair, so that the
		 result still ends at the end of the gap.  */
	      unsigned char *src = GAP_END_ADDR;
	      unsigned char *src_beg = src - coding->src_bytes;
	      unsigned char *dst = src;
	      ptrdiff_t diff;

	      while (src_beg < src)
		{
		  *--dst = *--src;
		  if (*src == '\n' && src > src_beg && src[-1] == '\r')
		    src--;
		}
	      /* DIFF is the number of CRs dropped above.  */
	      diff = dst - src;
	      bytes -= diff;
	      chars -= diff;
	    }
	  coding->produced = bytes;
	  coding->produced_char = chars;
	  insert_from_gap (chars, bytes, 1);
	  /* Nothing has been pushed on the specpdl yet, so there is
	     nothing to unbind.  */
	  return;
	}
    }
  /* General case: run the decoder.  This registers an unwind form
     that makes the current buffer current again.  */
  code_conversion_save (0, 0);

  coding->mode |= CODING_MODE_LAST_BLOCK;
  current_buffer->text->inhibit_shrinking = 1;
  decode_coding (coding);
  current_buffer->text->inhibit_shrinking = 0;

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;
      Lisp_Object undo_list = BVAR (current_buffer, undo_list);

      /* Call the post-read function at the start of the decoded text
	 with undo recording turned off; the undo list is put back on
	 unwinding.  */
      record_unwind_protect (coding_restore_undo_list,
			     Fcons (undo_list, Fcurrent_buffer ()));
      bset_undo_list (current_buffer, Qt);
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = call1 (CODING_ATTR_POST_READ (attrs),
		   make_fixnum (coding->produced_char));
      CHECK_FIXNAT (val);
      /* Account for whatever the function inserted or deleted, as
	 measured by the change of the buffer size.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  unbind_to (count, Qnil);
}
8036 
8037 
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   The value of `deactivate-mark' is preserved across the call.
 */

void
decode_coding_object (struct coding_system *coding,
		      Lisp_Object src_object,
		      ptrdiff_t from, ptrdiff_t from_byte,
		      ptrdiff_t to, ptrdiff_t to_byte,
		      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  unsigned char *destination UNINIT;
  ptrdiff_t dst_bytes UNINIT;
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless source and destination are the
     same buffer, in which case point has to be recovered at the
     end.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the caller's memory area; it is only used in the
     DST_OBJECT == Qnil case.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
	move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
	{
	  struct Lisp_Marker *tail;

	  /* Flag the markers sitting exactly at the edge of the
	     region that matters for their insertion type; they are
	     repositioned explicitly once the decoded text is in
	     place.  */
	  for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
	    {
	      tail->need_adjustment
		= tail->charpos == (tail->insertion_type ? from : to);
	      need_marker_adjustment |= tail->need_adjustment;
	    }
	  saved_pt = PT, saved_pt_byte = PT_BYTE;
	  TEMP_SET_PT_BOTH (from, from_byte);
	  current_buffer->text->inhibit_shrinking = 1;
	  /* Delete the source text; from here on it is addressed by
	     gap-relative (negative) positions.  */
	  del_range_both (from, from_byte, to, to_byte, 1);
	  coding->src_pos = -chars;
	  coding->src_pos_byte = -bytes;
	}
      else
	{
	  coding->src_pos = from;
	  coding->src_pos_byte = from_byte;
	}
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Choose the destination.  A work buffer is needed to build a
     string, and also when a post-read function has to run although
     the caller asked for the result in memory.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
	  && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
	= ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
	 won't use `binary' or `raw-text' anyway, so let's not worry about
	 CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;
      Lisp_Object undo_list = BVAR (current_buffer, undo_list);
      ptrdiff_t count1 = SPECPDL_INDEX ();

      /* Run the post-read function at the start of the decoded text
	 with undo recording turned off; unbinding to COUNT1 puts the
	 undo list back.  */
      record_unwind_protect (coding_restore_undo_list,
			     Fcons (undo_list, Fcurrent_buffer ()));
      bset_undo_list (current_buffer, Qt);
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
			make_fixnum (coding->produced_char));
      CHECK_FIXNAT (val);
      /* Account for whatever the function inserted or deleted, as
	 measured by the change of the buffer size.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
      unbind_to (count1, Qnil);
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a work buffer because of the
	 post-read function; hand it over in CODING->destination.
	 NOTE(review): the copy below happens only when the caller's
	 area is too small (dst_bytes < produced) -- confirm that
	 nothing needs to be copied otherwise.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (dst_bytes < coding->produced)
	{
	  eassert (coding->produced > 0);
	  destination = xrealloc (destination, coding->produced);
	  /* Make the text to copy contiguous in memory.  */
	  if (BEGV < GPT && GPT < BEGV + coding->produced_char)
	    move_gap_both (BEGV, BEGV_BYTE);
	  memcpy (destination, BEGV_ADDR, coding->produced);
	  coding->destination = destination;
	}
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
	 (BUFFERP (src_object) && EQ (src_object, dst_object))
	 As we have moved PT while replacing the original buffer
	 contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the region is unaffected; point inside it goes
	 to the region start; point after it is shifted by the change
	 in size (counted in bytes for a unibyte buffer).  */
      if (saved_pt < from)
	TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
	TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
	TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
			  saved_pt_byte + (coding->produced - bytes));
      else
	TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
			  saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
	{
	  struct Lisp_Marker *tail;

	  /* Put the markers flagged above at the start of the decoded
	     text (insertion-type markers) or at its end (the
	     others).  */
	  for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
	    if (tail->need_adjustment)
	      {
		tail->need_adjustment = 0;
		if (tail->insertion_type)
		  {
		    tail->bytepos = from_byte;
		    tail->charpos = from;
		  }
		else
		  {
		    tail->bytepos = from_byte + coding->produced;
		    tail->charpos
		      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
			 ? tail->bytepos : from + coding->produced_char);
		  }
	      }
	}
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
8251 
8252 
8253 /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8254    SRC_OBJECT into DST_OBJECT by coding context CODING.
8255 
8256    SRC_OBJECT is a buffer, a string, or Qnil.
8257 
8258    If it is a buffer, the text is at point of the buffer.  FROM and TO
8259    are positions in the buffer.
8260 
8261    If it is a string, the text is at the beginning of the string.
8262    FROM and TO are indices into the string.
8263 
8264    If it is nil, the text is at coding->source.  FROM and TO are
8265    indices into coding->source.
8266 
8267    DST_OBJECT is a buffer, Qt, or Qnil.
8268 
8269    If it is a buffer, the encoded text is inserted at point of the
8270    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8271    is replaced with the encoded text.
8272 
8273    If it is Qt, a string is made from the encoded text, and set in
8274    CODING->dst_object.  However, if CODING->raw_destination is non-zero,
8275    the encoded text is instead returned in CODING->destination as a C string,
8276    and the caller is responsible for freeing CODING->destination.  This
8277    feature is meant to be used when the caller doesn't need the result as
8278    a Lisp string, and wants to avoid unnecessary consing of large strings.
8279 
8280    If it is Qnil, the encoded text is stored at CODING->destination.
8281    The caller must allocate CODING->dst_bytes bytes at
8282    CODING->destination by xmalloc.  If the encoded text is longer than
8283    CODING->dst_bytes, CODING->destination is reallocated by xrealloc
8284    (and CODING->dst_bytes is enlarged accordingly).  */
8285 
void
encode_coding_object (struct coding_system *coding,
		      Lisp_Object src_object,
		      ptrdiff_t from, ptrdiff_t from_byte,
		      ptrdiff_t to, ptrdiff_t to_byte,
		      Lisp_Object dst_object)
{
  /* Specpdl depth on entry; it is handed to unbind_to just before
     returning.  */
  ptrdiff_t count = SPECPDL_INDEX ();
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless source and destination are the
     same buffer; SAVED_PT_BYTE is only read when SAVED_PT >= 0.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte;
  bool need_marker_adjustment = 0;
  bool kill_src_buffer = 0;
  Lisp_Object old_deactivate_mark;

  /* Vdeactivate_mark is put back to this value at the end.  */
  old_deactivate_mark = Vdeactivate_mark;

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* More bytes than characters means the source contains multibyte
     sequences.  */
  coding->src_multibyte = chars < bytes;

  attrs = CODING_ID_ATTRS (coding->id);

  bool same_buffer = false;
  if (EQ (src_object, dst_object) && BUFFERP (src_object))
    {
      struct Lisp_Marker *tail;

      same_buffer = true;

      /* Flag the markers that sit exactly on a boundary of the region
	 (FROM for insertion-type markers, TO for the others).  Their
	 positions are rewritten at the end of this function, after
	 the region has been replaced by the encoded text.  */
      for (tail = BUF_MARKERS (XBUFFER (src_object)); tail; tail = tail->next)
	{
	  tail->need_adjustment
	    = tail->charpos == (tail->insertion_type ? from : to);
	  need_marker_adjustment |= tail->need_adjustment;
	}
    }

  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    {
      /* The coding system has a pre-write function.  Copy the source
	 text into the buffer returned by code_conversion_save, call
	 the function on that whole buffer, and then encode from
	 whichever buffer is current afterwards.  */
      coding->src_object = code_conversion_save (1, coding->src_multibyte);
      set_buffer_internal (XBUFFER (coding->src_object));
      if (STRINGP (src_object))
	insert_from_string (src_object, from, from_byte, chars, bytes, 0);
      else if (BUFFERP (src_object))
	insert_from_buffer (XBUFFER (src_object), from, chars, 0);
      else
	insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);

      if (same_buffer)
	{
	  /* The source text has been copied, so remove it from the
	     original buffer (remembering point first) and switch back
	     to the copy.  */
	  set_buffer_internal (XBUFFER (src_object));
	  saved_pt = PT, saved_pt_byte = PT_BYTE;
	  del_range_both (from, from_byte, to, to_byte, 1);
	  set_buffer_internal (XBUFFER (coding->src_object));
	}

      safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
		  make_fixnum (BEG), make_fixnum (Z));
      /* If the pre-write function left a different buffer current,
	 that buffer becomes the source and is killed at the end.  */
      if (XBUFFER (coding->src_object) != current_buffer)
	kill_src_buffer = 1;
      coding->src_object = Fcurrent_buffer ();
      /* Put the gap at the beginning of the buffer (presumably so
	 that the text to encode is contiguous).  */
      if (BEG != GPT)
	move_gap_both (BEG, BEG_BYTE);
      /* The source is now the entire current buffer.  */
      coding->src_chars = Z - BEG;
      coding->src_bytes = Z_BYTE - BEG_BYTE;
      coding->src_pos = BEG;
      coding->src_pos_byte = BEG_BYTE;
      coding->src_multibyte = Z < Z_BYTE;
    }
  else if (STRINGP (src_object))
    {
      /* NOTE(review): called with (0, 0) here, versus a first
	 argument of 1 in the pre-write case above, where the return
	 value is used as a buffer; presumably no work buffer is
	 created here -- confirm against code_conversion_save.  */
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      code_conversion_save (0, 0);
      set_buffer_internal (XBUFFER (src_object));
      if (same_buffer)
	{
	  /* Encoding in place: remember point, delete the region, and
	     use the value returned by del_range_1 as the source,
	     starting at offset 0.  */
	  saved_pt = PT, saved_pt_byte = PT_BYTE;
	  coding->src_object = del_range_1 (from, to, 1, 1);
	  coding->src_pos = 0;
	  coding->src_pos_byte = 0;
	}
      else
	{
	  /* If the region contains the gap position, move the gap to
	     FROM.  */
	  if (from < GPT && to >= GPT)
	    move_gap_both (from, from_byte);
	  coding->src_pos = from;
	  coding->src_pos_byte = from_byte;
	}
    }
  else
    {
      /* SRC_OBJECT is neither a string nor a buffer; the pre-write
	 branch above reads such a source from CODING->source.  */
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }

  if (BUFFERP (dst_object))
    {
      coding->dst_object = dst_object;
      if (EQ (src_object, dst_object))
	{
	  /* Replacing in place: the encoded text goes where the
	     source region started.  */
	  coding->dst_pos = from;
	  coding->dst_pos_byte = from_byte;
	}
      else
	{
	  struct buffer *current = current_buffer;

	  /* Temporarily switch to the destination buffer to read its
	     point and to move its gap there.  */
	  set_buffer_temp (XBUFFER (dst_object));
	  coding->dst_pos = PT;
	  coding->dst_pos_byte = PT_BYTE;
	  move_gap_both (coding->dst_pos, coding->dst_pos_byte);
	  set_buffer_temp (current);
	}
      coding->dst_multibyte
	= ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else if (EQ (dst_object, Qt))
    {
      /* Allocate an initial output area of one byte per source
	 character, and at least one byte.  */
      ptrdiff_t dst_bytes = max (1, coding->src_chars);
      coding->dst_object = Qnil;
      coding->destination = xmalloc (dst_bytes);
      coding->dst_bytes = dst_bytes;
      coding->dst_multibyte = 0;
    }
  else
    {
      /* CODING->destination and CODING->dst_bytes are left as the
	 caller set them.  */
      coding->dst_object = Qnil;
      coding->dst_multibyte = 0;
    }

  encode_coding (coding);

  if (EQ (dst_object, Qt))
    {
      /* NOTE(review): CODING->dst_object was set to Qnil above for
	 this case, so the first branch is taken only if encode_coding
	 replaced it with a buffer -- confirm.  */
      if (BUFFERP (coding->dst_object))
	coding->dst_object = Fbuffer_string ();
      else if (coding->raw_destination)
	/* This is used to avoid creating huge Lisp string.
	   NOTE: caller who sets `raw_destination' is also
	   responsible for freeing `destination' buffer.  */
	coding->dst_object = Qnil;
      else
	{
	  /* Wrap the produced bytes in a unibyte Lisp string and
	     release the C buffer.  */
	  coding->dst_object
	    = make_unibyte_string ((char *) coding->destination,
				   coding->produced);
	  xfree (coding->destination);
	}
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
	 (BUFFERP (src_object) && EQ (src_object, dst_object))
	 As we have moved PT while replacing the original buffer
	 contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      if (saved_pt < from)
	/* Point was before the region: keep it where it was.  */
	TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
	/* Point was inside the region: put it at the region start.  */
	TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
	/* Point was at or after the region end: shift it by the
	   change in size of the replaced text.  */
	TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
			  saved_pt_byte + (coding->produced - bytes));
      else
	/* Same, but in a unibyte buffer the byte delta is used for
	   both the character and the byte position.  */
	TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
			  saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
	{
	  struct Lisp_Marker *tail;

	  /* Reposition the markers flagged at the top: insertion-type
	     markers go to the start of the encoded text, the others
	     to its end.  */
	  for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
	    if (tail->need_adjustment)
	      {
		tail->need_adjustment = 0;
		if (tail->insertion_type)
		  {
		    tail->bytepos = from_byte;
		    tail->charpos = from;
		  }
		else
		  {
		    tail->bytepos = from_byte + coding->produced;
		    tail->charpos
		      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
			 ? tail->bytepos : from + coding->produced_char);
		  }
	      }
	}
    }

  if (kill_src_buffer)
    Fkill_buffer (coding->src_object);

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, Qnil);
}
8493 
8494 
8495 Lisp_Object
preferred_coding_system(void)8496 preferred_coding_system (void)
8497 {
8498   int id = coding_categories[coding_priorities[0]].id;
8499 
8500   return CODING_ID_NAME (id);
8501 }
8502 
8503 #if defined (WINDOWSNT) || defined (CYGWIN)
8504 
8505 Lisp_Object
from_unicode(Lisp_Object str)8506 from_unicode (Lisp_Object str)
8507 {
8508   CHECK_STRING (str);
8509   if (!STRING_MULTIBYTE (str) &&
8510       SBYTES (str) & 1)
8511     {
8512       str = Fsubstring (str, make_fixnum (0), make_fixnum (-1));
8513     }
8514 
8515   return code_convert_string_norecord (str, Qutf_16le, 0);
8516 }
8517 
8518 Lisp_Object
from_unicode_buffer(const wchar_t * wstr)8519 from_unicode_buffer (const wchar_t *wstr)
8520 {
8521   /* We get one of the two final null bytes for free.  */
8522   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8523   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8524   return from_unicode (str);
8525 }
8526 
8527 wchar_t *
to_unicode(Lisp_Object str,Lisp_Object * buf)8528 to_unicode (Lisp_Object str, Lisp_Object *buf)
8529 {
8530   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8531   /* We need to make another copy (in addition to the one made by
8532      code_convert_string_norecord) to ensure that the final string is
8533      _doubly_ zero terminated --- that is, that the string is
8534      terminated by two zero bytes and one utf-16le null character.
8535      Because strings are already terminated with a single zero byte,
8536      we just add one additional zero. */
8537   str = make_uninit_string (SBYTES (*buf) + 1);
8538   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8539   SDATA (str) [SBYTES (*buf)] = '\0';
8540   *buf = str;
8541   return WCSDATA (*buf);
8542 }
8543 
8544 #endif /* WINDOWSNT || CYGWIN */
8545 
8546 
/*** 11. Emacs Lisp library functions ***/
8548 
8549 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8550        doc: /* Return t if OBJECT is nil or a coding-system.
8551 See the documentation of `define-coding-system' for information
8552 about coding-system objects.  */)
8553   (Lisp_Object object)
8554 {
8555   if (NILP (object)
8556       || CODING_SYSTEM_ID (object) >= 0)
8557     return Qt;
8558   if (! SYMBOLP (object)
8559       || NILP (Fget (object, Qcoding_system_define_form)))
8560     return Qnil;
8561   return Qt;
8562 }
8563 
8564 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8565        Sread_non_nil_coding_system, 1, 1, 0,
8566        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8567   (Lisp_Object prompt)
8568 {
8569   Lisp_Object val;
8570   do
8571     {
8572       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8573 			      Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8574     }
8575   while (SCHARS (val) == 0);
8576   return (Fintern (val, Qnil));
8577 }
8578 
8579 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8580        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8581 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8582 Ignores case when completing coding systems (all Emacs coding systems
8583 are lower-case).  */)
8584   (Lisp_Object prompt, Lisp_Object default_coding_system)
8585 {
8586   Lisp_Object val;
8587   ptrdiff_t count = SPECPDL_INDEX ();
8588 
8589   if (SYMBOLP (default_coding_system))
8590     default_coding_system = SYMBOL_NAME (default_coding_system);
8591   specbind (Qcompletion_ignore_case, Qt);
8592   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8593 			  Qt, Qnil, Qcoding_system_history,
8594 			  default_coding_system, Qnil);
8595   val = unbind_to (count, val);
8596   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8597 }
8598 
8599 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8600        1, 1, 0,
8601        doc: /* Check validity of CODING-SYSTEM.
8602 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8603 It is valid if it is nil or a symbol defined as a coding system by the
8604 function `define-coding-system'.  */)
8605   (Lisp_Object coding_system)
8606 {
8607   Lisp_Object define_form;
8608 
8609   define_form = Fget (coding_system, Qcoding_system_define_form);
8610   if (! NILP (define_form))
8611     {
8612       Fput (coding_system, Qcoding_system_define_form, Qnil);
8613       safe_eval (define_form);
8614     }
8615   if (!NILP (Fcoding_system_p (coding_system)))
8616     return coding_system;
8617   xsignal1 (Qcoding_system_error, coding_system);
8618 }
8619 
8620 
8621 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8622    HIGHEST, return the coding system of the highest
8623    priority among the detected coding systems.  Otherwise return a
8624    list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
8627    Otherwise, the bytes are raw bytes.
8628 
8629    CODING-SYSTEM controls the detection as below:
8630 
8631    If it is nil, detect both text-format and eol-format.  If the
8632    text-format part of CODING-SYSTEM is already specified
8633    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8634    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8635    detect only text-format.  */
8636 
Lisp_Object
detect_coding_system (const unsigned char *src,
		      ptrdiff_t src_chars, ptrdiff_t src_bytes,
		      bool highest, bool multibytep,
		      Lisp_Object coding_system)
{
  /* Address just past the last byte to examine.  */
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* Result under construction: first a list of coding system ids,
     converted in place to coding system names by the eol-format pass
     at the end.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING_SYSTEM is handled as `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the bytes to examine; the detectors called below take
     their input from CODING.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category UNINIT;
      struct coding_system *this UNINIT;
      int c, i;
      bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
				       inhibit_null_byte_detection);
      bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
				       inhibit_iso_escape_detection);
      bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
	 While doing so, count in coding.head_ascii the bytes that
	 precede the first 8-bit byte, and stop early once both a
	 null byte and an 8-bit byte have been seen.  */
      for (; src < src_end; src++)
	{
	  c = *src;
	  if (c & 0x80)
	    {
	      eight_bit_found = 1;
	      if (null_byte_found)
		break;
	    }
	  else if (c < 0x20)
	    {
	      /* The ISO-2022 detector is tried only while no category
		 has been checked yet and unless ISO escape detection
		 is inhibited.  */
	      if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
		  && ! inhibit_ied
		  && ! detect_info.checked)
		{
		  if (detect_coding_iso_2022 (&coding, &detect_info))
		    {
		      /* We have scanned the whole data.  */
		      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
			{
			  /* We didn't find an 8-bit code.  We may
			     have found a null-byte, but it's very
			     rare that a binary file conform to
			     ISO-2022.  */
			  src = src_end;
			  coding.head_ascii = src - coding.source;
			}
		      /* Reject every category outside the ISO-escape
			 mask.  */
		      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
		      break;
		    }
		}
	      else if (! c && !inhibit_nbd)
		{
		  null_byte_found = 1;
		  if (eight_bit_found)
		    break;
		}
	      if (! eight_bit_found)
		coding.head_ascii++;
	    }
	  else if (! eight_bit_found)
	    coding.head_ascii++;
	}

      if (null_byte_found || eight_bit_found
	  || coding.head_ascii < coding.src_bytes
	  || detect_info.found)
	{
	  if (coding.head_ascii == coding.src_bytes)
	    /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
	    for (i = 0; i < coding_category_raw_text; i++)
	      {
		/* Stop at the first category, in priority order,
		   that has been found.  */
		category = coding_priorities[i];
		this = coding_categories + category;
		if (detect_info.found & (1 << category))
		  break;
	      }
	  else
	    {
	      if (null_byte_found)
		{
		  /* With a null byte present, only the UTF-16
		     categories remain unchecked and unrejected.  */
		  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
		  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
		}
	      else if (prefer_utf_8
		       && detect_coding_utf_8 (&coding, &detect_info))
		{
		  /* UTF-8 is preferred and its detector succeeded:
		     mark every other category as checked and
		     rejected.  */
		  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
		  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
		}
	      /* Walk the categories in priority order, running the
		 detector of each one that has not been checked yet.
		 With HIGHEST, stop at the first category found.  */
	      for (i = 0; i < coding_category_raw_text; i++)
		{
		  category = coding_priorities[i];
		  this = coding_categories + category;

		  if (this->id < 0)
		    {
		      /* No coding system of this category is defined.  */
		      detect_info.rejected |= (1 << category);
		    }
		  else if (category >= coding_category_raw_text)
		    continue;
		  else if (detect_info.checked & (1 << category))
		    {
		      if (highest
			  && (detect_info.found & (1 << category)))
			break;
		    }
		  else if ((*(this->detector)) (&coding, &detect_info)
			   && highest
			   && (detect_info.found & (1 << category)))
		    {
		      /* Narrow the automatic UTF-16 category down to
			 the endianness recorded in the found mask.  */
		      if (category == coding_category_utf_16_auto)
			{
			  if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
			    category = coding_category_utf_16_le;
			  else
			    category = coding_category_utf_16_be;
			}
		      break;
		    }
		}
	    }
	}

      /* Turn the detection masks into a list of coding system ids.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
	  || null_byte_found)
	{
	  /* Every category was rejected, or a null byte was seen:
	     answer `no-conversion'.  */
	  detect_info.found = CATEGORY_MASK_RAW_TEXT;
	  id = CODING_SYSTEM_ID (Qno_conversion);
	  val = list1i (id);
	}
      else if (! detect_info.rejected && ! detect_info.found)
	{
	  /* Nothing was rejected and nothing was found: answer with
	     the coding system of the undecided category.  */
	  detect_info.found = CATEGORY_MASK_ANY;
	  id = coding_categories[coding_category_undecided].id;
	  val = list1i (id);
	}
      else if (highest)
	{
	  if (detect_info.found)
	    {
	      /* CATEGORY and THIS are as the loops above left
		 them.  */
	      detect_info.found = 1 << category;
	      val = list1i (this->id);
	    }
	  else
	    /* Nothing was positively found: take the first category,
	       in priority order, that was not rejected.  */
	    for (i = 0; i < coding_category_raw_text; i++)
	      if (! (detect_info.rejected & (1 << coding_priorities[i])))
		{
		  detect_info.found = 1 << coding_priorities[i];
		  id = coding_categories[coding_priorities[i]].id;
		  val = list1i (id);
		  break;
		}
	}
      else
	{
	  int mask = detect_info.rejected | detect_info.found;
	  int found = 0;

	  /* Categories that were neither rejected nor found, visited
	     from lowest to highest priority.
	     NOTE(review): each match replaces VAL with a fresh
	     one-element list instead of consing onto it, so only the
	     id of the last match that has one survives this loop --
	     confirm that this is intended.  */
	  for (i = coding_category_raw_text - 1; i >= 0; i--)
	    {
	      category = coding_priorities[i];
	      if (! (mask & (1 << category)))
		{
		  found |= 1 << category;
		  id = coding_categories[category].id;
		  if (id >= 0)
		    val = list1i (id);
		}
	    }
	  /* Push the found categories in front; iterating from lowest
	     to highest priority leaves the highest one first.  */
	  for (i = coding_category_raw_text - 1; i >= 0; i--)
	    {
	      category = coding_priorities[i];
	      if (detect_info.found & (1 << category))
		{
		  id = coding_categories[category].id;
		  val = Fcons (make_fixnum (id), val);
		}
	    }
	  detect_info.found |= found;
	}
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only the signature variant of UTF-8 has to be decided.  VAL
	 stays nil if the UTF-8 detector fails.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
	{
	  struct coding_system *this;

	  if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
	    this = coding_categories + coding_category_utf_8_sig;
	  else
	    this = coding_categories + coding_category_utf_8_nosig;
	  val = list1i (this->id);
	}
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only the UTF-16 variant has to be decided.  VAL stays nil if
	 the UTF-16 detector fails.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
	{
	  struct coding_system *this;

	  if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
	    this = coding_categories + coding_category_utf_16_le;
	  else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
	    this = coding_categories + coding_category_utf_16_be;
	  else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
	    this = coding_categories + coding_category_utf_16_be_nosig;
	  else
	    this = coding_categories + coding_category_utf_16_le_nosig;
	  val = list1i (this->id);
	}
    }
  else
    {
      /* The text-format is already specified; report the given
	 coding system itself.  */
      detect_info.found = 1 << XFIXNUM (CODING_ATTR_CATEGORY (attrs));
      val = list1i (coding.id);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that no eol format is known for that family.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    /* A vector EOL_TYPE is the case where the eol format has to be
       detected from the bytes; its elements 0, 1 and 2 are used below
       for the LF, CRLF and CR results respectively.  */
    if (VECTORP (eol_type))
      {
	if (detect_info.found & ~CATEGORY_MASK_UTF_16)
	  {
	    if (null_byte_found)
	      normal_eol = EOL_SEEN_LF;
	    else
	      normal_eol = detect_eol (coding.source, src_bytes,
				       coding_category_raw_text);
	  }
	if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
				 | CATEGORY_MASK_UTF_16_BE_NOSIG))
	  utf_16_be_eol = detect_eol (coding.source, src_bytes,
				      coding_category_utf_16_be);
	if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
				 | CATEGORY_MASK_UTF_16_LE_NOSIG))
	  utf_16_le_eol = detect_eol (coding.source, src_bytes,
				      coding_category_utf_16_le);
      }
    else
      {
	/* The eol format is fixed by the coding system that was
	   given.  */
	if (EQ (eol_type, Qunix))
	  normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
	else if (EQ (eol_type, Qdos))
	  normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
	else
	  normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding system name, choosing the
       eol-specific subsidiary when the eol format is known.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
	enum coding_category category;
	int this_eol;

	id = XFIXNUM (XCAR (tail));
	attrs = CODING_ID_ATTRS (id);
	category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
	eol_type = CODING_ID_EOL_TYPE (id);
	if (VECTORP (eol_type))
	  {
	    if (category == coding_category_utf_16_be
		|| category == coding_category_utf_16_be_nosig)
	      this_eol = utf_16_be_eol;
	    else if (category == coding_category_utf_16_le
		     || category == coding_category_utf_16_le_nosig)
	      this_eol = utf_16_le_eol;
	    else
	      this_eol = normal_eol;

	    if (this_eol == EOL_SEEN_LF)
	      XSETCAR (tail, AREF (eol_type, 0));
	    else if (this_eol == EOL_SEEN_CRLF)
	      XSETCAR (tail, AREF (eol_type, 1));
	    else if (this_eol == EOL_SEEN_CR)
	      XSETCAR (tail, AREF (eol_type, 2));
	    else
	      XSETCAR (tail, CODING_ID_NAME (id));
	  }
	else
	  XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, return only the first element (or nil if the list
     is empty); otherwise return the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8952 
8953 
8954 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8955        2, 3, 0,
8956        doc: /* Detect coding system of the text in the region between START and END.
8957 Return a list of possible coding systems ordered by priority.
8958 The coding systems to try and their priorities follows what
8959 the function `coding-system-priority-list' (which see) returns.
8960 
8961 If only ASCII characters are found (except for such ISO-2022 control
8962 characters as ESC), it returns a list of single element `undecided'
8963 or its subsidiary coding system according to a detected end-of-line
8964 format.
8965 
8966 If optional argument HIGHEST is non-nil, return the coding system of
8967 highest priority.  */)
8968   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8969 {
8970   ptrdiff_t from, to;
8971   ptrdiff_t from_byte, to_byte;
8972 
8973   validate_region (&start, &end);
8974   from = XFIXNUM (start), to = XFIXNUM (end);
8975   from_byte = CHAR_TO_BYTE (from);
8976   to_byte = CHAR_TO_BYTE (to);
8977 
8978   if (from < GPT && to >= GPT)
8979     move_gap_both (to, to_byte);
8980 
8981   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8982 			       to - from, to_byte - from_byte,
8983 			       !NILP (highest),
8984 			       !NILP (BVAR (current_buffer
8985 				      , enable_multibyte_characters)),
8986 			       Qnil);
8987 }
8988 
8989 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8990        1, 2, 0,
8991        doc: /* Detect coding system of the text in STRING.
8992 Return a list of possible coding systems ordered by priority.
8993 The coding systems to try and their priorities follows what
8994 the function `coding-system-priority-list' (which see) returns.
8995 
8996 If only ASCII characters are found (except for such ISO-2022 control
8997 characters as ESC), it returns a list of single element `undecided'
8998 or its subsidiary coding system according to a detected end-of-line
8999 format.
9000 
9001 If optional argument HIGHEST is non-nil, return the coding system of
9002 highest priority.  */)
9003   (Lisp_Object string, Lisp_Object highest)
9004 {
9005   CHECK_STRING (string);
9006 
9007   return detect_coding_system (SDATA (string),
9008 			       SCHARS (string), SBYTES (string),
9009 			       !NILP (highest), STRING_MULTIBYTE (string),
9010 			       Qnil);
9011 }
9012 
9013 
9014 static bool
char_encodable_p(int c,Lisp_Object attrs)9015 char_encodable_p (int c, Lisp_Object attrs)
9016 {
9017   Lisp_Object tail;
9018   struct charset *charset;
9019   Lisp_Object translation_table;
9020 
9021   translation_table = CODING_ATTR_TRANS_TBL (attrs);
9022   if (! NILP (translation_table))
9023     c = translate_char (translation_table, c);
9024   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
9025        CONSP (tail); tail = XCDR (tail))
9026     {
9027       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (tail)));
9028       if (CHAR_CHARSET_P (c, charset))
9029 	break;
9030     }
9031   return (! NILP (tail));
9032 }
9033 
9034 
9035 /* Return a list of coding systems that safely encode the text between
9036    START and END.  If EXCLUDE is non-nil, it is a list of coding
9037    systems not to check.  The returned list doesn't contain any such
9038    coding systems.  In any case, if the text contains only ASCII or is
9039    unibyte, return t.  */
9040 
9041 DEFUN ("find-coding-systems-region-internal",
9042        Ffind_coding_systems_region_internal,
9043        Sfind_coding_systems_region_internal, 2, 3, 0,
9044        doc: /* Internal use only.  */)
9045   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
9046 {
9047   Lisp_Object coding_attrs_list, safe_codings;
9048   ptrdiff_t start_byte, end_byte;
9049   const unsigned char *p, *pbeg, *pend;
9050   int c;
9051   Lisp_Object tail, elt, work_table;
9052 
9053   if (STRINGP (start))
9054     {
9055       if (!STRING_MULTIBYTE (start)
9056 	  || SCHARS (start) == SBYTES (start))
9057 	return Qt;
9058       start_byte = 0;
9059       end_byte = SBYTES (start);
9060     }
9061   else
9062     {
9063       EMACS_INT s = fix_position (start);
9064       EMACS_INT e = fix_position (end);
9065       if (! (BEG <= s && s <= e && e <= Z))
9066 	args_out_of_range (start, end);
9067       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9068 	return Qt;
9069       start_byte = CHAR_TO_BYTE (s);
9070       end_byte = CHAR_TO_BYTE (e);
9071       if (e - s == end_byte - start_byte)
9072 	return Qt;
9073 
9074       if (s < GPT && GPT < e)
9075 	{
9076 	  if (GPT - s < e - GPT)
9077 	    move_gap_both (s, start_byte);
9078 	  else
9079 	    move_gap_both (e, end_byte);
9080 	}
9081     }
9082 
9083   coding_attrs_list = Qnil;
9084   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9085     if (NILP (exclude)
9086 	|| NILP (Fmemq (XCAR (tail), exclude)))
9087       {
9088 	Lisp_Object attrs;
9089 
9090 	attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9091 	if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9092 	  {
9093 	    ASET (attrs, coding_attr_trans_tbl,
9094 		  get_translation_table (attrs, 1, NULL));
9095 	    coding_attrs_list = Fcons (attrs, coding_attrs_list);
9096 	  }
9097       }
9098 
9099   if (STRINGP (start))
9100     p = pbeg = SDATA (start);
9101   else
9102     p = pbeg = BYTE_POS_ADDR (start_byte);
9103   pend = p + (end_byte - start_byte);
9104 
9105   while (p < pend && ASCII_CHAR_P (*p)) p++;
9106   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9107 
9108   work_table = Fmake_char_table (Qnil, Qnil);
9109   while (p < pend)
9110     {
9111       if (ASCII_CHAR_P (*p))
9112 	p++;
9113       else
9114 	{
9115 	  c = string_char_advance (&p);
9116 	  if (!NILP (char_table_ref (work_table, c)))
9117 	    /* This character was already checked.  Ignore it.  */
9118 	    continue;
9119 
9120 	  charset_map_loaded = 0;
9121 	  for (tail = coding_attrs_list; CONSP (tail);)
9122 	    {
9123 	      elt = XCAR (tail);
9124 	      if (NILP (elt))
9125 		tail = XCDR (tail);
9126 	      else if (char_encodable_p (c, elt))
9127 		tail = XCDR (tail);
9128 	      else if (CONSP (XCDR (tail)))
9129 		{
9130 		  XSETCAR (tail, XCAR (XCDR (tail)));
9131 		  XSETCDR (tail, XCDR (XCDR (tail)));
9132 		}
9133 	      else
9134 		{
9135 		  XSETCAR (tail, Qnil);
9136 		  tail = XCDR (tail);
9137 		}
9138 	    }
9139 	  if (charset_map_loaded)
9140 	    {
9141 	      ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9142 
9143 	      if (STRINGP (start))
9144 		pbeg = SDATA (start);
9145 	      else
9146 		pbeg = BYTE_POS_ADDR (start_byte);
9147 	      p = pbeg + p_offset;
9148 	      pend = pbeg + pend_offset;
9149 	    }
9150 	  char_table_set (work_table, c, Qt);
9151 	}
9152     }
9153 
9154   safe_codings = list2 (Qraw_text, Qno_conversion);
9155   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9156     if (! NILP (XCAR (tail)))
9157       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9158 
9159   return safe_codings;
9160 }
9161 
9162 
9163 DEFUN ("unencodable-char-position", Funencodable_char_position,
9164        Sunencodable_char_position, 3, 5, 0,
9165        doc: /* Return position of first un-encodable character in a region.
9166 START and END specify the region and CODING-SYSTEM specifies the
9167 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9168 
9169 If optional 4th argument COUNT is non-nil, it specifies at most how
9170 many un-encodable characters to search.  In this case, the value is a
9171 list of positions.
9172 
9173 If optional 5th argument STRING is non-nil, it is a string to search
9174 for un-encodable characters.  In that case, START and END are indexes
9175 to the string and treated as in `substring'.  */)
9176   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9177    Lisp_Object count, Lisp_Object string)
9178 {
9179   EMACS_INT n;
9180   struct coding_system coding;
9181   Lisp_Object attrs, charset_list, translation_table;
9182   Lisp_Object positions;
9183   ptrdiff_t from, to;
9184   const unsigned char *p, *stop, *pend;
9185   bool ascii_compatible;
9186 
9187   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9188   attrs = CODING_ID_ATTRS (coding.id);
9189   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9190     return Qnil;
9191   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9192   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9193   translation_table = get_translation_table (attrs, 1, NULL);
9194 
9195   if (NILP (string))
9196     {
9197       validate_region (&start, &end);
9198       from = XFIXNUM (start);
9199       to = XFIXNUM (end);
9200       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9201 	  || (ascii_compatible
9202 	      && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9203 	return Qnil;
9204       p = CHAR_POS_ADDR (from);
9205       pend = CHAR_POS_ADDR (to);
9206       if (from < GPT && to >= GPT)
9207 	stop = GPT_ADDR;
9208       else
9209 	stop = pend;
9210     }
9211   else
9212     {
9213       CHECK_STRING (string);
9214       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9215       if (! STRING_MULTIBYTE (string))
9216 	return Qnil;
9217       p = SDATA (string) + string_char_to_byte (string, from);
9218       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9219       if (ascii_compatible && (to - from) == (pend - p))
9220 	return Qnil;
9221     }
9222 
9223   if (NILP (count))
9224     n = 1;
9225   else
9226     {
9227       CHECK_FIXNAT (count);
9228       n = XFIXNUM (count);
9229     }
9230 
9231   positions = Qnil;
9232   charset_map_loaded = 0;
9233   while (1)
9234     {
9235       int c;
9236 
9237       if (ascii_compatible)
9238 	while (p < stop && ASCII_CHAR_P (*p))
9239 	  p++, from++;
9240       if (p >= stop)
9241 	{
9242 	  if (p >= pend)
9243 	    break;
9244 	  stop = pend;
9245 	  p = GAP_END_ADDR;
9246 	}
9247 
9248       c = string_char_advance (&p);
9249       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9250 	  && ! char_charset (translate_char (translation_table, c),
9251 			     charset_list, NULL))
9252 	{
9253 	  positions = Fcons (make_fixnum (from), positions);
9254 	  n--;
9255 	  if (n == 0)
9256 	    break;
9257 	}
9258 
9259       from++;
9260       if (charset_map_loaded && NILP (string))
9261 	{
9262 	  p = CHAR_POS_ADDR (from);
9263 	  pend = CHAR_POS_ADDR (to);
9264 	  if (from < GPT && to >= GPT)
9265 	    stop = GPT_ADDR;
9266 	  else
9267 	    stop = pend;
9268 	  charset_map_loaded = 0;
9269 	}
9270     }
9271 
9272   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9273 }
9274 
9275 
9276 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9277        Scheck_coding_systems_region, 3, 3, 0,
9278        doc: /* Check if text between START and END is encodable by CODING-SYSTEM-LIST.
9279 
9280 START and END are buffer positions specifying the region.
9281 CODING-SYSTEM-LIST is a list of coding systems to check.
9282 
9283 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9284 function returns nil.
9285 
9286 If some of the coding systems cannot encode the whole region, value is
9287 an alist, each element of which has the form (CODING-SYSTEM POS1 POS2 ...),
9288 which means that CODING-SYSTEM cannot encode the text at buffer positions
9289 POS1, POS2, ...
9290 
9291 START may be a string.  In that case, check if the string is
9292 encodable, and the value contains character indices into the string
9293 instead of buffer positions.  END is ignored in this case.
9294 
9295 If the current buffer (or START if it is a string) is unibyte, the value
9296 is nil.  */)
9297   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9298 {
9299   Lisp_Object list;
9300   ptrdiff_t start_byte, end_byte;
9301   ptrdiff_t pos;
9302   const unsigned char *p, *pbeg, *pend;
9303   int c;
9304   Lisp_Object tail, elt, attrs;
9305 
9306   if (STRINGP (start))
9307     {
9308       if (!STRING_MULTIBYTE (start)
9309 	  || SCHARS (start) == SBYTES (start))
9310 	return Qnil;
9311       start_byte = 0;
9312       end_byte = SBYTES (start);
9313       pos = 0;
9314     }
9315   else
9316     {
9317       EMACS_INT s = fix_position (start);
9318       EMACS_INT e = fix_position (end);
9319       if (! (BEG <= s && s <= e && e <= Z))
9320 	args_out_of_range (start, end);
9321       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9322 	return Qnil;
9323       start_byte = CHAR_TO_BYTE (s);
9324       end_byte = CHAR_TO_BYTE (e);
9325       if (e - s == end_byte - start_byte)
9326 	return Qnil;
9327 
9328       if (s < GPT && GPT < e)
9329 	{
9330 	  if (GPT - s < e - GPT)
9331 	    move_gap_both (s, start_byte);
9332 	  else
9333 	    move_gap_both (e, end_byte);
9334 	}
9335       pos = s;
9336     }
9337 
9338   list = Qnil;
9339   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9340     {
9341       elt = XCAR (tail);
9342       Lisp_Object spec = CODING_SYSTEM_SPEC (elt);
9343       if (!VECTORP (spec))
9344         xsignal1 (Qcoding_system_error, elt);
9345       attrs = AREF (spec, 0);
9346       ASET (attrs, coding_attr_trans_tbl,
9347 	    get_translation_table (attrs, 1, NULL));
9348       list = Fcons (list2 (elt, attrs), list);
9349     }
9350 
9351   if (STRINGP (start))
9352     p = pbeg = SDATA (start);
9353   else
9354     p = pbeg = BYTE_POS_ADDR (start_byte);
9355   pend = p + (end_byte - start_byte);
9356 
9357   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9358   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9359 
9360   while (p < pend)
9361     {
9362       if (ASCII_CHAR_P (*p))
9363 	p++;
9364       else
9365 	{
9366 	  c = string_char_advance (&p);
9367 
9368 	  charset_map_loaded = 0;
9369 	  for (tail = list; CONSP (tail); tail = XCDR (tail))
9370 	    {
9371 	      elt = XCDR (XCAR (tail));
9372 	      if (! char_encodable_p (c, XCAR (elt)))
9373 		XSETCDR (elt, Fcons (make_fixnum (pos), XCDR (elt)));
9374 	    }
9375 	  if (charset_map_loaded)
9376 	    {
9377 	      ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9378 
9379 	      if (STRINGP (start))
9380 		pbeg = SDATA (start);
9381 	      else
9382 		pbeg = BYTE_POS_ADDR (start_byte);
9383 	      p = pbeg + p_offset;
9384 	      pend = pbeg + pend_offset;
9385 	    }
9386 	}
9387       pos++;
9388     }
9389 
9390   tail = list;
9391   list = Qnil;
9392   for (; CONSP (tail); tail = XCDR (tail))
9393     {
9394       elt = XCAR (tail);
9395       if (CONSP (XCDR (XCDR (elt))))
9396 	list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9397 		      list);
9398     }
9399 
9400   return list;
9401 }
9402 
9403 
9404 static Lisp_Object
code_convert_region(Lisp_Object start,Lisp_Object end,Lisp_Object coding_system,Lisp_Object dst_object,bool encodep,bool norecord)9405 code_convert_region (Lisp_Object start, Lisp_Object end,
9406 		     Lisp_Object coding_system, Lisp_Object dst_object,
9407 		     bool encodep, bool norecord)
9408 {
9409   struct coding_system coding;
9410   ptrdiff_t from, from_byte, to, to_byte;
9411   Lisp_Object src_object;
9412 
9413   if (NILP (coding_system))
9414     coding_system = Qno_conversion;
9415   else
9416     CHECK_CODING_SYSTEM (coding_system);
9417   src_object = Fcurrent_buffer ();
9418   if (NILP (dst_object))
9419     dst_object = src_object;
9420   else if (! EQ (dst_object, Qt))
9421     CHECK_BUFFER (dst_object);
9422 
9423   validate_region (&start, &end);
9424   from = XFIXNAT (start);
9425   from_byte = CHAR_TO_BYTE (from);
9426   to = XFIXNAT (end);
9427   to_byte = CHAR_TO_BYTE (to);
9428 
9429   setup_coding_system (coding_system, &coding);
9430   coding.mode |= CODING_MODE_LAST_BLOCK;
9431 
9432   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9433     {
9434       struct buffer *buf = XBUFFER (dst_object);
9435       ptrdiff_t buf_pt = BUF_PT (buf);
9436 
9437       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9438     }
9439 
9440   if (encodep)
9441     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9442 			  dst_object);
9443   else
9444     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9445 			  dst_object);
9446   if (! norecord)
9447     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9448 
9449   return (BUFFERP (dst_object)
9450 	  ? make_fixnum (coding.produced_char)
9451 	  : coding.dst_object);
9452 }
9453 
9454 
9455 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9456        3, 4, "r\nzCoding system: ",
9457        doc: /* Decode the current region using the specified coding system.
9458 Interactively, prompt for the coding system to decode the region, and
9459 replace the region with the decoded text.
9460 
9461 \"Decoding\" means transforming bytes into readable text (characters).
9462 If, for instance, you have a region that contains data that represents
9463 the two bytes #xc2 #xa9, after calling this function with the utf-8
9464 coding system, the region will contain the single
9465 character ?\\N{COPYRIGHT SIGN}.
9466 
9467 When called from a program, takes four arguments:
9468 	START, END, CODING-SYSTEM, and DESTINATION.
9469 START and END are buffer positions.
9470 
9471 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9472 If nil, the region between START and END is replaced by the decoded text.
9473 If buffer, the decoded text is inserted in that buffer after point (point
9474 does not move).  If that buffer is unibyte, it receives the individual
9475 bytes of the internal representation of the decoded text.
9476 In those cases, the length of the decoded text is returned.
9477 If DESTINATION is t, the decoded text is returned.
9478 
9479 This function sets `last-coding-system-used' to the precise coding system
9480 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9481 not fully specified.)  */)
9482   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9483 {
9484   return code_convert_region (start, end, coding_system, destination, 0, 0);
9485 }
9486 
9487 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9488        3, 4, "r\nzCoding system: ",
9489        doc: /* Encode the current region using th specified coding system.
9490 Interactively, prompt for the coding system to encode the region, and
9491 replace the region with the bytes that are the result of the encoding.
9492 
9493 What's meant by \"encoding\" is transforming textual data (characters)
9494 into bytes.  If, for instance, you have a region that contains the
9495 single character ?\\N{COPYRIGHT SIGN}, after calling this function with
9496 the utf-8 coding system, the data in the region will represent the two
9497 bytes #xc2 #xa9.
9498 
9499 When called from a program, takes four arguments:
9500         START, END, CODING-SYSTEM and DESTINATION.
9501 START and END are buffer positions.
9502 
9503 Optional 4th argument DESTINATION specifies where the encoded text goes.
9504 If nil, the region between START and END is replaced by the encoded text.
9505 If buffer, the encoded text is inserted in that buffer after point (point
9506 does not move).
9507 In those cases, the length of the encoded text is returned.
9508 If DESTINATION is t, the encoded text is returned.
9509 
9510 This function sets `last-coding-system-used' to the precise coding system
9511 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9512 not fully specified.)  */)
9513   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9514 {
9515   return code_convert_region (start, end, coding_system, destination, 1, 0);
9516 }
9517 
9518 /* Whether STRING only contains chars in the 0..127 range.  */
9519 bool
string_ascii_p(Lisp_Object string)9520 string_ascii_p (Lisp_Object string)
9521 {
9522   ptrdiff_t nbytes = SBYTES (string);
9523   for (ptrdiff_t i = 0; i < nbytes; i++)
9524     if (SREF (string, i) > 127)
9525       return false;
9526   return true;
9527 }
9528 
9529 Lisp_Object
code_convert_string(Lisp_Object string,Lisp_Object coding_system,Lisp_Object dst_object,bool encodep,bool nocopy,bool norecord)9530 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9531 		     Lisp_Object dst_object, bool encodep, bool nocopy,
9532 		     bool norecord)
9533 {
9534   struct coding_system coding;
9535   ptrdiff_t chars, bytes;
9536 
9537   CHECK_STRING (string);
9538   if (NILP (coding_system))
9539     {
9540       if (! norecord)
9541 	Vlast_coding_system_used = Qno_conversion;
9542       if (NILP (dst_object))
9543 	return nocopy ? string : Fcopy_sequence (string);
9544     }
9545 
9546   if (NILP (coding_system))
9547     coding_system = Qno_conversion;
9548   else
9549     CHECK_CODING_SYSTEM (coding_system);
9550   if (NILP (dst_object))
9551     dst_object = Qt;
9552   else if (! EQ (dst_object, Qt))
9553     CHECK_BUFFER (dst_object);
9554 
9555   setup_coding_system (coding_system, &coding);
9556   coding.mode |= CODING_MODE_LAST_BLOCK;
9557   chars = SCHARS (string);
9558   bytes = SBYTES (string);
9559 
9560   if (EQ (dst_object, Qt))
9561     {
9562       /* Fast path for ASCII-only input and an ASCII-compatible coding:
9563          act as identity if no EOL conversion is needed.  */
9564       Lisp_Object attrs = CODING_ID_ATTRS (coding.id);
9565       if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
9566           && (STRING_MULTIBYTE (string)
9567               ? (chars == bytes) : string_ascii_p (string))
9568           && (EQ (CODING_ID_EOL_TYPE (coding.id), Qunix)
9569               || inhibit_eol_conversion
9570               || ! memchr (SDATA (string), encodep ? '\n' : '\r', bytes)))
9571         {
9572           if (! norecord)
9573             Vlast_coding_system_used = coding_system;
9574           return (nocopy
9575                   ? string
9576                   : (encodep
9577                      ? make_unibyte_string (SSDATA (string), bytes)
9578                      : make_multibyte_string (SSDATA (string), bytes, bytes)));
9579         }
9580     }
9581   else if (BUFFERP (dst_object))
9582     {
9583       struct buffer *buf = XBUFFER (dst_object);
9584       ptrdiff_t buf_pt = BUF_PT (buf);
9585 
9586       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9587     }
9588 
9589   if (encodep)
9590     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9591   else
9592     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9593   if (! norecord)
9594     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9595 
9596   return (BUFFERP (dst_object)
9597 	  ? make_fixnum (coding.produced_char)
9598 	  : coding.dst_object);
9599 }
9600 
9601 
9602 /* Encode or decode STRING according to CODING_SYSTEM.
9603    Do not set Vlast_coding_system_used.  */
9604 
9605 Lisp_Object
code_convert_string_norecord(Lisp_Object string,Lisp_Object coding_system,bool encodep)9606 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9607 			      bool encodep)
9608 {
9609   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9610 }
9611 
9612 
9613 /* Return the gap address of BUFFER.  If the gap size is less than
9614    NBYTES, enlarge the gap in advance.  */
9615 
9616 static unsigned char *
get_buffer_gap_address(Lisp_Object buffer,ptrdiff_t nbytes)9617 get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
9618 {
9619   struct buffer *buf = XBUFFER (buffer);
9620 
9621   if (BUF_GPT (buf) != BUF_PT (buf))
9622     {
9623       struct buffer *oldb = current_buffer;
9624 
9625       current_buffer = buf;
9626       move_gap_both (PT, PT_BYTE);
9627       current_buffer = oldb;
9628     }
9629   if (BUF_GAP_SIZE (buf) < nbytes)
9630     make_gap_1 (buf, nbytes);
9631   return BUF_GPT_ADDR (buf);
9632 }
9633 
/* Return a pointer to the byte sequence for C, and its byte length in
   LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
   and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
   decode_string_utf_8 when those arguments are given by
   characters.

   The returned pointer refers to static storage that a later call may
   overwrite.  */

static unsigned char *
get_char_bytes (int c, int *len)
{
  /* Use two caches, since encode/decode_string_utf_8 are called
     repeatedly with the same values for HANDLE_8_BIT and
     HANDLE_OVER_UNI arguments.  */
  static int chars[2];
  static unsigned char bytes[2][6];
  /* A length of zero marks a slot that has never been filled (a filled
     slot is expected to hold at least one byte).  */
  static int nbytes[2];
  static int last_index;

  /* Probe the most recently filled slot first, then the other one.
     The slots start out zeroed, so the character code alone cannot
     tell a cached character 0 from an empty slot; require a positive
     length too.  Otherwise a first request for character 0 would
     report a zero-length sequence.  */
  for (int probe = 0; probe < 2; probe++)
    {
      int slot = probe == 0 ? last_index : 1 - last_index;

      if (nbytes[slot] > 0 && chars[slot] == c)
        {
          *len = nbytes[slot];
          return bytes[slot];
        }
    }

  /* Miss: overwrite the slot that was not filled most recently.  */
  last_index = 1 - last_index;
  chars[last_index] = c;
  *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
  return bytes[last_index];
}
9666 
9667 /* Encode STRING by the coding system utf-8-unix.
9668 
9669    This function is optimized for speed when the input string is
9670    already a valid sequence of Unicode codepoints in the internal
9671    representation, i.e. there are neither 8-bit raw bytes nor
9672    characters beyond the Unicode range in the string's contents.
9673 
9674    Ignore any :pre-write-conversion and :encode-translation-table
9675    properties.
9676 
9677    Assume that arguments have values as described below.
9678    The validity must be enforced and ensured by the caller.
9679 
9680    STRING is a multibyte string or an ASCII-only unibyte string.
9681 
9682    BUFFER is a unibyte buffer or Qnil.
9683 
9684    If BUFFER is a unibyte buffer, insert the encoded result
9685    after point of the buffer, and return the number of
9686    inserted characters.  The caller should have made BUFFER ready for
9687    modifying in advance (e.g., by calling invalidate_buffer_caches).
9688 
9689    If BUFFER is nil, return a unibyte string from the encoded result.
9690 
9691    If NOCOPY is non-zero, and if STRING contains only Unicode
9692    characters (i.e., the encoding does not change the byte sequence),
9693    return STRING even if it is multibyte.  WARNING: This will return a
9694    _multibyte_ string, something that callers might not expect, especially
9695    if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
9696    will only use the byte sequence of the encoded result accessed by
9697    SDATA or SSDATA, and the original STRING will _not_ be modified after
9698    the encoding.  When in doubt, always pass NOCOPY as zero.  You _have_
9699    been warned!
9700 
9701    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
9702    character in STRING.  The former is for an eight-bit character (represented
9703    by a 2-byte overlong sequence in a multibyte STRING).  The latter is
9704    for a codepoint beyond the end of the Unicode range (a character whose
9705    code is greater than the maximum Unicode character 0x10FFFF, represented
9706    by a 4 or 5-byte sequence in a multibyte STRING).
9707 
9708    If these two arguments are unibyte strings (typically
9709    "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
9710    CHARACTER #xFFFD), encode a non-Unicode character into that
9711    unibyte sequence.
9712 
9713    If the two arguments are characters, encode a non-Unicode
9714    character as the respective argument characters.
9715 
9716    If they are Qignored, skip a non-Unicode character.
9717 
9718    If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
9719    of the same value, like the usual Emacs encoding does.
9720 
9721    If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
9722    range into the same 4 or 5-byte sequence as used by Emacs
9723    internally, like the usual Emacs encoding does.
9724 
9725    If the two arguments are Qnil, return Qnil if STRING has a
9726    non-Unicode character.  This allows the caller to signal an error
9727    if such input strings are not allowed.  */
9728 
9729 Lisp_Object
encode_string_utf_8(Lisp_Object string,Lisp_Object buffer,bool nocopy,Lisp_Object handle_8_bit,Lisp_Object handle_over_uni)9730 encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
9731 		     bool nocopy, Lisp_Object handle_8_bit,
9732 		     Lisp_Object handle_over_uni)
9733 {
9734   ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
9735   if (NILP (buffer) && nchars == nbytes && nocopy)
9736     /* STRING contains only ASCII characters.  */
9737     return string;
9738 
9739   ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
9740   /* The following two vars are counted only if handle_over_uni is not Qt.  */
9741   ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
9742   ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
9743   ptrdiff_t outbytes;	     /* number of bytes of decoding result */
9744   unsigned char *p = SDATA (string);
9745   unsigned char *pend = p + nbytes;
9746   unsigned char *src = NULL, *dst = NULL;
9747   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
9748   int replace_8_bit_len = 0, replace_over_uni_len = 0;
9749   Lisp_Object val;		/* the return value */
9750 
9751   /* Scan bytes in STRING twice.  The first scan is to count non-Unicode
9752      characters, and the second scan is to encode STRING.  If the
9753      encoding is trivial (no need of changing the byte sequence),
9754      the second scan is avoided.  */
9755   for (int scan_count = 0; scan_count < 2; scan_count++)
9756     {
9757       while (p < pend)
9758 	{
9759 	  if (nchars == pend - p)
9760 	    /* There is no multibyte character remaining.  */
9761 	    break;
9762 
9763 	  int c = *p;
9764 	  int len = BYTES_BY_CHAR_HEAD (c);
9765 
9766 	  nchars--;
9767 	  if (len == 1
9768 	      || len == 3
9769 	      || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
9770 		  : (EQ (handle_over_uni, Qt)
9771 		     || (len == 4
9772 			 && STRING_CHAR (p) <= MAX_UNICODE_CHAR))))
9773 	    {
9774 	      p += len;
9775 	      continue;
9776 	    }
9777 
9778 	  /* A character to change the byte sequence on encoding was
9779 	     found.  A rare case.  */
9780 	  if (len == 2)
9781 	    {
9782 	      /* Handle an eight-bit character by handle_8_bit.  */
9783 	      if (scan_count == 0)
9784 		{
9785 		  if (NILP (handle_8_bit))
9786 		    return Qnil;
9787 		  num_8_bit++;
9788 		}
9789 	      else
9790 		{
9791 		  if (src < p)
9792 		    {
9793 		      memcpy (dst, src, p - src);
9794 		      dst += p - src;
9795 		    }
9796 		  if (replace_8_bit_len > 0)
9797 		    {
9798 		      memcpy (dst, replace_8_bit, replace_8_bit_len);
9799 		      dst += replace_8_bit_len;
9800 		    }
9801 		  else if (EQ (handle_8_bit, Qt))
9802 		    {
9803 		      int char8 = STRING_CHAR (p);
9804 		      *dst++ = CHAR_TO_BYTE8 (char8);
9805 		    }
9806 		}
9807 	    }
9808 	  else			/* len == 4 or 5 */
9809 	    {
9810 	      /* Handle an over-unicode character by handle_over_uni.  */
9811 	      if (scan_count == 0)
9812 		{
9813 		  if (NILP (handle_over_uni))
9814 		    return Qnil;
9815 		  if (len == 4)
9816 		    num_over_4++;
9817 		  else
9818 		    num_over_5++;
9819 		}
9820 	      else
9821 		{
9822 		  if (src < p)
9823 		    {
9824 		      memcpy (dst, src, p - src);
9825 		      dst += p - src;
9826 		    }
9827 		  if (replace_over_uni_len > 0)
9828 		    {
9829 		      memcpy (dst, replace_over_uni, replace_over_uni_len);
9830 		      dst += replace_over_uni_len;
9831 		    }
9832 		}
9833 	    }
9834 	  p += len;
9835 	  src = p;
9836 	}
9837 
9838       if (scan_count == 0)
9839 	{
9840 	  /* End of the first scan.  */
9841 	  outbytes = nbytes;
9842 	  if (num_8_bit == 0
9843 	      && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
9844 	    {
9845 	      /* We can break the loop because there is no need of
9846 		 changing the byte sequence.  This is the typical
9847 		 case.  */
9848 	      scan_count = 1;
9849 	    }
9850 	  else
9851 	    {
9852 	      /* Prepare for handling non-Unicode characters during
9853 		 the next scan.  */
9854 	      if (num_8_bit > 0)
9855 		{
9856 		  if (CHARACTERP (handle_8_bit))
9857 		    replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
9858 						    &replace_8_bit_len);
9859 		  else if (STRINGP (handle_8_bit))
9860 		    {
9861 		      replace_8_bit = SDATA (handle_8_bit);
9862 		      replace_8_bit_len = SBYTES (handle_8_bit);
9863 		    }
9864 		  if (replace_8_bit)
9865 		    outbytes += (replace_8_bit_len - 2) * num_8_bit;
9866 		  else if (EQ (handle_8_bit, Qignored))
9867 		    outbytes -= 2 * num_8_bit;
9868 		  else if (EQ (handle_8_bit, Qt))
9869 		    outbytes -= num_8_bit;
9870 		  else
9871 		    return Qnil;
9872 		}
9873 	      if (num_over_4 + num_over_5 > 0)
9874 		{
9875 		  if (CHARACTERP (handle_over_uni))
9876 		    replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
9877 						       &replace_over_uni_len);
9878 		  else if (STRINGP (handle_over_uni))
9879 		    {
9880 		      replace_over_uni = SDATA (handle_over_uni);
9881 		      replace_over_uni_len = SBYTES (handle_over_uni);
9882 		    }
9883 		  if (num_over_4 > 0)
9884 		    {
9885 		      if (replace_over_uni)
9886 			outbytes += (replace_over_uni_len - 4) * num_over_4;
9887 		      else if (EQ (handle_over_uni, Qignored))
9888 			outbytes -= 4 * num_over_4;
9889 		      else if (! EQ (handle_over_uni, Qt))
9890 			return Qnil;
9891 		    }
9892 		  if (num_over_5 > 0)
9893 		    {
9894 		      if (replace_over_uni)
9895 			outbytes += (replace_over_uni_len - 5) * num_over_5;
9896 		      else if (EQ (handle_over_uni, Qignored))
9897 			outbytes -= 5 * num_over_5;
9898 		      else if (! EQ (handle_over_uni, Qt))
9899 			return Qnil;
9900 		    }
9901 		}
9902 	    }
9903 
9904 	  /* Prepare return value and space to store the encoded bytes.  */
9905 	  if (BUFFERP (buffer))
9906 	    {
9907 	      val = make_fixnum (outbytes);
9908 	      dst = get_buffer_gap_address (buffer, nbytes);
9909 	    }
9910 	  else
9911 	    {
9912 	      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
9913 		return string;
9914 	      val = make_uninit_string (outbytes);
9915 	      dst = SDATA (val);
9916 	    }
9917 	  p = src = SDATA (string);
9918 	}
9919     }
9920 
9921   if (src < pend)
9922     memcpy (dst, src, pend - src);
9923   if (BUFFERP (buffer))
9924     {
9925       struct buffer *oldb = current_buffer;
9926 
9927       current_buffer = XBUFFER (buffer);
9928       insert_from_gap (outbytes, outbytes, false);
9929       current_buffer = oldb;
9930     }
9931   return val;
9932 }
9933 
9934 /* Decode input string by the coding system utf-8-unix.
9935 
9936    This function is optimized for speed when the input string is
9937    already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
9938    bytes nor any UTF-8 sequences longer than 4 bytes in the string's
9939    contents.
9940 
9941    Ignore any :post-read-conversion and :decode-translation-table
9942    properties.
9943 
9944    Assume that arguments have values as described below.
9945    The validity must be enforced and ensured by the caller.
9946 
9947    STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
9948    If STRING is Qnil, the input is a C string pointed by STR whose
9949    length in bytes is in STR_LEN.
9950 
9951    BUFFER is a multibyte buffer or Qnil.
9952    If BUFFER is a multibyte buffer, insert the decoding result of
9953    Unicode characters after point of the buffer, and return the number
9954    of inserted characters.  The caller should have made BUFFER ready
9955    for modifying in advance (e.g., by calling invalidate_buffer_caches).
9956 
9957    If BUFFER is Qnil, return a multibyte string from the decoded result.
9958 
9959    NOCOPY non-zero means it is OK to return the input STRING if it
9960    contains only ASCII characters or only valid UTF-8 sequences of 2
9961    to 4 bytes.  WARNING: This will return a _unibyte_ string, something
9962    that callers might not expect, especially if STRING is not
9963    pure-ASCII; only use NOCOPY non-zero if the caller will only use
9964    the byte sequence of the decoded result accessed via SDATA or
9965    SSDATA, and if the original STRING will _not_ be modified after the
9966    decoding.  When in doubt, always pass NOCOPY as zero.  You _have_
9967    been warned!
9968 
9969    If STRING is Qnil, and the original string is passed via STR, NOCOPY
9970    is ignored.
9971 
   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid
   byte sequence.  The former is for a 1-byte invalid sequence that
   violates the fundamental UTF-8 encoding rules.  The latter is for a
   4 or 5-byte overlong sequence that Emacs internally uses to
9976    represent characters beyond the Unicode range (characters whose
9977    codepoints are greater than #x10FFFF).  Note that this function does
9978    not in general treat such overlong UTF-8 sequences as invalid.
9979 
9980    If these two arguments are strings (typically a 1-char string of
9981    the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
9982    sequence into that string.  They must be multibyte strings if they
9983    contain a non-ASCII character.
9984 
9985    If the two arguments are characters, decode an invalid byte
9986    sequence into the corresponding multibyte representation of the
9987    respective character.
9988 
9989    If they are Qignored, skip an invalid byte sequence without
9990    producing anything in the decoded string.
9991 
9992    If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
9993    corresponding eight-bit multibyte representation, like the usual
9994    Emacs decoding does.
9995 
9996    If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
9997    that follows Emacs' internal representation for a character beyond
9998    Unicode range into the corresponding character, like the usual
9999    Emacs decoding does.
10000 
10001    If the two arguments are Qnil, return Qnil if the input string has
10002    raw bytes or overlong sequences.  This allows the caller to signal
10003    an error if such inputs are not allowed.  */
10004 
10005 Lisp_Object
decode_string_utf_8(Lisp_Object string,const char * str,ptrdiff_t str_len,Lisp_Object buffer,bool nocopy,Lisp_Object handle_8_bit,Lisp_Object handle_over_uni)10006 decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
10007 		     Lisp_Object buffer, bool nocopy,
10008 		     Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
10009 {
10010   /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
10011      and it returns 0 for an invalid sequence.  */
10012 #define UTF_8_SEQUENCE_LENGTH(c)	\
10013   ((c) < 0xC2 ? 0			\
10014    : (c) < 0xE0 ? 2			\
10015    : (c) < 0xF0 ? 3			\
10016    : (c) < 0xF8 ? 4			\
10017    : (c) == 0xF8 ? 5			\
10018    : 0)
10019 
10020   ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
10021   unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
10022   unsigned char *str_orig = p;
10023   unsigned char *pend = p + nbytes;
10024   ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences */
10025   ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences */
10026   ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences */
10027   ptrdiff_t outbytes = nbytes;	/* number of decoded bytes */
10028   ptrdiff_t outchars = 0;    /* number of decoded characters */
10029   unsigned char *src = NULL, *dst = NULL;
10030   bool change_byte_sequence = false;
10031 
10032   /* Scan input bytes twice.  The first scan is to count invalid
10033      sequences, and the second scan is to decode input.  If the
10034      decoding is trivial (no need of changing the byte sequence),
10035      the second scan is avoided.  */
10036   while (p < pend)
10037     {
10038       src = p;
10039       /* Try short cut for an ASCII-only case.  */
10040       while (p < pend && *p < 0x80) p++;
10041       outchars += (p - src);
10042       if (p == pend)
10043 	break;
10044       int c = *p;
10045       outchars++;
10046       int len = UTF_8_SEQUENCE_LENGTH (c);
10047       /* len == 0, 2, 3, 4, 5.  */
10048       if (UTF_8_EXTRA_OCTET_P (p[1])
10049 	  && (len == 2
10050 	      || (UTF_8_EXTRA_OCTET_P (p[2])
10051 		  && (len == 3
10052 		      || (UTF_8_EXTRA_OCTET_P (p[3])
10053 			  && len == 4
10054 			  && STRING_CHAR (p) <= MAX_UNICODE_CHAR)))))
10055 	{
10056 	  p += len;
10057 	  continue;
10058 	}
10059 
10060       /* A sequence to change on decoding was found.  A rare case.  */
10061       if (len == 0)
10062 	{
10063 	  if (NILP (handle_8_bit))
10064 	    return Qnil;
10065 	  num_8_bit++;
10066 	  len = 1;
10067 	}
10068       else			/* len == 4 or 5 */
10069 	{
10070 	  if (NILP (handle_over_uni))
10071 	    return Qnil;
10072 	  if (len == 4)
10073 	    num_over_4++;
10074 	  else
10075 	    num_over_5++;
10076 	}
10077       change_byte_sequence = true;
10078       p += len;
10079     }
10080 
10081   Lisp_Object val;	     /* the return value */
10082 
10083   if (! change_byte_sequence
10084       && NILP (buffer))
10085     {
10086       if (nocopy && STRINGP (string))
10087 	return string;
10088       val = make_uninit_multibyte_string (outchars, outbytes);
10089       memcpy (SDATA (val), str_orig, pend - str_orig);
10090       return val;
10091     }
10092 
10093   /* Count the number of resulting chars and bytes.  */
10094   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
10095   int replace_8_bit_len = 0, replace_over_uni_len = 0;
10096 
10097   if (change_byte_sequence)
10098     {
10099       if (num_8_bit > 0)
10100 	{
10101 	  if (CHARACTERP (handle_8_bit))
10102 	    replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
10103 					    &replace_8_bit_len);
10104 	  else if (STRINGP (handle_8_bit))
10105 	    {
10106 	      replace_8_bit = SDATA (handle_8_bit);
10107 	      replace_8_bit_len = SBYTES (handle_8_bit);
10108 	    }
10109 	  if (replace_8_bit)
10110 	    outbytes += (replace_8_bit_len - 1) * num_8_bit;
10111 	  else if (EQ (handle_8_bit, Qignored))
10112 	    {
10113 	      outbytes -= num_8_bit;
10114 	      outchars -= num_8_bit;
10115 	    }
10116 	  else /* EQ (handle_8_bit, Qt)) */
10117 	    outbytes += num_8_bit;
10118 	}
10119       else if (num_over_4 + num_over_5 > 0)
10120 	{
10121 	  if (CHARACTERP (handle_over_uni))
10122 	    replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
10123 					       &replace_over_uni_len);
10124 	  else if (STRINGP (handle_over_uni))
10125 	    {
10126 	      replace_over_uni = SDATA (handle_over_uni);
10127 	      replace_over_uni_len = SBYTES (handle_over_uni);
10128 	    }
10129 	  if (num_over_4 > 0)
10130 	    {
10131 	      if (replace_over_uni)
10132 		outbytes += (replace_over_uni_len - 4) * num_over_4;
10133 	      else if (EQ (handle_over_uni, Qignored))
10134 		{
10135 		  outbytes -= 4 * num_over_4;
10136 		  outchars -= num_over_4;
10137 		}
10138 	    }
10139 	  if (num_over_5 > 0)
10140 	    {
10141 	      if (replace_over_uni)
10142 		outbytes += (replace_over_uni_len - 5) * num_over_5;
10143 	      else if (EQ (handle_over_uni, Qignored))
10144 		{
10145 		  outbytes -= 5 * num_over_5;
10146 		  outchars -= num_over_5;
10147 		}
10148 	    }
10149 	}
10150     }
10151 
10152   /* Prepare return value and  space to store the decoded bytes.  */
10153   if (BUFFERP (buffer))
10154     {
10155       val = make_fixnum (outchars);
10156       dst = get_buffer_gap_address (buffer, outbytes);
10157     }
10158   else
10159     {
10160       if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
10161 	  && STRINGP (string))
10162 	return string;
10163       val = make_uninit_multibyte_string (outchars, outbytes);
10164       dst = SDATA (val);
10165     }
10166 
10167   src = str_orig;
10168   if (change_byte_sequence)
10169     {
10170       p = src;
10171       while (p < pend)
10172 	{
10173 	  /* Try short cut for an ASCII-only case.  */
10174 	  /* while (p < pend && *p < 0x80) p++; */
10175 	  /* if (p == pend) */
10176 	  /*   break; */
10177 	  int c = *p;
10178 	  if (c < 0x80)
10179 	    {
10180 	      p++;
10181 	      continue;
10182 	    }
10183 	  int len = UTF_8_SEQUENCE_LENGTH (c);
10184 	  if (len > 1)
10185 	    {
10186 	      int mlen;
10187 	      for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
10188 		   mlen++);
10189 	      if (mlen == len
10190 		  && (len <= 3
10191 		      || (len == 4 && STRING_CHAR (p) <= MAX_UNICODE_CHAR)
10192 		      || EQ (handle_over_uni, Qt)))
10193 		{
10194 		  p += len;
10195 		  continue;
10196 		}
10197 	    }
10198 
10199 	  if (src < p)
10200 	    {
10201 	      memcpy (dst, src, p - src);
10202 	      dst += p - src;
10203 	    }
10204 	  if (len == 0)
10205 	    {
10206 	      if (replace_8_bit)
10207 		{
10208 		  memcpy (dst, replace_8_bit, replace_8_bit_len);
10209 		  dst += replace_8_bit_len;
10210 		}
10211 	      else if (EQ (handle_8_bit, Qt))
10212 		{
10213 		  dst += BYTE8_STRING (c, dst);
10214 		}
10215 	      len = 1;
10216 	    }
10217 	  else			/* len == 4 or 5 */
10218 	    {
10219 	      /* Handle p[0]... by handle_over_uni.  */
10220 	      if (replace_over_uni)
10221 		{
10222 		  memcpy (dst, replace_over_uni, replace_over_uni_len);
10223 		  dst += replace_over_uni_len;
10224 		}
10225 	    }
10226 	  p += len;
10227 	  src = p;
10228 	}
10229     }
10230 
10231   if (src < pend)
10232     memcpy (dst, src, pend - src);
10233   if (BUFFERP (buffer))
10234     {
10235       struct buffer *oldb = current_buffer;
10236 
10237       current_buffer = XBUFFER (buffer);
10238       insert_from_gap (outchars, outbytes, false);
10239       current_buffer = oldb;
10240     }
10241   return val;
10242 }
10243 
10244 /* #define ENABLE_UTF_8_CONVERTER_TEST */
10245 
10246 #ifdef ENABLE_UTF_8_CONVERTER_TEST
10247 
10248 /* These functions are useful for testing and benchmarking
10249    encode_string_utf_8 and decode_string_utf_8.  */
10250 
/* ENCODE_METHOD specifies which internal encoder to use.
10252    If it is Qnil, use encode_string_utf_8.
10253    Otherwise, use code_convert_string.
10254 
10255    COUNT, if integer, specifies how many times to call those functions
10256    with the same arguments (for benchmarking). */
10257 
10258 DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
10259        Sinternal_encode_string_utf_8, 7, 7, 0,
10260        doc: /* Internal use only.*/)
10261   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
10262    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
10263    Lisp_Object encode_method, Lisp_Object count)
10264 {
10265   int repeat_count;
10266   Lisp_Object val;
10267 
10268   /* Check arguments.  Return Qnil when an argument is invalid.  */
10269   if (! STRINGP (string))
10270     return Qnil;
10271   if (! NILP (buffer)
10272       && (! BUFFERP (buffer)
10273 	  || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
10274     return Qnil;
10275   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
10276       && ! EQ (handle_8_bit, Qignored)
10277       && ! CHARACTERP (handle_8_bit)
10278       && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
10279     return Qnil;
10280   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
10281       && ! EQ (handle_over_uni, Qignored)
10282       && ! CHARACTERP (handle_over_uni)
10283       && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
10284     return Qnil;
10285 
10286   CHECK_FIXNUM (count);
10287   repeat_count = XFIXNUM (count);
10288 
10289   val = Qnil;
10290   /* Run an encoder according to ENCODE_METHOD.  */
10291   if (NILP (encode_method))
10292     {
10293       for (int i = 0; i < repeat_count; i++)
10294 	val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
10295 				   handle_8_bit, handle_over_uni);
10296     }
10297   else
10298     {
10299       for (int i = 0; i < repeat_count; i++)
10300 	val = code_convert_string (string, Qutf_8_unix, Qnil, true,
10301 				   ! NILP (nocopy), true);
10302     }
10303   return val;
10304 }
10305 
10306 /* DECODE_METHOD specifies which internal decoder to use.
10307    If it is Qnil, use decode_string_utf_8.
10308    If it is Qt, use code_convert_string.
10309    Otherwise, use make_string_from_utf8.
10310 
10311    COUNT, if integer, specifies how many times to call those functions
10312    with the same arguments (for benchmarking).  */
10313 
10314 DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
10315        Sinternal_decode_string_utf_8, 7, 7, 0,
10316        doc: /* Internal use only.*/)
10317   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
10318    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
10319    Lisp_Object decode_method, Lisp_Object count)
10320 {
10321   int repeat_count;
10322   Lisp_Object val;
10323 
10324   /* Check arguments.  Return Qnil when an argument is invalid.  */
10325   if (! STRINGP (string))
10326     return Qnil;
10327   if (! NILP (buffer)
10328       && (! BUFFERP (buffer)
10329 	  || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
10330     return Qnil;
10331   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
10332       && ! EQ (handle_8_bit, Qignored)
10333       && ! CHARACTERP (handle_8_bit)
10334       && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
10335     return Qnil;
10336   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
10337       && ! EQ (handle_over_uni, Qignored)
10338       && ! CHARACTERP (handle_over_uni)
10339       && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
10340     return Qnil;
10341 
10342   CHECK_FIXNUM (count);
10343   repeat_count = XFIXNUM (count);
10344 
10345   val = Qnil;
10346   /* Run a decoder according to DECODE_METHOD.  */
10347   if (NILP (decode_method))
10348     {
10349       for (int i = 0; i < repeat_count; i++)
10350 	val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
10351 				   handle_8_bit, handle_over_uni);
10352     }
10353   else if (EQ (decode_method, Qt))
10354     {
10355       if (! BUFFERP (buffer))
10356 	buffer = Qt;
10357       for (int i = 0; i < repeat_count; i++)
10358 	val = code_convert_string (string, Qutf_8_unix, buffer, false,
10359 				   ! NILP (nocopy), true);
10360     }
10361   else if (! NILP (decode_method))
10362     {
10363       for (int i = 0; i < repeat_count; i++)
10364 	val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
10365     }
10366   return val;
10367 }
10368 
10369 #endif	/* ENABLE_UTF_8_CONVERTER_TEST */
10370 
10371 /* Encode or decode STRING using CODING_SYSTEM, with the possibility of
10372    returning STRING itself if it equals the result.
10373    Do not set Vlast_coding_system_used.  */
10374 static Lisp_Object
convert_string_nocopy(Lisp_Object string,Lisp_Object coding_system,bool encodep)10375 convert_string_nocopy (Lisp_Object string, Lisp_Object coding_system,
10376                        bool encodep)
10377 {
10378   return code_convert_string (string, coding_system, Qt, encodep, 1, 1);
10379 }
10380 
10381 /* Encode or decode a file name, to or from a unibyte string suitable
10382    for passing to C library functions.  */
10383 Lisp_Object
decode_file_name(Lisp_Object fname)10384 decode_file_name (Lisp_Object fname)
10385 {
10386 #ifdef WINDOWSNT
10387   /* The w32 build pretends to use UTF-8 for file-name encoding, and
10388      converts the file names either to UTF-16LE or to the system ANSI
10389      codepage internally, depending on the underlying OS; see w32.c.  */
10390   if (! NILP (Fcoding_system_p (Qutf_8)))
10391     return convert_string_nocopy (fname, Qutf_8, 0);
10392   return fname;
10393 #else  /* !WINDOWSNT */
10394   if (! NILP (Vfile_name_coding_system))
10395     return convert_string_nocopy (fname, Vfile_name_coding_system, 0);
10396   else if (! NILP (Vdefault_file_name_coding_system))
10397     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 0);
10398   else
10399     return fname;
10400 #endif
10401 }
10402 
10403 static Lisp_Object
encode_file_name_1(Lisp_Object fname)10404 encode_file_name_1 (Lisp_Object fname)
10405 {
10406   /* This is especially important during bootstrap and dumping, when
10407      file-name encoding is not yet known, and therefore any non-ASCII
10408      file names are unibyte strings, and could only be thrashed if we
10409      try to encode them.  */
10410   if (!STRING_MULTIBYTE (fname))
10411     return fname;
10412 #ifdef WINDOWSNT
10413   /* The w32 build pretends to use UTF-8 for file-name encoding, and
10414      converts the file names either to UTF-16LE or to the system ANSI
10415      codepage internally, depending on the underlying OS; see w32.c.  */
10416   if (! NILP (Fcoding_system_p (Qutf_8)))
10417     return convert_string_nocopy (fname, Qutf_8, 1);
10418   return fname;
10419 #else  /* !WINDOWSNT */
10420   if (! NILP (Vfile_name_coding_system))
10421     return convert_string_nocopy (fname, Vfile_name_coding_system, 1);
10422   else if (! NILP (Vdefault_file_name_coding_system))
10423     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 1);
10424   else
10425     return fname;
10426 #endif
10427 }
10428 
10429 Lisp_Object
encode_file_name(Lisp_Object fname)10430 encode_file_name (Lisp_Object fname)
10431 {
10432   Lisp_Object encoded = encode_file_name_1 (fname);
10433   /* No system accepts NUL bytes in filenames.  Allowing them can
10434      cause subtle bugs because the system would silently use a
10435      different filename than expected.  Perform this check after
10436      encoding to not miss NUL bytes introduced through encoding.  */
10437   CHECK_STRING_NULL_BYTES (encoded);
10438   return encoded;
10439 }
10440 
10441 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
10442        2, 4, 0,
10443        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
10444 
10445 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
10446 if the decoding operation is trivial.
10447 
10448 Optional fourth arg BUFFER non-nil means that the decoded text is
10449 inserted in that buffer after point (point does not move).  In this
10450 case, the return value is the length of the decoded text.  If that
10451 buffer is unibyte, it receives the individual bytes of the internal
10452 representation of the decoded text.
10453 
10454 This function sets `last-coding-system-used' to the precise coding system
10455 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
10456 not fully specified.)  The function does not change the match data.  */)
10457   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
10458 {
10459   return code_convert_string (string, coding_system, buffer,
10460 			      0, ! NILP (nocopy), 0);
10461 }
10462 
10463 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
10464        2, 4, 0,
10465        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
10466 
10467 Optional third arg NOCOPY non-nil means it is OK to return STRING
10468 itself if the encoding operation is trivial.
10469 
10470 Optional fourth arg BUFFER non-nil means that the encoded text is
10471 inserted in that buffer after point (point does not move).  In this
10472 case, the return value is the length of the encoded text.
10473 
10474 This function sets `last-coding-system-used' to the precise coding system
10475 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
10476 not fully specified.)  The function does not change the match data.  */)
10477   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
10478 {
10479   return code_convert_string (string, coding_system, buffer,
10480 			      1, ! NILP (nocopy), 0);
10481 }
10482 
10483 
10484 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
10485        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
10486 Return the corresponding character.  */)
10487   (Lisp_Object code)
10488 {
10489   Lisp_Object spec, attrs, val;
10490   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
10491   EMACS_INT ch;
10492   int c;
10493 
10494   CHECK_FIXNAT (code);
10495   ch = XFIXNAT (code);
10496   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
10497   attrs = AREF (spec, 0);
10498 
10499   if (ASCII_CHAR_P (ch)
10500       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
10501     return code;
10502 
10503   val = CODING_ATTR_CHARSET_LIST (attrs);
10504   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
10505   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
10506   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
10507 
10508   if (ch <= 0x7F)
10509     {
10510       c = ch;
10511       charset = charset_roman;
10512     }
10513   else if (ch >= 0xA0 && ch < 0xDF)
10514     {
10515       c = ch - 0x80;
10516       charset = charset_kana;
10517     }
10518   else
10519     {
10520       EMACS_INT c1 = ch >> 8;
10521       int c2 = ch & 0xFF;
10522 
10523       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
10524 	  || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
10525 	error ("Invalid code: %"pI"d", ch);
10526       c = ch;
10527       SJIS_TO_JIS (c);
10528       charset = charset_kanji;
10529     }
10530   c = DECODE_CHAR (charset, c);
10531   if (c < 0)
10532     error ("Invalid code: %"pI"d", ch);
10533   return make_fixnum (c);
10534 }
10535 
10536 
10537 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
10538        doc: /* Encode a Japanese character CH to shift_jis encoding.
10539 Return the corresponding code in SJIS.  */)
10540   (Lisp_Object ch)
10541 {
10542   Lisp_Object spec, attrs, charset_list;
10543   int c;
10544   struct charset *charset;
10545   unsigned code;
10546 
10547   CHECK_CHARACTER (ch);
10548   c = XFIXNAT (ch);
10549   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
10550   attrs = AREF (spec, 0);
10551 
10552   if (ASCII_CHAR_P (c)
10553       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
10554     return ch;
10555 
10556   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
10557   charset = char_charset (c, charset_list, &code);
10558   if (code == CHARSET_INVALID_CODE (charset))
10559     error ("Can't encode by shift_jis encoding: %c", c);
10560   JIS_TO_SJIS (code);
10561 
10562   return make_fixnum (code);
10563 }
10564 
10565 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
10566        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
10567 Return the corresponding character.  */)
10568   (Lisp_Object code)
10569 {
10570   Lisp_Object spec, attrs, val;
10571   struct charset *charset_roman, *charset_big5, *charset;
10572   EMACS_INT ch;
10573   int c;
10574 
10575   CHECK_FIXNAT (code);
10576   ch = XFIXNAT (code);
10577   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
10578   attrs = AREF (spec, 0);
10579 
10580   if (ASCII_CHAR_P (ch)
10581       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
10582     return code;
10583 
10584   val = CODING_ATTR_CHARSET_LIST (attrs);
10585   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
10586   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
10587 
10588   if (ch <= 0x7F)
10589     {
10590       c = ch;
10591       charset = charset_roman;
10592     }
10593   else
10594     {
10595       EMACS_INT b1 = ch >> 8;
10596       int b2 = ch & 0x7F;
10597       if (b1 < 0xA1 || b1 > 0xFE
10598 	  || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
10599 	error ("Invalid code: %"pI"d", ch);
10600       c = ch;
10601       charset = charset_big5;
10602     }
10603   c = DECODE_CHAR (charset, c);
10604   if (c < 0)
10605     error ("Invalid code: %"pI"d", ch);
10606   return make_fixnum (c);
10607 }
10608 
10609 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
10610        doc: /* Encode the Big5 character CH to BIG5 coding system.
10611 Return the corresponding character code in Big5.  */)
10612   (Lisp_Object ch)
10613 {
10614   Lisp_Object spec, attrs, charset_list;
10615   struct charset *charset;
10616   int c;
10617   unsigned code;
10618 
10619   CHECK_CHARACTER (ch);
10620   c = XFIXNAT (ch);
10621   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
10622   attrs = AREF (spec, 0);
10623   if (ASCII_CHAR_P (c)
10624       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
10625     return ch;
10626 
10627   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
10628   charset = char_charset (c, charset_list, &code);
10629   if (code == CHARSET_INVALID_CODE (charset))
10630     error ("Can't encode by Big5 encoding: %c", c);
10631 
10632   return make_fixnum (code);
10633 }
10634 
10635 
10636 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
10637        Sset_terminal_coding_system_internal, 1, 2, 0,
10638        doc: /* Internal use only.  */)
10639   (Lisp_Object coding_system, Lisp_Object terminal)
10640 {
10641   struct terminal *term = decode_live_terminal (terminal);
10642   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
10643   CHECK_SYMBOL (coding_system);
10644   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
10645   /* We had better not send unsafe characters to terminal.  */
10646   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
10647   /* Character composition should be disabled.  */
10648   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
10649   terminal_coding->src_multibyte = 1;
10650   terminal_coding->dst_multibyte = 0;
10651   tset_charset_list
10652     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
10653 	    ? coding_charset_list (terminal_coding)
10654 	    : list1i (charset_ascii)));
10655   return Qnil;
10656 }
10657 
10658 DEFUN ("set-safe-terminal-coding-system-internal",
10659        Fset_safe_terminal_coding_system_internal,
10660        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
10661        doc: /* Internal use only.  */)
10662   (Lisp_Object coding_system)
10663 {
10664   CHECK_SYMBOL (coding_system);
10665   setup_coding_system (Fcheck_coding_system (coding_system),
10666 		       &safe_terminal_coding);
10667   /* Character composition should be disabled.  */
10668   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
10669   safe_terminal_coding.src_multibyte = 1;
10670   safe_terminal_coding.dst_multibyte = 0;
10671   return Qnil;
10672 }
10673 
10674 DEFUN ("terminal-coding-system", Fterminal_coding_system,
10675        Sterminal_coding_system, 0, 1, 0,
10676        doc: /* Return coding system specified for terminal output on the given terminal.
10677 TERMINAL may be a terminal object, a frame, or nil for the selected
10678 frame's terminal device.  */)
10679   (Lisp_Object terminal)
10680 {
10681   struct coding_system *terminal_coding
10682     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
10683   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
10684 
10685   /* For backward compatibility, return nil if it is `undecided'.  */
10686   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
10687 }
10688 
10689 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
10690        Sset_keyboard_coding_system_internal, 1, 2, 0,
10691        doc: /* Internal use only.  */)
10692   (Lisp_Object coding_system, Lisp_Object terminal)
10693 {
10694   struct terminal *t = decode_live_terminal (terminal);
10695   CHECK_SYMBOL (coding_system);
10696   if (NILP (coding_system))
10697     coding_system = Qno_conversion;
10698   else
10699     Fcheck_coding_system (coding_system);
10700   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
10701   /* Character composition should be disabled.  */
10702   TERMINAL_KEYBOARD_CODING (t)->common_flags
10703     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
10704   return Qnil;
10705 }
10706 
10707 DEFUN ("keyboard-coding-system",
10708        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
10709        doc: /* Return coding system specified for decoding keyboard input.  */)
10710   (Lisp_Object terminal)
10711 {
10712   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
10713 			 (decode_live_terminal (terminal))->id);
10714 }
10715 
10716 
10717 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
10718        Sfind_operation_coding_system,  1, MANY, 0,
10719        doc: /* Choose a coding system for an operation based on the target name.
10720 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
10721 DECODING-SYSTEM is the coding system to use for decoding
10722 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
10723 for encoding (in case OPERATION does encoding).
10724 
10725 The first argument OPERATION specifies an I/O primitive:
10726   For file I/O, `insert-file-contents' or `write-region'.
10727   For process I/O, `call-process', `call-process-region', or `start-process'.
10728   For network I/O, `open-network-stream'.
10729 
10730 The remaining arguments should be the same arguments that were passed
10731 to the primitive.  Depending on which primitive, one of those arguments
10732 is selected as the TARGET.  For example, if OPERATION does file I/O,
10733 whichever argument specifies the file name is TARGET.
10734 
10735 TARGET has a meaning which depends on OPERATION:
10736   For file I/O, TARGET is a file name (except for the special case below).
10737   For process I/O, TARGET is a process name.
10738   For network I/O, TARGET is a service name or a port number.
10739 
10740 This function looks up what is specified for TARGET in
10741 `file-coding-system-alist', `process-coding-system-alist',
10742 or `network-coding-system-alist' depending on OPERATION.
10743 They may specify a coding system, a cons of coding systems,
10744 or a function symbol to call.
10745 In the last case, we call the function with one argument,
10746 which is a list of all the arguments given to this function.
10747 If the function can't decide a coding system, it can return
10748 `undecided' so that the normal code-detection is performed.
10749 
10750 If OPERATION is `insert-file-contents', the argument corresponding to
10751 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
10752 file name to look up, and BUFFER is a buffer that contains the file's
10753 contents (not yet decoded).  If `file-coding-system-alist' specifies a
10754 function to call for FILENAME, that function should examine the
10755 contents of BUFFER instead of reading the file.
10756 
10757 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
10758   (ptrdiff_t nargs, Lisp_Object *args)
10759 {
10760   Lisp_Object operation, target_idx, target, val;
10761   register Lisp_Object chain;
10762 
10763   if (nargs < 2)
10764     error ("Too few arguments");
10765   operation = args[0];
10766   if (!SYMBOLP (operation)
10767       || (target_idx = Fget (operation, Qtarget_idx), !FIXNATP (target_idx)))
10768     error ("Invalid first argument");
10769   if (nargs <= 1 + XFIXNAT (target_idx))
10770     error ("Too few arguments for operation `%s'",
10771 	   SDATA (SYMBOL_NAME (operation)));
10772   target = args[XFIXNAT (target_idx) + 1];
10773   if (!(STRINGP (target)
10774 	|| (EQ (operation, Qinsert_file_contents) && CONSP (target)
10775 	    && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
10776 	|| (EQ (operation, Qopen_network_stream)
10777 	    && (FIXNUMP (target) || EQ (target, Qt)))))
10778     error ("Invalid argument %"pI"d of operation `%s'",
10779 	   XFIXNAT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
10780   if (CONSP (target))
10781     target = XCAR (target);
10782 
10783   chain = ((EQ (operation, Qinsert_file_contents)
10784 	    || EQ (operation, Qwrite_region))
10785 	   ? Vfile_coding_system_alist
10786 	   : (EQ (operation, Qopen_network_stream)
10787 	      ? Vnetwork_coding_system_alist
10788 	      : Vprocess_coding_system_alist));
10789   if (NILP (chain))
10790     return Qnil;
10791 
10792   for (; CONSP (chain); chain = XCDR (chain))
10793     {
10794       Lisp_Object elt;
10795 
10796       elt = XCAR (chain);
10797       if (CONSP (elt)
10798 	  && ((STRINGP (target)
10799 	       && STRINGP (XCAR (elt))
10800 	       && fast_string_match (XCAR (elt), target) >= 0)
10801 	      || (FIXNUMP (target) && EQ (target, XCAR (elt)))))
10802 	{
10803 	  val = XCDR (elt);
10804 	  /* Here, if VAL is both a valid coding system and a valid
10805              function symbol, we return VAL as a coding system.  */
10806 	  if (CONSP (val))
10807 	    return val;
10808 	  if (! SYMBOLP (val))
10809 	    return Qnil;
10810 	  if (! NILP (Fcoding_system_p (val)))
10811 	    return Fcons (val, val);
10812 	  if (! NILP (Ffboundp (val)))
10813 	    {
10814 	      /* We use call1 rather than safe_call1
10815 		 so as to get bug reports about functions called here
10816 		 which don't handle the current interface.  */
10817 	      val = call1 (val, Flist (nargs, args));
10818 	      if (CONSP (val))
10819 		return val;
10820 	      if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
10821 		return Fcons (val, val);
10822 	    }
10823 	  return Qnil;
10824 	}
10825     }
10826   return Qnil;
10827 }
10828 
10829 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
10830        Sset_coding_system_priority, 0, MANY, 0,
10831        doc: /* Assign higher priority to the coding systems given as arguments.
10832 If multiple coding systems belong to the same category,
10833 all but the first one are ignored.
10834 
10835 usage: (set-coding-system-priority &rest coding-systems)  */)
10836   (ptrdiff_t nargs, Lisp_Object *args)
10837 {
10838   ptrdiff_t i, j;
10839   bool changed[coding_category_max];
10840   enum coding_category priorities[coding_category_max];
10841 
10842   memset (changed, 0, sizeof changed);
10843 
10844   for (i = j = 0; i < nargs; i++)
10845     {
10846       enum coding_category category;
10847       Lisp_Object spec, attrs;
10848 
10849       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
10850       attrs = AREF (spec, 0);
10851       category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
10852       if (changed[category])
10853 	/* Ignore this coding system because a coding system of the
10854 	   same category already had a higher priority.  */
10855 	continue;
10856       changed[category] = 1;
10857       priorities[j++] = category;
10858       if (coding_categories[category].id >= 0
10859 	  && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
10860 	setup_coding_system (args[i], &coding_categories[category]);
10861       Fset (AREF (Vcoding_category_table, category), args[i]);
10862     }
10863 
10864   /* Now we have decided top J priorities.  Reflect the order of the
10865      original priorities to the remaining priorities.  */
10866 
10867   for (i = j, j = 0; i < coding_category_max; i++, j++)
10868     {
10869       while (j < coding_category_max
10870 	     && changed[coding_priorities[j]])
10871 	j++;
10872       if (j == coding_category_max)
10873 	emacs_abort ();
10874       priorities[i] = coding_priorities[j];
10875     }
10876 
10877   memcpy (coding_priorities, priorities, sizeof priorities);
10878 
10879   /* Update `coding-category-list'.  */
10880   Vcoding_category_list = Qnil;
10881   for (i = coding_category_max; i-- > 0; )
10882     Vcoding_category_list
10883       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10884 	       Vcoding_category_list);
10885 
10886   return Qnil;
10887 }
10888 
10889 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10890        Scoding_system_priority_list, 0, 1, 0,
10891        doc: /* Return a list of coding systems ordered by their priorities.
10892 The list contains a subset of coding systems; i.e. coding systems
10893 assigned to each coding category (see `coding-category-list').
10894 
10895 HIGHESTP non-nil means just return the highest priority one.  */)
10896   (Lisp_Object highestp)
10897 {
10898   int i;
10899   Lisp_Object val;
10900 
10901   for (i = 0, val = Qnil; i < coding_category_max; i++)
10902     {
10903       enum coding_category category = coding_priorities[i];
10904       int id = coding_categories[category].id;
10905       Lisp_Object attrs;
10906 
10907       if (id < 0)
10908 	continue;
10909       attrs = CODING_ID_ATTRS (id);
10910       if (! NILP (highestp))
10911 	return CODING_ATTR_BASE_NAME (attrs);
10912       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10913     }
10914   return Fnreverse (val);
10915 }
10916 
10917 static Lisp_Object
make_subsidiaries(Lisp_Object base)10918 make_subsidiaries (Lisp_Object base)
10919 {
10920   static char const suffixes[][8] = { "-unix", "-dos", "-mac" };
10921   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10922   USE_SAFE_ALLOCA;
10923   char *buf = SAFE_ALLOCA (base_name_len + 6);
10924 
10925   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10926   Lisp_Object subsidiaries = make_nil_vector (3);
10927   for (int i = 0; i < 3; i++)
10928     {
10929       strcpy (buf + base_name_len, suffixes[i]);
10930       ASET (subsidiaries, i, intern (buf));
10931     }
10932   SAFE_FREE ();
10933   return subsidiaries;
10934 }
10935 
10936 
10937 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10938        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10939        doc: /* For internal use only.
10940 usage: (define-coding-system-internal ...)  */)
10941   (ptrdiff_t nargs, Lisp_Object *args)
10942 {
10943   enum coding_category category;
10944   int max_charset_id = 0;
10945 
10946   if (nargs < coding_arg_max)
10947     goto short_args;
10948 
10949   Lisp_Object attrs = make_nil_vector (coding_attr_last_index);
10950 
10951   Lisp_Object name = args[coding_arg_name];
10952   CHECK_SYMBOL (name);
10953   ASET (attrs, coding_attr_base_name, name);
10954 
10955   Lisp_Object val = args[coding_arg_mnemonic];
10956   /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
10957   if (STRINGP (val))
10958     val = make_fixnum (STRING_CHAR (SDATA (val)));
10959   else
10960     CHECK_CHARACTER (val);
10961   ASET (attrs, coding_attr_mnemonic, val);
10962 
10963   Lisp_Object coding_type = args[coding_arg_coding_type];
10964   CHECK_SYMBOL (coding_type);
10965   ASET (attrs, coding_attr_type, coding_type);
10966 
10967   Lisp_Object charset_list = args[coding_arg_charset_list];
10968   if (SYMBOLP (charset_list))
10969     {
10970       if (EQ (charset_list, Qiso_2022))
10971 	{
10972 	  if (! EQ (coding_type, Qiso_2022))
10973 	    error ("Invalid charset-list");
10974 	  charset_list = Viso_2022_charset_list;
10975 	}
10976       else if (EQ (charset_list, Qemacs_mule))
10977 	{
10978 	  if (! EQ (coding_type, Qemacs_mule))
10979 	    error ("Invalid charset-list");
10980 	  charset_list = Vemacs_mule_charset_list;
10981 	}
10982       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
10983 	{
10984 	  if (! RANGED_FIXNUMP (0, XCAR (tail), INT_MAX - 1))
10985 	    error ("Invalid charset-list");
10986 	  if (max_charset_id < XFIXNAT (XCAR (tail)))
10987 	    max_charset_id = XFIXNAT (XCAR (tail));
10988 	}
10989     }
10990   else
10991     {
10992       charset_list = Fcopy_sequence (charset_list);
10993       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
10994 	{
10995 	  struct charset *charset;
10996 
10997 	  val = XCAR (tail);
10998 	  CHECK_CHARSET_GET_CHARSET (val, charset);
10999 	  if (EQ (coding_type, Qiso_2022)
11000 	      ? CHARSET_ISO_FINAL (charset) < 0
11001 	      : EQ (coding_type, Qemacs_mule)
11002 	      ? CHARSET_EMACS_MULE_ID (charset) < 0
11003 	      : 0)
11004 	    error ("Can't handle charset `%s'",
11005 		   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11006 
11007 	  XSETCAR (tail, make_fixnum (charset->id));
11008 	  if (max_charset_id < charset->id)
11009 	    max_charset_id = charset->id;
11010 	}
11011     }
11012   ASET (attrs, coding_attr_charset_list, charset_list);
11013 
11014   Lisp_Object safe_charsets = make_uninit_string (max_charset_id + 1);
11015   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
11016   for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
11017     SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
11018   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
11019 
11020   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
11021 
11022   val = args[coding_arg_decode_translation_table];
11023   if (! CHAR_TABLE_P (val) && ! CONSP (val))
11024     CHECK_SYMBOL (val);
11025   ASET (attrs, coding_attr_decode_tbl, val);
11026 
11027   val = args[coding_arg_encode_translation_table];
11028   if (! CHAR_TABLE_P (val) && ! CONSP (val))
11029     CHECK_SYMBOL (val);
11030   ASET (attrs, coding_attr_encode_tbl, val);
11031 
11032   val = args[coding_arg_post_read_conversion];
11033   CHECK_SYMBOL (val);
11034   ASET (attrs, coding_attr_post_read, val);
11035 
11036   val = args[coding_arg_pre_write_conversion];
11037   CHECK_SYMBOL (val);
11038   ASET (attrs, coding_attr_pre_write, val);
11039 
11040   val = args[coding_arg_default_char];
11041   if (NILP (val))
11042     ASET (attrs, coding_attr_default_char, make_fixnum (' '));
11043   else
11044     {
11045       CHECK_CHARACTER (val);
11046       ASET (attrs, coding_attr_default_char, val);
11047     }
11048 
11049   val = args[coding_arg_for_unibyte];
11050   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
11051 
11052   val = args[coding_arg_plist];
11053   CHECK_LIST (val);
11054   ASET (attrs, coding_attr_plist, val);
11055 
11056   if (EQ (coding_type, Qcharset))
11057     {
11058       /* Generate a lisp vector of 256 elements.  Each element is nil,
11059 	 integer, or a list of charset IDs.
11060 
11061 	 If Nth element is nil, the byte code N is invalid in this
11062 	 coding system.
11063 
11064 	 If Nth element is a number NUM, N is the first byte of a
11065 	 charset whose ID is NUM.
11066 
11067 	 If Nth element is a list of charset IDs, N is the first byte
11068 	 of one of them.  The list is sorted by dimensions of the
11069 	 charsets.  A charset of smaller dimension comes first. */
11070       val = make_nil_vector (256);
11071 
11072       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
11073 	{
11074 	  struct charset *charset = CHARSET_FROM_ID (XFIXNAT (XCAR (tail)));
11075 	  int dim = CHARSET_DIMENSION (charset);
11076 	  int idx = (dim - 1) * 4;
11077 
11078 	  if (CHARSET_ASCII_COMPATIBLE_P (charset))
11079 	    ASET (attrs, coding_attr_ascii_compat, Qt);
11080 
11081 	  for (int i = charset->code_space[idx];
11082 	       i <= charset->code_space[idx + 1]; i++)
11083 	    {
11084 	      Lisp_Object tmp, tmp2;
11085 	      int dim2;
11086 
11087 	      tmp = AREF (val, i);
11088 	      if (NILP (tmp))
11089 		tmp = XCAR (tail);
11090 	      else if (FIXNATP (tmp))
11091 		{
11092 		  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (tmp)));
11093 		  if (dim < dim2)
11094 		    tmp = list2 (XCAR (tail), tmp);
11095 		  else
11096 		    tmp = list2 (tmp, XCAR (tail));
11097 		}
11098 	      else
11099 		{
11100 		  for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
11101 		    {
11102 		      dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (XCAR (tmp2))));
11103 		      if (dim < dim2)
11104 			break;
11105 		    }
11106 		  if (NILP (tmp2))
11107 		    tmp = nconc2 (tmp, list1 (XCAR (tail)));
11108 		  else
11109 		    {
11110 		      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
11111 		      XSETCAR (tmp2, XCAR (tail));
11112 		    }
11113 		}
11114 	      ASET (val, i, tmp);
11115 	    }
11116 	}
11117       ASET (attrs, coding_attr_charset_valids, val);
11118       category = coding_category_charset;
11119     }
11120   else if (EQ (coding_type, Qccl))
11121     {
11122       Lisp_Object valids;
11123 
11124       if (nargs < coding_arg_ccl_max)
11125 	goto short_args;
11126 
11127       val = args[coding_arg_ccl_decoder];
11128       CHECK_CCL_PROGRAM (val);
11129       if (VECTORP (val))
11130 	val = Fcopy_sequence (val);
11131       ASET (attrs, coding_attr_ccl_decoder, val);
11132 
11133       val = args[coding_arg_ccl_encoder];
11134       CHECK_CCL_PROGRAM (val);
11135       if (VECTORP (val))
11136 	val = Fcopy_sequence (val);
11137       ASET (attrs, coding_attr_ccl_encoder, val);
11138 
11139       val = args[coding_arg_ccl_valids];
11140       valids = Fmake_string (make_fixnum (256), make_fixnum (0), Qnil);
11141       for (Lisp_Object tail = val; CONSP (tail); tail = XCDR (tail))
11142 	{
11143 	  int from, to;
11144 
11145 	  val = XCAR (tail);
11146 	  if (FIXNUMP (val))
11147 	    {
11148 	      if (! (0 <= XFIXNUM (val) && XFIXNUM (val) <= 255))
11149 		args_out_of_range_3 (val, make_fixnum (0), make_fixnum (255));
11150 	      from = to = XFIXNUM (val);
11151 	    }
11152 	  else
11153 	    {
11154 	      CHECK_CONS (val);
11155 	      from = check_integer_range (XCAR (val), 0, 255);
11156 	      to = check_integer_range (XCDR (val), from, 255);
11157 	    }
11158 	  for (int i = from; i <= to; i++)
11159 	    SSET (valids, i, 1);
11160 	}
11161       ASET (attrs, coding_attr_ccl_valids, valids);
11162 
11163       category = coding_category_ccl;
11164     }
11165   else if (EQ (coding_type, Qutf_16))
11166     {
11167       Lisp_Object bom, endian;
11168 
11169       ASET (attrs, coding_attr_ascii_compat, Qnil);
11170 
11171       if (nargs < coding_arg_utf16_max)
11172 	goto short_args;
11173 
11174       bom = args[coding_arg_utf16_bom];
11175       if (! NILP (bom) && ! EQ (bom, Qt))
11176 	{
11177 	  CHECK_CONS (bom);
11178 	  val = XCAR (bom);
11179 	  CHECK_CODING_SYSTEM (val);
11180 	  val = XCDR (bom);
11181 	  CHECK_CODING_SYSTEM (val);
11182 	}
11183       ASET (attrs, coding_attr_utf_bom, bom);
11184 
11185       endian = args[coding_arg_utf16_endian];
11186       CHECK_SYMBOL (endian);
11187       if (NILP (endian))
11188 	endian = Qbig;
11189       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
11190 	error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
11191       ASET (attrs, coding_attr_utf_16_endian, endian);
11192 
11193       category = (CONSP (bom)
11194 		  ? coding_category_utf_16_auto
11195 		  : NILP (bom)
11196 		  ? (EQ (endian, Qbig)
11197 		     ? coding_category_utf_16_be_nosig
11198 		     : coding_category_utf_16_le_nosig)
11199 		  : (EQ (endian, Qbig)
11200 		     ? coding_category_utf_16_be
11201 		     : coding_category_utf_16_le));
11202     }
11203   else if (EQ (coding_type, Qiso_2022))
11204     {
11205       Lisp_Object initial, reg_usage, request, flags;
11206 
11207       if (nargs < coding_arg_iso2022_max)
11208 	goto short_args;
11209 
11210       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
11211       CHECK_VECTOR (initial);
11212       for (int i = 0; i < 4; i++)
11213 	{
11214 	  val = AREF (initial, i);
11215 	  if (! NILP (val))
11216 	    {
11217 	      struct charset *charset;
11218 
11219 	      CHECK_CHARSET_GET_CHARSET (val, charset);
11220 	      ASET (initial, i, make_fixnum (CHARSET_ID (charset)));
11221 	      if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
11222 		ASET (attrs, coding_attr_ascii_compat, Qt);
11223 	    }
11224 	  else
11225 	    ASET (initial, i, make_fixnum (-1));
11226 	}
11227 
11228       reg_usage = args[coding_arg_iso2022_reg_usage];
11229       CHECK_CONS (reg_usage);
11230       CHECK_FIXNUM (XCAR (reg_usage));
11231       CHECK_FIXNUM (XCDR (reg_usage));
11232 
11233       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
11234       for (Lisp_Object tail = request; CONSP (tail); tail = XCDR (tail))
11235 	{
11236 	  int id;
11237 
11238 	  val = XCAR (tail);
11239 	  CHECK_CONS (val);
11240 	  CHECK_CHARSET_GET_ID (XCAR (val), id);
11241 	  check_integer_range (XCDR (val), 0, 3);
11242 	  XSETCAR (val, make_fixnum (id));
11243 	}
11244 
11245       flags = args[coding_arg_iso2022_flags];
11246       CHECK_FIXNAT (flags);
11247       int i = XFIXNUM (flags) & INT_MAX;
11248       if (EQ (args[coding_arg_charset_list], Qiso_2022))
11249 	i |= CODING_ISO_FLAG_FULL_SUPPORT;
11250       flags = make_fixnum (i);
11251 
11252       ASET (attrs, coding_attr_iso_initial, initial);
11253       ASET (attrs, coding_attr_iso_usage, reg_usage);
11254       ASET (attrs, coding_attr_iso_request, request);
11255       ASET (attrs, coding_attr_iso_flags, flags);
11256       setup_iso_safe_charsets (attrs);
11257 
11258       if (i & CODING_ISO_FLAG_SEVEN_BITS)
11259 	category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
11260 			  | CODING_ISO_FLAG_SINGLE_SHIFT))
11261 		    ? coding_category_iso_7_else
11262 		    : EQ (args[coding_arg_charset_list], Qiso_2022)
11263 		    ? coding_category_iso_7
11264 		    : coding_category_iso_7_tight);
11265       else
11266 	{
11267 	  int id = XFIXNUM (AREF (initial, 1));
11268 
11269 	  category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
11270 		       || EQ (args[coding_arg_charset_list], Qiso_2022)
11271 		       || id < 0)
11272 		      ? coding_category_iso_8_else
11273 		      : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
11274 		      ? coding_category_iso_8_1
11275 		      : coding_category_iso_8_2);
11276 	}
11277       if (category != coding_category_iso_8_1
11278 	  && category != coding_category_iso_8_2)
11279 	ASET (attrs, coding_attr_ascii_compat, Qnil);
11280     }
11281   else if (EQ (coding_type, Qemacs_mule))
11282     {
11283       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
11284 	ASET (attrs, coding_attr_emacs_mule_full, Qt);
11285       ASET (attrs, coding_attr_ascii_compat, Qt);
11286       category = coding_category_emacs_mule;
11287     }
11288   else if (EQ (coding_type, Qshift_jis))
11289     {
11290       ptrdiff_t charset_list_len = list_length (charset_list);
11291       if (charset_list_len != 3 && charset_list_len != 4)
11292 	error ("There should be three or four charsets");
11293 
11294       struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11295       if (CHARSET_DIMENSION (charset) != 1)
11296 	error ("Dimension of charset %s is not one",
11297 	       SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11298       if (CHARSET_ASCII_COMPATIBLE_P (charset))
11299 	ASET (attrs, coding_attr_ascii_compat, Qt);
11300 
11301       charset_list = XCDR (charset_list);
11302       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11303       if (CHARSET_DIMENSION (charset) != 1)
11304 	error ("Dimension of charset %s is not one",
11305 	       SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11306 
11307       charset_list = XCDR (charset_list);
11308       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11309       if (CHARSET_DIMENSION (charset) != 2)
11310 	error ("Dimension of charset %s is not two",
11311 	       SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11312 
11313       charset_list = XCDR (charset_list);
11314       if (! NILP (charset_list))
11315 	{
11316 	  charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11317 	  if (CHARSET_DIMENSION (charset) != 2)
11318 	    error ("Dimension of charset %s is not two",
11319 		   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11320 	}
11321 
11322       category = coding_category_sjis;
11323       Vsjis_coding_system = name;
11324     }
11325   else if (EQ (coding_type, Qbig5))
11326     {
11327       struct charset *charset;
11328 
11329       if (list_length (charset_list) != 2)
11330 	error ("There should be just two charsets");
11331 
11332       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11333       if (CHARSET_DIMENSION (charset) != 1)
11334 	error ("Dimension of charset %s is not one",
11335 	       SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11336       if (CHARSET_ASCII_COMPATIBLE_P (charset))
11337 	ASET (attrs, coding_attr_ascii_compat, Qt);
11338 
11339       charset_list = XCDR (charset_list);
11340       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
11341       if (CHARSET_DIMENSION (charset) != 2)
11342 	error ("Dimension of charset %s is not two",
11343 	       SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
11344 
11345       category = coding_category_big5;
11346       Vbig5_coding_system = name;
11347     }
11348   else if (EQ (coding_type, Qraw_text))
11349     {
11350       category = coding_category_raw_text;
11351       ASET (attrs, coding_attr_ascii_compat, Qt);
11352     }
11353   else if (EQ (coding_type, Qutf_8))
11354     {
11355       Lisp_Object bom;
11356 
11357       if (nargs < coding_arg_utf8_max)
11358 	goto short_args;
11359 
11360       bom = args[coding_arg_utf8_bom];
11361       if (! NILP (bom) && ! EQ (bom, Qt))
11362 	{
11363 	  CHECK_CONS (bom);
11364 	  val = XCAR (bom);
11365 	  CHECK_CODING_SYSTEM (val);
11366 	  val = XCDR (bom);
11367 	  CHECK_CODING_SYSTEM (val);
11368 	}
11369       ASET (attrs, coding_attr_utf_bom, bom);
11370       if (NILP (bom))
11371 	ASET (attrs, coding_attr_ascii_compat, Qt);
11372 
11373       category = (CONSP (bom) ? coding_category_utf_8_auto
11374 		  : NILP (bom) ? coding_category_utf_8_nosig
11375 		  : coding_category_utf_8_sig);
11376     }
11377   else if (EQ (coding_type, Qundecided))
11378     {
11379       if (nargs < coding_arg_undecided_max)
11380 	goto short_args;
11381       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
11382 	    args[coding_arg_undecided_inhibit_null_byte_detection]);
11383       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
11384 	    args[coding_arg_undecided_inhibit_iso_escape_detection]);
11385       ASET (attrs, coding_attr_undecided_prefer_utf_8,
11386 	    args[coding_arg_undecided_prefer_utf_8]);
11387       category = coding_category_undecided;
11388     }
11389   else
11390     error ("Invalid coding system type: %s",
11391 	   SDATA (SYMBOL_NAME (coding_type)));
11392 
11393   ASET (attrs, coding_attr_category, make_fixnum (category));
11394   ASET (attrs, coding_attr_plist,
11395 	Fcons (QCcategory,
11396 	       Fcons (AREF (Vcoding_category_table, category),
11397 		      CODING_ATTR_PLIST (attrs))));
11398   ASET (attrs, coding_attr_plist,
11399 	Fcons (QCascii_compatible_p,
11400 	       Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
11401 		      CODING_ATTR_PLIST (attrs))));
11402 
11403   Lisp_Object eol_type = args[coding_arg_eol_type];
11404   if (! NILP (eol_type)
11405       && ! EQ (eol_type, Qunix)
11406       && ! EQ (eol_type, Qdos)
11407       && ! EQ (eol_type, Qmac))
11408     error ("Invalid eol-type");
11409 
11410   Lisp_Object aliases = list1 (name);
11411 
11412   if (NILP (eol_type))
11413     {
11414       eol_type = make_subsidiaries (name);
11415       for (int i = 0; i < 3; i++)
11416 	{
11417 	  Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
11418 
11419 	  this_name = AREF (eol_type, i);
11420 	  this_aliases = list1 (this_name);
11421 	  this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
11422 	  this_spec = make_uninit_vector (3);
11423 	  ASET (this_spec, 0, attrs);
11424 	  ASET (this_spec, 1, this_aliases);
11425 	  ASET (this_spec, 2, this_eol_type);
11426 	  Fputhash (this_name, this_spec, Vcoding_system_hash_table);
11427 	  Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
11428 	  val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
11429 	  if (NILP (val))
11430 	    Vcoding_system_alist
11431 	      = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
11432 		       Vcoding_system_alist);
11433 	}
11434     }
11435 
11436   Lisp_Object spec_vec = make_uninit_vector (3);
11437   ASET (spec_vec, 0, attrs);
11438   ASET (spec_vec, 1, aliases);
11439   ASET (spec_vec, 2, eol_type);
11440 
11441   Fputhash (name, spec_vec, Vcoding_system_hash_table);
11442   Vcoding_system_list = Fcons (name, Vcoding_system_list);
11443   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
11444   if (NILP (val))
11445     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
11446 				  Vcoding_system_alist);
11447 
11448   int id = coding_categories[category].id;
11449   if (id < 0 || EQ (name, CODING_ID_NAME (id)))
11450       setup_coding_system (name, &coding_categories[category]);
11451 
11452   return Qnil;
11453 
11454  short_args:
11455   Fsignal (Qwrong_number_of_arguments,
11456 	   Fcons (intern ("define-coding-system-internal"),
11457 		  make_fixnum (nargs)));
11458 }
11459 
11460 
11461 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
11462        3, 3, 0,
11463        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
11464   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
11465 {
11466   Lisp_Object spec, attrs;
11467 
11468   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
11469   attrs = AREF (spec, 0);
11470   if (EQ (prop, QCmnemonic))
11471     {
11472       /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
11473       if (STRINGP (val))
11474 	val = make_fixnum (STRING_CHAR (SDATA (val)));
11475       else
11476 	CHECK_CHARACTER (val);
11477       ASET (attrs, coding_attr_mnemonic, val);
11478     }
11479   else if (EQ (prop, QCdefault_char))
11480     {
11481       if (NILP (val))
11482 	val = make_fixnum (' ');
11483       else
11484 	CHECK_CHARACTER (val);
11485       ASET (attrs, coding_attr_default_char, val);
11486     }
11487   else if (EQ (prop, QCdecode_translation_table))
11488     {
11489       if (! CHAR_TABLE_P (val) && ! CONSP (val))
11490 	CHECK_SYMBOL (val);
11491       ASET (attrs, coding_attr_decode_tbl, val);
11492     }
11493   else if (EQ (prop, QCencode_translation_table))
11494     {
11495       if (! CHAR_TABLE_P (val) && ! CONSP (val))
11496 	CHECK_SYMBOL (val);
11497       ASET (attrs, coding_attr_encode_tbl, val);
11498     }
11499   else if (EQ (prop, QCpost_read_conversion))
11500     {
11501       CHECK_SYMBOL (val);
11502       ASET (attrs, coding_attr_post_read, val);
11503     }
11504   else if (EQ (prop, QCpre_write_conversion))
11505     {
11506       CHECK_SYMBOL (val);
11507       ASET (attrs, coding_attr_pre_write, val);
11508     }
11509   else if (EQ (prop, QCascii_compatible_p))
11510     {
11511       ASET (attrs, coding_attr_ascii_compat, val);
11512     }
11513 
11514   ASET (attrs, coding_attr_plist,
11515 	Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
11516   return val;
11517 }
11518 
11519 
11520 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
11521        Sdefine_coding_system_alias, 2, 2, 0,
11522        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
11523   (Lisp_Object alias, Lisp_Object coding_system)
11524 {
11525   Lisp_Object spec, aliases, eol_type, val;
11526 
11527   CHECK_SYMBOL (alias);
11528   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
11529   aliases = AREF (spec, 1);
11530   /* ALIASES should be a list of length more than zero, and the first
11531      element is a base coding system.  Append ALIAS at the tail of the
11532      list.  */
11533   while (!NILP (XCDR (aliases)))
11534     aliases = XCDR (aliases);
11535   XSETCDR (aliases, list1 (alias));
11536 
11537   eol_type = AREF (spec, 2);
11538   if (VECTORP (eol_type))
11539     {
11540       Lisp_Object subsidiaries;
11541       int i;
11542 
11543       subsidiaries = make_subsidiaries (alias);
11544       for (i = 0; i < 3; i++)
11545 	Fdefine_coding_system_alias (AREF (subsidiaries, i),
11546 				     AREF (eol_type, i));
11547     }
11548 
11549   Fputhash (alias, spec, Vcoding_system_hash_table);
11550   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
11551   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
11552   if (NILP (val))
11553     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
11554 				  Vcoding_system_alist);
11555 
11556   return Qnil;
11557 }
11558 
11559 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
11560        1, 1, 0,
11561        doc: /* Return the base of CODING-SYSTEM.
11562 Any alias or subsidiary coding system is not a base coding system.  */)
11563   (Lisp_Object coding_system)
11564 {
11565   Lisp_Object spec, attrs;
11566 
11567   if (NILP (coding_system))
11568     return (Qno_conversion);
11569   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
11570   attrs = AREF (spec, 0);
11571   return CODING_ATTR_BASE_NAME (attrs);
11572 }
11573 
11574 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
11575        1, 1, 0,
11576        doc: /* Return the property list of CODING-SYSTEM.  */)
11577   (Lisp_Object coding_system)
11578 {
11579   Lisp_Object spec, attrs;
11580 
11581   if (NILP (coding_system))
11582     coding_system = Qno_conversion;
11583   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
11584   attrs = AREF (spec, 0);
11585   return CODING_ATTR_PLIST (attrs);
11586 }
11587 
11588 
11589 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
11590        1, 1, 0,
11591        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
11592   (Lisp_Object coding_system)
11593 {
11594   Lisp_Object spec;
11595 
11596   if (NILP (coding_system))
11597     coding_system = Qno_conversion;
11598   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
11599   return AREF (spec, 1);
11600 }
11601 
11602 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
11603        Scoding_system_eol_type, 1, 1, 0,
11604        doc: /* Return eol-type of CODING-SYSTEM.
11605 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
11606 
11607 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
11608 and CR respectively.
11609 
11610 A vector value indicates that a format of end-of-line should be
11611 detected automatically.  Nth element of the vector is the subsidiary
11612 coding system whose eol-type is N.  */)
11613   (Lisp_Object coding_system)
11614 {
11615   Lisp_Object spec, eol_type;
11616   int n;
11617 
11618   if (NILP (coding_system))
11619     coding_system = Qno_conversion;
11620   if (! CODING_SYSTEM_P (coding_system))
11621     return Qnil;
11622   spec = CODING_SYSTEM_SPEC (coding_system);
11623   eol_type = AREF (spec, 2);
11624   if (VECTORP (eol_type))
11625     return Fcopy_sequence (eol_type);
11626   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
11627   return make_fixnum (n);
11628 }
11629 
11630 
/*** 12. Postamble ***/
11632 
11633 void
init_coding_once(void)11634 init_coding_once (void)
11635 {
11636   int i;
11637 
11638   for (i = 0; i < coding_category_max; i++)
11639     {
11640       coding_categories[i].id = -1;
11641       coding_priorities[i] = i;
11642     }
11643 
11644   PDUMPER_REMEMBER_SCALAR (coding_categories);
11645   PDUMPER_REMEMBER_SCALAR (coding_priorities);
11646 
11647   /* ISO2022 specific initialize routine.  */
11648   for (i = 0; i < 0x20; i++)
11649     iso_code_class[i] = ISO_control_0;
11650   for (i = 0x21; i < 0x7F; i++)
11651     iso_code_class[i] = ISO_graphic_plane_0;
11652   for (i = 0x80; i < 0xA0; i++)
11653     iso_code_class[i] = ISO_control_1;
11654   for (i = 0xA1; i < 0xFF; i++)
11655     iso_code_class[i] = ISO_graphic_plane_1;
11656   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
11657   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
11658   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
11659   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
11660   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
11661   iso_code_class[ISO_CODE_ESC] = ISO_escape;
11662   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
11663   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
11664   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
11665 
11666   PDUMPER_REMEMBER_SCALAR (iso_code_class);
11667 
11668   for (i = 0; i < 256; i++)
11669     {
11670       emacs_mule_bytes[i] = 1;
11671     }
11672   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
11673   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
11674   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
11675   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
11676 
11677   PDUMPER_REMEMBER_SCALAR (emacs_mule_bytes);
11678 }
11679 
11680 static void reset_coding_after_pdumper_load (void);
11681 
11682 void
syms_of_coding(void)11683 syms_of_coding (void)
11684 {
11685   staticpro (&Vcoding_system_hash_table);
11686   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
11687 
11688   staticpro (&Vsjis_coding_system);
11689   Vsjis_coding_system = Qnil;
11690 
11691   staticpro (&Vbig5_coding_system);
11692   Vbig5_coding_system = Qnil;
11693 
11694   staticpro (&Vcode_conversion_reused_workbuf);
11695   Vcode_conversion_reused_workbuf = Qnil;
11696 
11697   staticpro (&Vcode_conversion_workbuf_name);
11698   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
11699 
11700   reused_workbuf_in_use = false;
11701   PDUMPER_REMEMBER_SCALAR (reused_workbuf_in_use);
11702 
11703   DEFSYM (Qcharset, "charset");
11704   DEFSYM (Qtarget_idx, "target-idx");
11705   DEFSYM (Qcoding_system_history, "coding-system-history");
11706   Fset (Qcoding_system_history, Qnil);
11707 
11708   /* Target FILENAME is the first argument.  */
11709   Fput (Qinsert_file_contents, Qtarget_idx, make_fixnum (0));
11710   /* Target FILENAME is the third argument.  */
11711   Fput (Qwrite_region, Qtarget_idx, make_fixnum (2));
11712 
11713   DEFSYM (Qcall_process, "call-process");
11714   /* Target PROGRAM is the first argument.  */
11715   Fput (Qcall_process, Qtarget_idx, make_fixnum (0));
11716 
11717   DEFSYM (Qcall_process_region, "call-process-region");
11718   /* Target PROGRAM is the third argument.  */
11719   Fput (Qcall_process_region, Qtarget_idx, make_fixnum (2));
11720 
11721   DEFSYM (Qstart_process, "start-process");
11722   /* Target PROGRAM is the third argument.  */
11723   Fput (Qstart_process, Qtarget_idx, make_fixnum (2));
11724 
11725   DEFSYM (Qopen_network_stream, "open-network-stream");
11726   /* Target SERVICE is the fourth argument.  */
11727   Fput (Qopen_network_stream, Qtarget_idx, make_fixnum (3));
11728 
11729   DEFSYM (Qunix, "unix");
11730   DEFSYM (Qdos, "dos");
11731   DEFSYM (Qmac, "mac");
11732 
11733   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
11734   DEFSYM (Qundecided, "undecided");
11735   DEFSYM (Qno_conversion, "no-conversion");
11736   DEFSYM (Qraw_text, "raw-text");
11737   DEFSYM (Qus_ascii, "us-ascii");
11738 
11739   DEFSYM (Qiso_2022, "iso-2022");
11740 
11741   DEFSYM (Qutf_8, "utf-8");
11742   DEFSYM (Qutf_8_unix, "utf-8-unix");
11743   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
11744 
11745 #if defined (WINDOWSNT) || defined (CYGWIN)
11746   /* No, not utf-16-le: that one has a BOM.  */
11747   DEFSYM (Qutf_16le, "utf-16le");
11748 #endif
11749 
11750   DEFSYM (Qutf_16, "utf-16");
11751   DEFSYM (Qbig, "big");
11752   DEFSYM (Qlittle, "little");
11753 
11754   DEFSYM (Qshift_jis, "shift-jis");
11755   DEFSYM (Qbig5, "big5");
11756 
11757   DEFSYM (Qcoding_system_p, "coding-system-p");
11758 
11759   /* Error signaled when there's a problem with detecting a coding system.  */
11760   DEFSYM (Qcoding_system_error, "coding-system-error");
11761   Fput (Qcoding_system_error, Qerror_conditions,
11762 	pure_list (Qcoding_system_error, Qerror));
11763   Fput (Qcoding_system_error, Qerror_message,
11764 	build_pure_c_string ("Invalid coding system"));
11765 
11766   DEFSYM (Qtranslation_table, "translation-table");
11767   Fput (Qtranslation_table, Qchar_table_extra_slots, make_fixnum (2));
11768   DEFSYM (Qtranslation_table_id, "translation-table-id");
11769 
11770   /* Coding system emacs-mule and raw-text are for converting only
11771      end-of-line format.  */
11772   DEFSYM (Qemacs_mule, "emacs-mule");
11773 
11774   DEFSYM (QCcategory, ":category");
11775   DEFSYM (QCmnemonic, ":mnemonic");
11776   DEFSYM (QCdefault_char, ":default-char");
11777   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
11778   DEFSYM (QCencode_translation_table, ":encode-translation-table");
11779   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
11780   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
11781   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
11782 
11783   Vcoding_category_table = make_nil_vector (coding_category_max);
11784   staticpro (&Vcoding_category_table);
11785   /* Followings are target of code detection.  */
11786   ASET (Vcoding_category_table, coding_category_iso_7,
11787 	intern_c_string ("coding-category-iso-7"));
11788   ASET (Vcoding_category_table, coding_category_iso_7_tight,
11789 	intern_c_string ("coding-category-iso-7-tight"));
11790   ASET (Vcoding_category_table, coding_category_iso_8_1,
11791 	intern_c_string ("coding-category-iso-8-1"));
11792   ASET (Vcoding_category_table, coding_category_iso_8_2,
11793 	intern_c_string ("coding-category-iso-8-2"));
11794   ASET (Vcoding_category_table, coding_category_iso_7_else,
11795 	intern_c_string ("coding-category-iso-7-else"));
11796   ASET (Vcoding_category_table, coding_category_iso_8_else,
11797 	intern_c_string ("coding-category-iso-8-else"));
11798   ASET (Vcoding_category_table, coding_category_utf_8_auto,
11799 	intern_c_string ("coding-category-utf-8-auto"));
11800   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
11801 	intern_c_string ("coding-category-utf-8"));
11802   ASET (Vcoding_category_table, coding_category_utf_8_sig,
11803 	intern_c_string ("coding-category-utf-8-sig"));
11804   ASET (Vcoding_category_table, coding_category_utf_16_be,
11805 	intern_c_string ("coding-category-utf-16-be"));
11806   ASET (Vcoding_category_table, coding_category_utf_16_auto,
11807 	intern_c_string ("coding-category-utf-16-auto"));
11808   ASET (Vcoding_category_table, coding_category_utf_16_le,
11809 	intern_c_string ("coding-category-utf-16-le"));
11810   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
11811 	intern_c_string ("coding-category-utf-16-be-nosig"));
11812   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
11813 	intern_c_string ("coding-category-utf-16-le-nosig"));
11814   ASET (Vcoding_category_table, coding_category_charset,
11815 	intern_c_string ("coding-category-charset"));
11816   ASET (Vcoding_category_table, coding_category_sjis,
11817 	intern_c_string ("coding-category-sjis"));
11818   ASET (Vcoding_category_table, coding_category_big5,
11819 	intern_c_string ("coding-category-big5"));
11820   ASET (Vcoding_category_table, coding_category_ccl,
11821 	intern_c_string ("coding-category-ccl"));
11822   ASET (Vcoding_category_table, coding_category_emacs_mule,
11823 	intern_c_string ("coding-category-emacs-mule"));
11824   /* Followings are NOT target of code detection.  */
11825   ASET (Vcoding_category_table, coding_category_raw_text,
11826 	intern_c_string ("coding-category-raw-text"));
11827   ASET (Vcoding_category_table, coding_category_undecided,
11828 	intern_c_string ("coding-category-undecided"));
11829 
11830   DEFSYM (Qinsufficient_source, "insufficient-source");
11831   DEFSYM (Qinvalid_source, "invalid-source");
11832   DEFSYM (Qinterrupted, "interrupted");
11833 
11834   /* If a symbol has this property, evaluate the value to define the
11835      symbol as a coding system.  */
11836   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
11837 
11838   DEFSYM (Qignored, "ignored");
11839 
11840   DEFSYM (Qutf_8_string_p, "utf-8-string-p");
11841   DEFSYM (Qfilenamep, "filenamep");
11842 
11843   defsubr (&Scoding_system_p);
11844   defsubr (&Sread_coding_system);
11845   defsubr (&Sread_non_nil_coding_system);
11846   defsubr (&Scheck_coding_system);
11847   defsubr (&Sdetect_coding_region);
11848   defsubr (&Sdetect_coding_string);
11849   defsubr (&Sfind_coding_systems_region_internal);
11850   defsubr (&Sunencodable_char_position);
11851   defsubr (&Scheck_coding_systems_region);
11852   defsubr (&Sdecode_coding_region);
11853   defsubr (&Sencode_coding_region);
11854   defsubr (&Sdecode_coding_string);
11855   defsubr (&Sencode_coding_string);
11856 #ifdef ENABLE_UTF_8_CONVERTER_TEST
11857   defsubr (&Sinternal_encode_string_utf_8);
11858   defsubr (&Sinternal_decode_string_utf_8);
11859 #endif	/* ENABLE_UTF_8_CONVERTER_TEST */
11860   defsubr (&Sdecode_sjis_char);
11861   defsubr (&Sencode_sjis_char);
11862   defsubr (&Sdecode_big5_char);
11863   defsubr (&Sencode_big5_char);
11864   defsubr (&Sset_terminal_coding_system_internal);
11865   defsubr (&Sset_safe_terminal_coding_system_internal);
11866   defsubr (&Sterminal_coding_system);
11867   defsubr (&Sset_keyboard_coding_system_internal);
11868   defsubr (&Skeyboard_coding_system);
11869   defsubr (&Sfind_operation_coding_system);
11870   defsubr (&Sset_coding_system_priority);
11871   defsubr (&Sdefine_coding_system_internal);
11872   defsubr (&Sdefine_coding_system_alias);
11873   defsubr (&Scoding_system_put);
11874   defsubr (&Scoding_system_base);
11875   defsubr (&Scoding_system_plist);
11876   defsubr (&Scoding_system_aliases);
11877   defsubr (&Scoding_system_eol_type);
11878   defsubr (&Scoding_system_priority_list);
11879 
11880   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11881 	       doc: /* List of coding systems.
11882 
11883 Do not alter the value of this variable manually.  This variable should be
11884 updated by the functions `define-coding-system' and
11885 `define-coding-system-alias'.  */);
11886   Vcoding_system_list = Qnil;
11887 
11888   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11889 	       doc: /* Alist of coding system names.
11890 Each element is one element list of coding system name.
11891 This variable is given to `completing-read' as COLLECTION argument.
11892 
11893 Do not alter the value of this variable manually.  This variable should be
11894 updated by `define-coding-system-alias'.  */);
11895   Vcoding_system_alist = Qnil;
11896 
11897   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11898 	       doc: /* List of coding-categories (symbols) ordered by priority.
11899 
11900 On detecting a coding system, Emacs tries code detection algorithms
11901 associated with each coding-category one by one in this order.  When
11902 one algorithm agrees with a byte sequence of source text, the coding
11903 system bound to the corresponding coding-category is selected.
11904 
11905 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11906   {
11907     int i;
11908 
11909     Vcoding_category_list = Qnil;
11910     for (i = coding_category_max - 1; i >= 0; i--)
11911       Vcoding_category_list
11912 	= Fcons (AREF (Vcoding_category_table, i),
11913 		 Vcoding_category_list);
11914   }
11915 
11916   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11917 	       doc: /* Specify the coding system for read operations.
11918 It is useful to bind this variable with `let', but do not set it globally.
11919 If the value is a coding system, it is used for decoding on read operation.
11920 If not, an appropriate element is used from one of the coding system alists.
11921 There are three such tables: `file-coding-system-alist',
11922 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11923   Vcoding_system_for_read = Qnil;
11924 
11925   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11926 	       doc: /* Specify the coding system for write operations.
11927 Programs bind this variable with `let', but you should not set it globally.
11928 If the value is a coding system, it is used for encoding of output,
11929 when writing it to a file and when sending it to a file or subprocess.
11930 
11931 If this does not specify a coding system, an appropriate element
11932 is used from one of the coding system alists.
11933 There are three such tables: `file-coding-system-alist',
11934 `process-coding-system-alist', and `network-coding-system-alist'.
11935 For output to files, if the above procedure does not specify a coding system,
11936 the value of `buffer-file-coding-system' is used.  */);
11937   Vcoding_system_for_write = Qnil;
11938 
11939   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11940 	       doc: /*
11941 Coding system used in the latest file or process I/O.  */);
11942   Vlast_coding_system_used = Qnil;
11943 
11944   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11945 	       doc: /*
11946 Error status of the last code conversion.
11947 
11948 When an error was detected in the last code conversion, this variable
11949 is set to one of the following symbols.
11950   `insufficient-source'
11951   `inconsistent-eol'
11952   `invalid-source'
11953   `interrupted'
11954   `insufficient-memory'
11955 When no error was detected, the value doesn't change.  So, to check
11956 the error status of a code conversion by this variable, you must
11957 explicitly set this variable to nil before performing code
11958 conversion.  */);
11959   Vlast_code_conversion_error = Qnil;
11960 
11961   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11962 	       doc: /*
11963 Non-nil means always inhibit code conversion of end-of-line format.
11964 See info node `Coding Systems' and info node `Text and Binary' concerning
11965 such conversion.  */);
11966   inhibit_eol_conversion = 0;
11967 
11968   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11969 	       doc: /*
11970 Non-nil means process buffer inherits coding system of process output.
11971 Bind it to t if the process output is to be treated as if it were a file
11972 read from some filesystem.  */);
11973   inherit_process_coding_system = 0;
11974 
11975   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11976 	       doc: /*
11977 Alist to decide a coding system to use for a file I/O operation.
11978 The format is ((PATTERN . VAL) ...),
11979 where PATTERN is a regular expression matching a file name,
11980 VAL is a coding system, a cons of coding systems, or a function symbol.
11981 If VAL is a coding system, it is used for both decoding and encoding
11982 the file contents.
11983 If VAL is a cons of coding systems, the car part is used for decoding,
11984 and the cdr part is used for encoding.
11985 If VAL is a function symbol, the function must return a coding system
11986 or a cons of coding systems which are used as above.  The function is
11987 called with an argument that is a list of the arguments with which
11988 `find-operation-coding-system' was called.  If the function can't decide
11989 a coding system, it can return `undecided' so that the normal
11990 code-detection is performed.
11991 
11992 See also the function `find-operation-coding-system'
11993 and the variable `auto-coding-alist'.  */);
11994   Vfile_coding_system_alist = Qnil;
11995 
11996   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11997 	       doc: /*
11998 Alist to decide a coding system to use for a process I/O operation.
11999 The format is ((PATTERN . VAL) ...),
12000 where PATTERN is a regular expression matching a program name,
12001 VAL is a coding system, a cons of coding systems, or a function symbol.
12002 If VAL is a coding system, it is used for both decoding what received
12003 from the program and encoding what sent to the program.
12004 If VAL is a cons of coding systems, the car part is used for decoding,
12005 and the cdr part is used for encoding.
12006 If VAL is a function symbol, the function must return a coding system
12007 or a cons of coding systems which are used as above.
12008 
12009 See also the function `find-operation-coding-system'.  */);
12010   Vprocess_coding_system_alist = Qnil;
12011 
12012   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
12013 	       doc: /*
12014 Alist to decide a coding system to use for a network I/O operation.
12015 The format is ((PATTERN . VAL) ...),
12016 where PATTERN is a regular expression matching a network service name
12017 or is a port number to connect to,
12018 VAL is a coding system, a cons of coding systems, or a function symbol.
12019 If VAL is a coding system, it is used for both decoding what received
12020 from the network stream and encoding what sent to the network stream.
12021 If VAL is a cons of coding systems, the car part is used for decoding,
12022 and the cdr part is used for encoding.
12023 If VAL is a function symbol, the function must return a coding system
12024 or a cons of coding systems which are used as above.
12025 
12026 See also the function `find-operation-coding-system'.  */);
12027   Vnetwork_coding_system_alist = Qnil;
12028 
12029   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
12030 	       doc: /* Coding system to use with system messages.
12031 Also used for decoding keyboard input on X Window system, and for
12032 encoding standard output and error streams.  */);
12033   Vlocale_coding_system = Qnil;
12034 
12035   /* The eol mnemonics are reset in startup.el system-dependently.  */
12036   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
12037 	       doc: /*
12038 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
12039   eol_mnemonic_unix = build_pure_c_string (":");
12040 
12041   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
12042 	       doc: /*
12043 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
12044   eol_mnemonic_dos = build_pure_c_string ("\\");
12045 
12046   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
12047 	       doc: /*
12048 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
12049   eol_mnemonic_mac = build_pure_c_string ("/");
12050 
12051   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
12052 	       doc: /*
12053 String displayed in mode line when end-of-line format is not yet determined.  */);
12054   eol_mnemonic_undecided = build_pure_c_string (":");
12055 
12056   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
12057 	       doc: /*
12058 Non-nil enables character translation while encoding and decoding.  */);
12059   Venable_character_translation = Qt;
12060 
12061   DEFVAR_LISP ("standard-translation-table-for-decode",
12062 	       Vstandard_translation_table_for_decode,
12063 	       doc: /* Table for translating characters while decoding.  */);
12064   Vstandard_translation_table_for_decode = Qnil;
12065 
12066   DEFVAR_LISP ("standard-translation-table-for-encode",
12067 	       Vstandard_translation_table_for_encode,
12068 	       doc: /* Table for translating characters while encoding.  */);
12069   Vstandard_translation_table_for_encode = Qnil;
12070 
12071   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
12072 	       doc: /* Alist of charsets vs revision numbers.
12073 While encoding, if a charset (car part of an element) is found,
12074 designate it with the escape sequence identifying revision (cdr part
12075 of the element).  */);
12076   Vcharset_revision_table = Qnil;
12077 
12078   DEFVAR_LISP ("default-process-coding-system",
12079 	       Vdefault_process_coding_system,
12080 	       doc: /* Cons of coding systems used for process I/O by default.
12081 The car part is used for decoding a process output,
12082 the cdr part is used for encoding a text to be sent to a process.  */);
12083   Vdefault_process_coding_system = Qnil;
12084 
12085   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
12086 	       doc: /*
12087 Table of extra Latin codes in the range 128..159 (inclusive).
12088 This is a vector of length 256.
12089 If Nth element is non-nil, the existence of code N in a file
12090 \(or output of subprocess) doesn't prevent it to be detected as
12091 a coding system of ISO 2022 variant which has a flag
12092 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
12093 or reading output of a subprocess.
12094 Only 128th through 159th elements have a meaning.  */);
12095   Vlatin_extra_code_table = make_nil_vector (256);
12096 
12097   DEFVAR_LISP ("select-safe-coding-system-function",
12098 	       Vselect_safe_coding_system_function,
12099 	       doc: /*
12100 Function to call to select safe coding system for encoding a text.
12101 
12102 If set, this function is called to force a user to select a proper
12103 coding system which can encode the text in the case that a default
12104 coding system used in each operation can't encode the text.  The
12105 function should take care that the buffer is not modified while
12106 the coding system is being selected.
12107 
12108 The default value is `select-safe-coding-system' (which see).  */);
12109   Vselect_safe_coding_system_function = Qnil;
12110 
12111   DEFVAR_BOOL ("coding-system-require-warning",
12112 	       coding_system_require_warning,
12113 	       doc: /* Internal use only.
12114 If non-nil, on writing a file, `select-safe-coding-system-function' is
12115 called even if `coding-system-for-write' is non-nil.  The command
12116 `universal-coding-system-argument' binds this variable to t temporarily.  */);
12117   coding_system_require_warning = 0;
12118 
12119 
12120   DEFVAR_BOOL ("inhibit-iso-escape-detection",
12121 	       inhibit_iso_escape_detection,
12122 	       doc: /*
12123 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
12124 
12125 When Emacs reads text, it tries to detect how the text is encoded.
12126 This code detection is sensitive to escape sequences.  If Emacs sees
12127 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
12128 of the ISO2022 encodings, and decodes text by the corresponding coding
12129 system (e.g. `iso-2022-7bit').
12130 
12131 However, there may be a case that you want to read escape sequences in
12132 a file as is.  In such a case, you can set this variable to non-nil.
12133 Then the code detection will ignore any escape sequences, and no text is
12134 detected as encoded in some ISO-2022 encoding.  The result is that all
12135 escape sequences become visible in a buffer.
12136 
12137 The default value is nil, and it is strongly recommended not to change
12138 it.  That is because many Emacs Lisp source files that contain
12139 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
12140 in Emacs's distribution, and they won't be decoded correctly on
12141 reading if you suppress escape sequence detection.
12142 
12143 The other way to read escape sequences in a file without decoding is
12144 to explicitly specify some coding system that doesn't use ISO-2022
12145 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
12146   inhibit_iso_escape_detection = 0;
12147 
12148   DEFVAR_BOOL ("inhibit-null-byte-detection",
12149 	       inhibit_null_byte_detection,
12150 	       doc: /* If non-nil, Emacs ignores null bytes on code detection.
12151 By default, Emacs treats it as binary data, and does not attempt to
12152 decode it.  The effect is as if you specified `no-conversion' for
12153 reading that text.
12154 
12155 Set this to non-nil when a regular text happens to include null bytes.
12156 Examples are Index nodes of Info files and null-byte delimited output
12157 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
12158 decode text as usual.  */);
12159   inhibit_null_byte_detection = 0;
12160 
12161   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
12162 	       doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
12163 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
12164   disable_ascii_optimization = 0;
12165 
12166   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
12167 	       doc: /* Char table for translating self-inserting characters.
12168 This is applied to the result of input methods, not their input.
12169 See also `keyboard-translate-table'.
12170 
12171 Use of this variable for character code unification was rendered
12172 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
12173 internal character representation.  */);
12174   Vtranslation_table_for_input = Qnil;
12175 
12176   Lisp_Object args[coding_arg_undecided_max];
12177   memclear (args, sizeof args);
12178 
12179   Lisp_Object plist[] =
12180     {
12181       QCname,
12182       args[coding_arg_name] = Qno_conversion,
12183       QCmnemonic,
12184       args[coding_arg_mnemonic] = make_fixnum ('='),
12185       intern_c_string (":coding-type"),
12186       args[coding_arg_coding_type] = Qraw_text,
12187       QCascii_compatible_p,
12188       args[coding_arg_ascii_compatible_p] = Qt,
12189       QCdefault_char,
12190       args[coding_arg_default_char] = make_fixnum (0),
12191       intern_c_string (":for-unibyte"),
12192       args[coding_arg_for_unibyte] = Qt,
12193       intern_c_string (":docstring"),
12194       (build_pure_c_string
12195        ("Do no conversion.\n"
12196 	"\n"
12197 	"When you visit a file with this coding, the file is read into a\n"
12198 	"unibyte buffer as is, thus each byte of a file is treated as a\n"
12199 	"character.")),
12200       intern_c_string (":eol-type"),
12201       args[coding_arg_eol_type] = Qunix,
12202     };
12203   args[coding_arg_plist] = CALLMANY (Flist, plist);
12204   Fdefine_coding_system_internal (coding_arg_max, args);
12205 
12206   plist[1] = args[coding_arg_name] = Qundecided;
12207   plist[3] = args[coding_arg_mnemonic] = make_fixnum ('-');
12208   plist[5] = args[coding_arg_coding_type] = Qundecided;
12209   /* This is already set.
12210      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
12211   plist[8] = intern_c_string (":charset-list");
12212   plist[9] = args[coding_arg_charset_list] = list1 (Qascii);
12213   plist[11] = args[coding_arg_for_unibyte] = Qnil;
12214   plist[13] = build_pure_c_string ("No conversion on encoding, "
12215 				   "automatic conversion on decoding.");
12216   plist[15] = args[coding_arg_eol_type] = Qnil;
12217   args[coding_arg_plist] = CALLMANY (Flist, plist);
12218   args[coding_arg_undecided_inhibit_null_byte_detection] = make_fixnum (0);
12219   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_fixnum (0);
12220   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
12221 
12222   setup_coding_system (Qno_conversion, &safe_terminal_coding);
12223 
12224   for (int i = 0; i < coding_category_max; i++)
12225     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
12226 
12227   pdumper_do_now_and_after_load (reset_coding_after_pdumper_load);
12228 }
12229 
12230 static void
reset_coding_after_pdumper_load(void)12231 reset_coding_after_pdumper_load (void)
12232 {
12233   if (!dumped_with_pdumper_p ())
12234     return;
12235   for (struct coding_system *this = &coding_categories[0];
12236        this < &coding_categories[coding_category_max];
12237        ++this)
12238     {
12239       int id = this->id;
12240       if (id >= 0)
12241         {
12242           /* Need to rebuild the coding system object because we
12243              persisted it as a scalar and it's full of gunk that's now
12244              invalid.  */
12245           memset (this, 0, sizeof (*this));
12246           setup_coding_system (CODING_ID_NAME (id), this);
12247         }
12248     }
12249   /* In temacs the below is done by mule-conf.el, because we need to
12250      define us-ascii first.  But in dumped Emacs us-ascii is restored
12251      by the above loop, and mule-conf.el will not be loaded, so we set
12252      it up now; otherwise safe_terminal_coding will remain zeroed.  */
12253   Fset_safe_terminal_coding_system_internal (Qus_ascii);
12254 }
12255