1%%	options
2
3copyright owner	=	Dirk Krause
4copyright year	=	2015-xxxx
5SPDX-License-Identifier:	BSD-3-Clause
6
7
8
9%%	header
10
11/**	@file
12	File and memory encodings for characters.
13
14	CRT on Windows: Optional, disabling CRT degrades performance.
15*/
16
17#ifndef DK4CONF_H_INCLUDED
18#if DK4_BUILDING_DKTOOLS4
19#include "dk4conf.h"
20#else
21#include <dktools-4/dk4conf.h>
22#endif
23#endif
24
25#ifndef DK4TYPES_H_INCLUDED
26#if DK4_BUILDING_DKTOOLS4
27#include <libdk4base/dk4types.h>
28#else
29#include <dktools-4/dk4types.h>
30#endif
31#endif
32
33#ifndef DK4ERROR_H_INCLUDED
34#if DK4_BUILDING_DKTOOLS4
35#include <libdk4base/dk4error.h>
36#else
37#include <dktools-4/dk4error.h>
38#endif
39#endif
40
41
42
/**	Encodings a text file can be stored in.
	The values are consecutive integers starting at 0.
*/
enum {
  /**	One byte per character: bytes 0x00 to 0xFF represent
	U+0000 to U+00FF.
	Formerly (misleadingly) named "ASCII".
  */
  DK4_FILE_ENCODING_PLAIN	= 0,

  /**	Windows-1252, the encoding used in Windows GUI programs.
  */
  DK4_FILE_ENCODING_WIN1252	= 1,

  /**	UTF-8, the encoding used on Linux and Unix.
  */
  DK4_FILE_ENCODING_UTF8	= 2,

  /**	UTF-16, least significant byte first (little endian).
  */
  DK4_FILE_ENCODING_UTF16_LE	= 3,

  /**	UTF-16, most significant byte first (big endian).
  */
  DK4_FILE_ENCODING_UTF16_BE	= 4,

  /**	32-bit Unicode characters, least significant byte first
	(little endian).
  */
  DK4_FILE_ENCODING_32_LE	= 5,

  /**	32-bit Unicode characters, most significant byte first
	(big endian).
  */
  DK4_FILE_ENCODING_32_BE	= 6
};
81
82/**	Encoding for characters in memory.
83*/
84enum {
85			/**	Bytes 0x00 to 0xFF represent U+0000 to U+00FF.
86			*/
87  DK4_ENCODING_PLAIN	=	DK4_FILE_ENCODING_PLAIN,
88
89			/**	Encoding used by Windows GUI programs.
90			*/
91  DK4_ENCODING_WIN1252	=	DK4_FILE_ENCODING_WIN1252,
92
93			/**	UTF-8 encoding used on Linux and Unix.
94			*/
95  DK4_ENCODING_UTF8	=	DK4_FILE_ENCODING_UTF8,
96
97#if DK4_WORDS_BIGENDIAN
98
99			/**	UTF-16 encoding.
100			*/
101  DK4_ENCODING_UTF16	=	DK4_FILE_ENCODING_UTF16_BE ,
102
103			/**	32-bit unicode characters.
104			*/
105  DK4_ENCODING_32	=	DK4_FILE_ENCODING_32_BE
106
107#else
108
109			/**	UTF-16 encoding.
110			*/
111  DK4_ENCODING_UTF16	=	DK4_FILE_ENCODING_UTF16_LE ,
112
113			/**	32-bit unicode characters.
114  			*/
115  DK4_ENCODING_32	=	DK4_FILE_ENCODING_32_LE
116
117#endif
118
119};
120
121
122
123#ifdef __cplusplus
124extern "C" {
125#endif
126
127/**	Find encoding by name.
128	@param	encptr	Pointer to result variable for encoding.
129	@param	bomptr	Pointer to result variable for BOM writing.
130	@param	src	Source text containing encoding name.
131	@param	erp	Error report, may be NULL.
132	@return	1 on success, 0 on error.
133
134	Error codes:
135	- DK4_E_INVALID_ARGUMENTS<br>
136	  if encptr or src is NULL,
137	- DK4_E_BUFFER_TOO_SMALL<br>
138	  if the text is too long to create a local copy for modification, or
139	- DK4_E_SYNTAX<br>
140	  if invalid encoding, options or encoding/option combinations are
141	  specified.
142*/
143int
144dk4enc_find(int *encptr, int *bomptr, const dkChar *src, dk4_er_t *erp);
145
146#ifdef __cplusplus
147}
148#endif
149
150
151
152%%	module
153
154#include "dk4conf.h"
155#include <libdk4c/dk4enc.h>
156#include <libdk4base/dk4mem.h>
157#include <libdk4base/dk4strd.h>
158
159#if	DK4_HAVE_ASSERT_H
160#ifndef	ASSERT_H_INCLUDED
161#include <assert.h>
162#define	ASSERT_H_INCLUDED 1
163#endif
164#endif
165
166
167/**	Encoding names in variations.
168*/
169static const dkChar * const dk4enc_encoding_names[] = {
170$!string-table macro=dkT
171#
172#	0  ...  1	ASCII
173#
174plain
175ascii
176#
177#	2		ANSI, used on Windows systems
178#
179ansi
180#
181#	3  ...  4	UTF-8
182#
183utf-8
184utf8
185#
186#	5  ...  6	UTF-16, systems native endianness
187#
188utf-16
189utf16
190#
191#	7  ... 12	UTF-16LE
192#
193utf-16-le
194utf-16le
195utf16le
196utf-16-lsb
197utf-16lsb
198utf16lsb
199#
200#	13 ... 18	UTF-16BE
201#
202utf-16-be
203utf-16be
204utf16be
205utf-16-msb
206utf-16msb
207utf16msb
208#
209#	19		32 bit in systems native endianness
210#
211c32
212#
213#	20 ... 23	32 bit little endian
214#
215c32-le
216c32le
217c32-lsb
218c32lsb
219#
220#	24 ... 27	32 bit big endian
221#
222c32-be
223c32be
224c32-msb
225c32msb
226#
227#	28 ... 34	Backward compatibility
228#
229utf-16.msb
230utf-16.lsb
231uc32
232uc32.msb
233uc32.lsb
234iso-latin-1
235iso-8859-1
236#
237#	35 ... 36
238#
239win1252
240cp1252
241#
242#
243#
244$!end
245};
246
247
248
249/**	Keywords for further options.
250*/
251static const dkChar * const dk4enc_option_keywords[] = {
252$!string-table macro=dkT
253le
254lsb
255be
256msb
257bom
258nobom
259$!end
260};
261
262
263
264int
265dk4enc_find(int *encptr, int *bomptr, const dkChar *src, dk4_er_t *erp)
266{
267  dkChar	buf[64];		/* Private copy for modification */
268  dkChar	*p1;			/* Start of text */
269  dkChar	*p2;			/* Start of options */
270  int		res	=	0;	/* Array index */
271  int		back	=	0;	/* Function result */
272  int		ae	=	0;	/* Flag: Allow ending specification */
273  int		enc	=	0;	/* Encoding found */
274  int		bom	=	0;	/* Flag: BOM keyword found */
275  int		bom_f	=	0;	/* Flag: BOM information found */
276#if	DK4_USE_ASSERT
277  assert(NULL != encptr);
278  assert(NULL != src);
279#endif
280  if ((NULL != encptr) && (NULL != src)) {
281    if (0 != dk4str_cpy_s(buf, DK4_SIZEOF(buf,dkChar), src, erp)) {
282      p1 = dk4str_start(buf, NULL);
283      if (NULL != p1) {
284        p2 = dk4str_chr(buf, dkT(','));
285	if (NULL == p2) {
286	  p2 = dk4str_chr(buf, dkT('.'));
287	}
288	if (NULL != p2) { *(p2++) = dkT('\0'); p2 = dk4str_start(p2, NULL); }
289	dk4str_normalize(p1, NULL);
290	switch (dk4str_array_index(dk4enc_encoding_names, p1, 0)) {
291	  case 0: case 1: case 33: case 34: {
292	    enc = DK4_FILE_ENCODING_PLAIN;
293	    back = 1;
294	  } break;
295	  case 2: case 35: case 36: {
296	    enc = DK4_FILE_ENCODING_WIN1252;
297	    back = 1;
298	  } break;
299	  case 3: case 4: {
300	    enc = DK4_FILE_ENCODING_UTF8;
301	    back = 1;
302	  } break;
303	  case 5: case 6: {
304	    enc = DK4_ENCODING_UTF16;
305	    ae  = 1;
306	    back = 1;
307	    bom = 1;
308	  } break;
309	  case 7: case 8: case 9: case 10: case 11: case 12: case 29: {
310	    enc = DK4_FILE_ENCODING_UTF16_LE;
311	    back = 1;
312	    bom = 1;
313	  } break;
314	  case 13: case 14: case 15: case 16: case 17: case 18: case 28: {
315	    enc = DK4_FILE_ENCODING_UTF16_BE;
316	    back = 1;
317	    bom = 1;
318	  } break;
319	  case 19: case 30: {
320	    enc = DK4_ENCODING_32;
321	    ae  = 1;
322	    back = 1;
323	    bom = 1;
324	  } break;
325	  case 20: case 21: case 22: case 23: case 32: {
326	    enc = DK4_FILE_ENCODING_32_LE;
327	    back = 1;
328	    bom = 1;
329	  } break;
330	  case 24: case 25: case 26: case 27: case 31: {
331	    enc = DK4_FILE_ENCODING_32_BE;
332	    back = 1;
333	    bom = 1;
334	  } break;
335	  default: {
336	    dk4error_set_simple_error_code(erp, DK4_E_SYNTAX);
337	  } break;
338	}
339	if (1 == back) {
340	  while (NULL != p2) {
341	    p1 = dk4str_chr(p2, dkT(','));
342	    if (NULL != p1) { *(p1++) = dkT('\0'); p1 = dk4str_start(p1,NULL); }
343	    dk4str_normalize(p2, NULL);
344	    switch (res = dk4str_array_index(dk4enc_option_keywords, p2, 0)) {
345	      case 0: case 1: case 2: case 3: {
346	        if (0 != ae) {
347		  switch (enc) {
348		    case DK4_ENCODING_UTF16: {
349		      enc = (
350		        ((2 == res) || (3 == res))
351			? DK4_FILE_ENCODING_UTF16_BE
352			: DK4_FILE_ENCODING_UTF16_LE
353		      );
354		    } break;
355		    case DK4_ENCODING_32: {
356		      enc = (
357		        ((2 == res) || (3 == res))
358			? DK4_FILE_ENCODING_32_BE
359			: DK4_FILE_ENCODING_32_LE
360		      );
361		    } break;
362		  }
363		  ae = 0;
364		} else {
365		  back = 0;
366		  dk4error_set_simple_error_code(erp, DK4_E_SYNTAX);
367		}
368	      } break;
369	      case 4: {
370	        switch (enc) {
371		  case DK4_FILE_ENCODING_UTF8:
372		  case DK4_FILE_ENCODING_UTF16_LE:
373		  case DK4_FILE_ENCODING_UTF16_BE:
374		  case DK4_FILE_ENCODING_32_LE:
375		  case DK4_FILE_ENCODING_32_BE: {
376		    bom = 1;
377		    bom_f = 1;
378		  } break;
379		  default: {
380		    back = 0;
381		    dk4error_set_simple_error_code(erp, DK4_E_SYNTAX);
382		  } break;
383		}
384	      } break;
385	      case 5: {
386	        bom = 0;
387		bom_f = 1;
388	      } break;
389	      default: {
390	        back = 0;
391		dk4error_set_simple_error_code(erp, DK4_E_SYNTAX);
392	      } break;
393	    }
394	    p2 = p1;
395	  }
396	}
397      } else {
398        /* ERROR: Empty string */
399	dk4error_set_simple_error_code(erp, DK4_E_SYNTAX);
400      }
401    } else {
402      dk4error_set_simple_error_code(erp, DK4_E_BUFFER_TOO_SMALL);
403    }
404  } else {
405    dk4error_set_simple_error_code(erp, DK4_E_INVALID_ARGUMENTS);
406  }
407  if (NULL != encptr) { *encptr = enc; }
408  if (0 != bom_f) { if (NULL != bomptr) { *bomptr = bom; } }
409  return back;
410}
411
412
413