1 /*
2  *      encodings.c - this file is part of Geany, a fast and lightweight IDE
3  *
4  *      Copyright 2005 The Geany contributors
5  *
6  *      This program is free software; you can redistribute it and/or modify
7  *      it under the terms of the GNU General Public License as published by
8  *      the Free Software Foundation; either version 2 of the License, or
9  *      (at your option) any later version.
10  *
11  *      This program is distributed in the hope that it will be useful,
12  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *      GNU General Public License for more details.
15  *
16  *      You should have received a copy of the GNU General Public License along
17  *      with this program; if not, write to the Free Software Foundation, Inc.,
18  *      51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20 
21 /*
22  * Encoding conversion and Byte Order Mark (BOM) handling.
23  */
24 
25 /*
26  * Modified by the gedit Team, 2002. See the gedit AUTHORS file for a
27  * list of people on the gedit Team.
28  * See the gedit ChangeLog files for a list of changes.
29  */
30  /* Stolen from anjuta */
31 
32 #ifdef HAVE_CONFIG_H
33 # include "config.h"
34 #endif
35 
36 #include "encodings.h"
37 #include "encodingsprivate.h"
38 
39 #include "app.h"
40 #include "callbacks.h"
41 #include "documentprivate.h"
42 #include "support.h"
43 #include "ui_utils.h"
44 #include "utils.h"
45 
46 #include <string.h>
47 
48 
49 /* <meta http-equiv="content-type" content="text/html; charset=UTF-8" /> */
#define PATTERN_HTMLMETA "<meta\\s+http-equiv\\s*=\\s*\"?content-type\"?\\s+content\\s*=\\s*\"text/x?html;\\s*charset=([a-z0-9_-]+)\"\\s*/?>"
/* Matches an in-file coding declaration such as " geany_encoding=utf-8 " or " coding: utf-8 ";
 * like PATTERN_HTMLMETA, capture group 1 holds the charset name. */
#define PATTERN_CODING "coding[\t ]*[:=][\t ]*\"?([a-z0-9-]+)\"?[\t ]*"

/* precompiled regexps: filled by encodings_init() from the PATTERN_* macros above and
 * released by encodings_finalize() */
static GRegex *pregs[2];
/* TRUE once encodings_init() has filled pregs[] */
static gboolean pregs_loaded = FALSE;


/* table of all known encodings, filled by init_encodings(); an entry's array position
 * equals its idx member */
GeanyEncoding encodings[GEANY_ENCODINGS_MAX];
60 
61 
/* Fills the encodings[] slot selected by Idx with its position inside the group (Order),
 * the group, the charset string and the display name.
 * The body is wrapped in do { } while (0) so that "fill(...);" is exactly one statement
 * and remains correct inside unbraced if/else bodies. Arguments are parenthesised.
 * Idx is expanded more than once, so pass a side-effect free expression. */
#define fill(Order, Group, Idx, Charset, Name) \
		do { \
			encodings[(Idx)].idx = (Idx); \
			encodings[(Idx)].order = (Order); \
			encodings[(Idx)].group = (Group); \
			encodings[(Idx)].charset = (Charset); \
			encodings[(Idx)].name = (Name); \
		} while (0)
68 
init_encodings(void)69 static void init_encodings(void)
70 {
71 	fill(0,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_14,		"ISO-8859-14",		_("Celtic"));
72 	fill(1,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_7,		"ISO-8859-7",		_("Greek"));
73 	fill(2,		WESTEUROPEAN,	GEANY_ENCODING_WINDOWS_1253,	"WINDOWS-1253",		_("Greek"));
74 	fill(3,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_10,		"ISO-8859-10",		_("Nordic"));
75 	fill(4,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_3,		"ISO-8859-3",		_("South European"));
76 	fill(5,		WESTEUROPEAN,	GEANY_ENCODING_IBM_850,			"IBM850",			_("Western"));
77 	fill(6,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_1,		"ISO-8859-1",		_("Western"));
78 	fill(7,		WESTEUROPEAN,	GEANY_ENCODING_ISO_8859_15,		"ISO-8859-15",		_("Western"));
79 	fill(8,		WESTEUROPEAN,	GEANY_ENCODING_WINDOWS_1252,	"WINDOWS-1252",		_("Western"));
80 
81 	fill(0,		EASTEUROPEAN,	GEANY_ENCODING_ISO_8859_4,		"ISO-8859-4",		_("Baltic"));
82 	fill(1,		EASTEUROPEAN,	GEANY_ENCODING_ISO_8859_13,		"ISO-8859-13",		_("Baltic"));
83 	fill(2,		EASTEUROPEAN,	GEANY_ENCODING_WINDOWS_1257,	"WINDOWS-1257",		_("Baltic"));
84 	fill(3,		EASTEUROPEAN,	GEANY_ENCODING_IBM_852,			"IBM852",			_("Central European"));
85 	fill(4,		EASTEUROPEAN,	GEANY_ENCODING_ISO_8859_2,		"ISO-8859-2",		_("Central European"));
86 	fill(5,		EASTEUROPEAN,	GEANY_ENCODING_WINDOWS_1250,	"WINDOWS-1250",		_("Central European"));
87 	fill(6,		EASTEUROPEAN,	GEANY_ENCODING_IBM_855,			"IBM855",			_("Cyrillic"));
88 	fill(7,		EASTEUROPEAN,	GEANY_ENCODING_ISO_8859_5,		"ISO-8859-5",		_("Cyrillic"));
89 	/* ISO-IR-111 not available on Windows */
90 	fill(8,		EASTEUROPEAN,	GEANY_ENCODING_ISO_IR_111,		"ISO-IR-111",		_("Cyrillic"));
91 	fill(9,		EASTEUROPEAN,	GEANY_ENCODING_KOI8_R,			"KOI8-R",			_("Cyrillic"));
92 	fill(10,	EASTEUROPEAN,	GEANY_ENCODING_WINDOWS_1251,	"WINDOWS-1251",		_("Cyrillic"));
93 	fill(11,	EASTEUROPEAN,	GEANY_ENCODING_CP_866,			"CP866",			_("Cyrillic/Russian"));
94 	fill(12,	EASTEUROPEAN,	GEANY_ENCODING_KOI8_U,			"KOI8-U",			_("Cyrillic/Ukrainian"));
95 	fill(13,	EASTEUROPEAN,	GEANY_ENCODING_ISO_8859_16,		"ISO-8859-16",		_("Romanian"));
96 
97 	fill(0,		MIDDLEEASTERN,	GEANY_ENCODING_IBM_864,			"IBM864",			_("Arabic"));
98 	fill(1,		MIDDLEEASTERN,	GEANY_ENCODING_ISO_8859_6,		"ISO-8859-6",		_("Arabic"));
99 	fill(2,		MIDDLEEASTERN,	GEANY_ENCODING_WINDOWS_1256,	"WINDOWS-1256",		_("Arabic"));
100 	fill(3,		MIDDLEEASTERN,	GEANY_ENCODING_IBM_862,			"IBM862",			_("Hebrew"));
101 	/* not available at all, ? */
102 	fill(4,		MIDDLEEASTERN,	GEANY_ENCODING_ISO_8859_8_I,	"ISO-8859-8-I",		_("Hebrew"));
103 	fill(5,		MIDDLEEASTERN,	GEANY_ENCODING_WINDOWS_1255,	"WINDOWS-1255",		_("Hebrew"));
104 	fill(6,		MIDDLEEASTERN,	GEANY_ENCODING_ISO_8859_8,		"ISO-8859-8",		_("Hebrew Visual"));
105 
106 	fill(0,		ASIAN,			GEANY_ENCODING_ARMSCII_8,		"ARMSCII-8",		_("Armenian"));
107 	fill(1,		ASIAN,			GEANY_ENCODING_GEOSTD8,			"GEORGIAN-ACADEMY",	_("Georgian"));
108 	fill(2,		ASIAN,			GEANY_ENCODING_TIS_620,			"TIS-620",			_("Thai"));
109 	fill(3,		ASIAN,			GEANY_ENCODING_IBM_857,			"IBM857",			_("Turkish"));
110 	fill(4,		ASIAN,			GEANY_ENCODING_WINDOWS_1254,	"WINDOWS-1254",		_("Turkish"));
111 	fill(5,		ASIAN,			GEANY_ENCODING_ISO_8859_9,		"ISO-8859-9",		_("Turkish"));
112 	fill(6,		ASIAN,			GEANY_ENCODING_TCVN,			"TCVN",				_("Vietnamese"));
113 	fill(7,		ASIAN,			GEANY_ENCODING_VISCII,			"VISCII",			_("Vietnamese"));
114 	fill(8,		ASIAN,			GEANY_ENCODING_WINDOWS_1258,	"WINDOWS-1258",		_("Vietnamese"));
115 
116 	fill(0,		UNICODE,		GEANY_ENCODING_UTF_7,			"UTF-7",			_("Unicode"));
117 	fill(1,		UNICODE,		GEANY_ENCODING_UTF_8,			"UTF-8",			_("Unicode"));
118 	fill(2,		UNICODE,		GEANY_ENCODING_UTF_16LE,		"UTF-16LE",			_("Unicode"));
119 	fill(3,		UNICODE,		GEANY_ENCODING_UTF_16BE,		"UTF-16BE",			_("Unicode"));
120 	fill(4,		UNICODE,		GEANY_ENCODING_UCS_2LE,			"UCS-2LE",			_("Unicode"));
121 	fill(5,		UNICODE,		GEANY_ENCODING_UCS_2BE,			"UCS-2BE",			_("Unicode"));
122 	fill(6,		UNICODE,		GEANY_ENCODING_UTF_32LE,		"UTF-32LE",			_("Unicode"));
123 	fill(7,		UNICODE,		GEANY_ENCODING_UTF_32BE,		"UTF-32BE",			_("Unicode"));
124 
125 	fill(0,		EASTASIAN,		GEANY_ENCODING_GB18030,			"GB18030",			_("Chinese Simplified"));
126 	fill(1,		EASTASIAN,		GEANY_ENCODING_GB2312,			"GB2312",			_("Chinese Simplified"));
127 	fill(2,		EASTASIAN,		GEANY_ENCODING_GBK,				"GBK",				_("Chinese Simplified"));
128 	/* maybe not available on Linux */
129 	fill(3,		EASTASIAN,		GEANY_ENCODING_HZ,				"HZ",				_("Chinese Simplified"));
130 	fill(4,		EASTASIAN,		GEANY_ENCODING_BIG5,			"BIG5",				_("Chinese Traditional"));
131 	fill(5,		EASTASIAN,		GEANY_ENCODING_BIG5_HKSCS,		"BIG5-HKSCS",		_("Chinese Traditional"));
132 	fill(6,		EASTASIAN,		GEANY_ENCODING_EUC_TW,			"EUC-TW",			_("Chinese Traditional"));
133 	fill(7,		EASTASIAN,		GEANY_ENCODING_EUC_JP,			"EUC-JP",			_("Japanese"));
134 	fill(8,		EASTASIAN,		GEANY_ENCODING_ISO_2022_JP,		"ISO-2022-JP",		_("Japanese"));
135 	fill(9,		EASTASIAN,		GEANY_ENCODING_SHIFT_JIS,		"SHIFT_JIS",		_("Japanese"));
136 	fill(10,	EASTASIAN,		GEANY_ENCODING_CP_932,			"CP932",			_("Japanese"));
137 	fill(11,	EASTASIAN,		GEANY_ENCODING_EUC_KR,			"EUC-KR",			_("Korean"));
138 	fill(12,	EASTASIAN,		GEANY_ENCODING_ISO_2022_KR,		"ISO-2022-KR",		_("Korean"));
139 	fill(13,	EASTASIAN,		GEANY_ENCODING_JOHAB,			"JOHAB",			_("Korean"));
140 	fill(14,	EASTASIAN,		GEANY_ENCODING_UHC,				"UHC",				_("Korean"));
141 
142 	fill(0,		NONE,			GEANY_ENCODING_NONE,			"None",				_("Without encoding"));
143 }
144 
145 
146 /* compares two encoding names in a permissive fashion.
147  * e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
encodings_charset_equals(const gchar * a,const gchar * b)148 static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
149 {
150 	gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
151 	gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
152 
153 	while (*a && *b)
154 	{
155 		gboolean is_alpha;
156 
157 		if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
158 			((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
159 		{
160 			/* either there was a real separator, or we need a implicit one (a chage from alpha to
161 			 * numeric or so) */
162 			if (! need_sep || (was_alpha != is_alpha))
163 			{
164 				a++;
165 				b++;
166 				was_alpha = is_alpha;
167 				need_sep = FALSE;
168 			}
169 			else
170 				return FALSE;
171 		}
172 		else
173 		{
174 			guint n_sep = 0;
175 
176 			if (! g_ascii_isalnum(*a))
177 			{
178 				a++;
179 				n_sep++;
180 			}
181 			if (! g_ascii_isalnum(*b))
182 			{
183 				b++;
184 				n_sep++;
185 			}
186 			if (n_sep < 1)
187 				return FALSE;
188 			else if (n_sep < 2)
189 				need_sep = TRUE;
190 		}
191 	}
192 	return *a == *b;
193 }
194 
195 
encodings_get_idx_from_charset(const gchar * charset)196 GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
197 {
198 	gint i;
199 
200 	if (charset == NULL)
201 		return GEANY_ENCODING_UTF_8;
202 
203 	i = 0;
204 	while (i < GEANY_ENCODINGS_MAX)
205 	{
206 		if (encodings_charset_equals(charset, encodings[i].charset))
207 			return i;
208 
209 		++i;
210 	}
211 	return GEANY_ENCODING_UTF_8;
212 }
213 
214 
encodings_get_from_charset(const gchar * charset)215 const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
216 {
217 	gint i;
218 
219 	if (charset == NULL)
220 		return &encodings[GEANY_ENCODING_UTF_8];
221 
222 	i = 0;
223 	while (i < GEANY_ENCODINGS_MAX)
224 	{
225 		if (encodings_charset_equals(charset, encodings[i].charset))
226 			return &encodings[i];
227 
228 		++i;
229 	}
230 
231 	return NULL;
232 }
233 
234 
encodings_normalize_charset(const gchar * charset)235 static const gchar *encodings_normalize_charset(const gchar *charset)
236 {
237 	const GeanyEncoding *encoding;
238 
239 	encoding = encodings_get_from_charset(charset);
240 	if (encoding != NULL)
241 		return encoding->charset;
242 
243 	return NULL;
244 }
245 
246 
encodings_get_from_index(gint idx)247 const GeanyEncoding *encodings_get_from_index(gint idx)
248 {
249 	g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
250 
251 	return &encodings[idx];
252 }
253 
254 
255 /**
256  *  Gets the character set name of the specified index e.g. for use with
257  *  @ref document_set_encoding().
258  *
259  *  @param idx @ref GeanyEncodingIndex to retrieve the corresponding character set.
260  *
261  *
262  *  @return @nullable The charset according to idx, or @c NULL if the index is invalid.
263  *
264  *  @since 0.13
265  **/
266 GEANY_API_SYMBOL
encodings_get_charset_from_index(gint idx)267 const gchar* encodings_get_charset_from_index(gint idx)
268 {
269 	g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
270 
271 	return encodings[idx].charset;
272 }
273 
274 
encodings_to_string(const GeanyEncoding * enc)275 gchar *encodings_to_string(const GeanyEncoding* enc)
276 {
277 	g_return_val_if_fail(enc != NULL, NULL);
278 	g_return_val_if_fail(enc->name != NULL, NULL);
279 	g_return_val_if_fail(enc->charset != NULL, NULL);
280 
281 	return g_strdup_printf("%s (%s)", enc->name, enc->charset);
282 }
283 
284 
encodings_get_charset(const GeanyEncoding * enc)285 const gchar *encodings_get_charset(const GeanyEncoding* enc)
286 {
287 	g_return_val_if_fail(enc != NULL, NULL);
288 	g_return_val_if_fail(enc->charset != NULL, NULL);
289 
290 	return enc->charset;
291 }
292 
293 
294 static GtkWidget *radio_items[GEANY_ENCODINGS_MAX];
295 
296 
encodings_select_radio_item(const gchar * charset)297 void encodings_select_radio_item(const gchar *charset)
298 {
299 	gint i;
300 
301 	g_return_if_fail(charset != NULL);
302 
303 	i = 0;
304 	while (i < GEANY_ENCODINGS_MAX)
305 	{
306 		if (utils_str_equal(charset, encodings[i].charset))
307 			break;
308 		i++;
309 	}
310 	if (i == GEANY_ENCODINGS_MAX)
311 		i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */
312 
313 	/* ignore_callback has to be set by the caller */
314 	gtk_check_menu_item_set_active(GTK_CHECK_MENU_ITEM(radio_items[i]), TRUE);
315 }
316 
317 
318 /* Regexp detection of file encoding declared in the file itself.
319  * Idea and parts of code taken from Bluefish, thanks.
320  * regex_compile() is used to compile regular expressions on program init and keep it in memory
321  * for faster access when opening a file. Pre-compiled regexps will be freed on program exit.
322  */
regex_compile(const gchar * pattern)323 static GRegex *regex_compile(const gchar *pattern)
324 {
325 	GError *error = NULL;
326 	GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error);
327 
328 	if (!regex)
329 	{
330 		geany_debug("Failed to compile encoding regex (%s)", error->message);
331 		g_error_free(error);
332 	}
333 	return regex;
334 }
335 
336 
/* Searches the beginning of buffer for an encoding declaration using preg.
 *
 * preg:   a regexp from pregs[]; may be NULL if its compilation failed
 * buffer: the data to scan, may be NULL
 * size:   number of valid bytes in buffer
 *
 * Returns the text of capture group 1 converted to upper case as a newly allocated
 * string (free with g_free()), or NULL if nothing matched or the regexps are not
 * available. */
static gchar *regex_match(GRegex *preg, const gchar *buffer, gsize size)
{
	gchar *encoding = NULL;
	/* must start as NULL: g_regex_match_full() leaves it untouched when it bails out
	 * early, and it is freed unconditionally below */
	GMatchInfo *minfo = NULL;

	/* preg is NULL when regex_compile() failed, pregs_loaded is set regardless */
	if (G_UNLIKELY(! pregs_loaded || preg == NULL || buffer == NULL))
		return NULL;

	/* scan only the first 512 bytes in the buffer */
	size = MIN(size, 512);

	if (g_regex_match_full(preg, buffer, size, 0, 0, &minfo, NULL) &&
		g_match_info_get_match_count(minfo) >= 2)
	{
		encoding = g_match_info_fetch(minfo, 1);
		geany_debug("Detected encoding by regex search: %s", encoding);

		SETPTR(encoding, g_utf8_strup(encoding, -1));
	}
	/* the match info is created even if nothing matched; freeing NULL is a no-op */
	g_match_info_free(minfo);
	return encoding;
}
359 
360 
/* "activate" handler of the "Set Encoding" radio items; user_data is the charset string
 * of the item. Changes the encoding of the current document and records the previous one
 * for undo. Does nothing while callbacks are ignored, when the item was deactivated or
 * when the document already uses that charset; read-only documents only get a beep. */
static void encodings_radio_item_change_cb(GtkCheckMenuItem *menuitem, gpointer user_data)
{
	GeanyDocument *doc = document_get_current();
	const gchar *charset = user_data;

	if (ignore_callback || doc == NULL || charset == NULL)
		return;
	/* radio items also emit the signal when they get deselected */
	if (! gtk_check_menu_item_get_active(menuitem))
		return;
	if (utils_str_equal(charset, doc->encoding))
		return;

	if (doc->readonly)
	{
		utils_beep();
		return;
	}

	document_undo_add(doc, UNDO_ENCODING, g_strdup(doc->encoding));
	document_set_encoding(doc, charset);
}
380 
/* "activate" handler of the "Reload As" menu items; user_data is the charset string of
 * the item, which is handed on to document_reload_prompt() for the current document. */
static void encodings_reload_radio_item_change_cb(GtkMenuItem *menuitem, gpointer user_data)
{
	GeanyDocument *doc;

	doc = document_get_current();
	g_return_if_fail(doc != NULL);

	document_reload_prompt(doc, user_data);
}
389 
390 
encodings_finalize(void)391 void encodings_finalize(void)
392 {
393 	if (pregs_loaded)
394 	{
395 		guint i, len;
396 		len = G_N_ELEMENTS(pregs);
397 		for (i = 0; i < len; i++)
398 		{
399 			g_regex_unref(pregs[i]);
400 		}
401 	}
402 }
403 
404 
encodings_init(void)405 void encodings_init(void)
406 {
407 	GtkWidget *menu[2];
408 	GCallback cb_func[2];
409 	gint group_sizes[GEANY_ENCODING_GROUPS_MAX] = { 0 };
410 	const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] =
411 	{
412 		[NONE]			= NULL,
413 		[WESTEUROPEAN]	= N_("_West European"),
414 		[EASTEUROPEAN]	= N_("_East European"),
415 		[EASTASIAN]		= N_("East _Asian"),
416 		[ASIAN]			= N_("_SE & SW Asian"),
417 		[MIDDLEEASTERN]	= N_("_Middle Eastern"),
418 		[UNICODE]		= N_("_Unicode"),
419 	};
420 
421 	init_encodings();
422 
423 	if (! pregs_loaded)
424 	{
425 		pregs[0] = regex_compile(PATTERN_HTMLMETA);
426 		pregs[1] = regex_compile(PATTERN_CODING);
427 		pregs_loaded = TRUE;
428 	}
429 
430 	/* create encodings submenu in document menu */
431 	menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu");
432 	menu[1] = ui_lookup_widget(main_widgets.window, "menu_reload_as1_menu");
433 	cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb);
434 	cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb);
435 
436 	for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
437 		group_sizes[encodings[i].group]++;
438 
439 	for (guint k = 0; k < 2; k++)
440 	{
441 		GSList *group = NULL;
442 		GtkWidget *submenus[GEANY_ENCODING_GROUPS_MAX];
443 		gint orders[GEANY_ENCODING_GROUPS_MAX] = { 0 };
444 		guint n_added = 0;
445 
446 		for (guint i = 0; i < GEANY_ENCODING_GROUPS_MAX; i++)
447 		{
448 			if (! groups[i]) /* NONE */
449 				submenus[i] = menu[k];
450 			else
451 			{
452 				GtkWidget *item = gtk_menu_item_new_with_mnemonic(_(groups[i]));
453 				submenus[i] = gtk_menu_new();
454 				gtk_menu_item_set_submenu(GTK_MENU_ITEM(item), submenus[i]);
455 				gtk_container_add(GTK_CONTAINER(menu[k]), item);
456 				gtk_widget_show_all(item);
457 			}
458 		}
459 
460 		/** TODO can it be optimized? ATM 882 runs at line "if (encodings[i].order ...)" */
461 		do
462 		{
463 			for (guint i = 0; i < G_N_ELEMENTS(encodings); i++)
464 			{
465 				if (encodings[i].order == orders[encodings[i].group])
466 				{
467 					GtkWidget *item;
468 					gchar *label = encodings_to_string(&encodings[i]);
469 
470 					if (k == 0) /* Set Encoding menu */
471 					{
472 						item = gtk_radio_menu_item_new_with_label(group, label);
473 						group = gtk_radio_menu_item_get_group(GTK_RADIO_MENU_ITEM(item));
474 						radio_items[i] = item;
475 					}
476 					else
477 						item = gtk_menu_item_new_with_label(label);
478 					gtk_widget_show(item);
479 					gtk_container_add(GTK_CONTAINER(submenus[encodings[i].group]), item);
480 					g_signal_connect(item, "activate", cb_func[k],
481 							(gpointer) encodings[i].charset);
482 					g_free(label);
483 
484 					orders[encodings[i].group]++;
485 					n_added++;
486 				}
487 			}
488 		}
489 		while (n_added < G_N_ELEMENTS(encodings));
490 	}
491 }
492 
493 
encoding_combo_store_sort_func(GtkTreeModel * model,GtkTreeIter * a,GtkTreeIter * b,gpointer data)494 static gint encoding_combo_store_sort_func(GtkTreeModel *model,
495 										   GtkTreeIter *a,
496 										   GtkTreeIter *b,
497 										   gpointer data)
498 {
499 	gboolean a_has_child = gtk_tree_model_iter_has_child(model, a);
500 	gboolean b_has_child = gtk_tree_model_iter_has_child(model, b);
501 	gchar *a_string;
502 	gchar *b_string;
503 	gint cmp_res;
504 
505 	if (a_has_child != b_has_child)
506 		return a_has_child ? -1 : 1;
507 
508 	gtk_tree_model_get(model, a, 1, &a_string, -1);
509 	gtk_tree_model_get(model, b, 1, &b_string, -1);
510 	cmp_res = strcmp(a_string, b_string);
511 	g_free(a_string);
512 	g_free(b_string);
513 	return cmp_res;
514 }
515 
516 
encodings_encoding_store_new(gboolean has_detect)517 GtkTreeStore *encodings_encoding_store_new(gboolean has_detect)
518 {
519 	GtkTreeStore *store;
520 	GtkTreeIter iter_current, iter_westeuro, iter_easteuro, iter_eastasian,
521 				iter_asian, iter_utf8, iter_middleeast;
522 	GtkTreeIter *iter_parent;
523 	gint i;
524 
525 	store = gtk_tree_store_new(2, G_TYPE_INT, G_TYPE_STRING);
526 
527 	if (has_detect)
528 	{
529 		gtk_tree_store_append(store, &iter_current, NULL);
530 		gtk_tree_store_set(store, &iter_current, 0, GEANY_ENCODINGS_MAX, 1, _("Detect from file"), -1);
531 	}
532 
533 	gtk_tree_store_append(store, &iter_westeuro, NULL);
534 	gtk_tree_store_set(store, &iter_westeuro, 0, -1, 1, _("West European"), -1);
535 	gtk_tree_store_append(store, &iter_easteuro, NULL);
536 	gtk_tree_store_set(store, &iter_easteuro, 0, -1, 1, _("East European"), -1);
537 	gtk_tree_store_append(store, &iter_eastasian, NULL);
538 	gtk_tree_store_set(store, &iter_eastasian, 0, -1, 1, _("East Asian"), -1);
539 	gtk_tree_store_append(store, &iter_asian, NULL);
540 	gtk_tree_store_set(store, &iter_asian, 0, -1, 1, _("SE & SW Asian"), -1);
541 	gtk_tree_store_append(store, &iter_middleeast, NULL);
542 	gtk_tree_store_set(store, &iter_middleeast, 0, -1, 1, _("Middle Eastern"), -1);
543 	gtk_tree_store_append(store, &iter_utf8, NULL);
544 	gtk_tree_store_set(store, &iter_utf8, 0, -1, 1, _("Unicode"), -1);
545 
546 	for (i = 0; i < GEANY_ENCODINGS_MAX; i++)
547 	{
548 		gchar *encoding_string;
549 
550 		switch (encodings[i].group)
551 		{
552 			case WESTEUROPEAN: iter_parent = &iter_westeuro; break;
553 			case EASTEUROPEAN: iter_parent = &iter_easteuro; break;
554 			case EASTASIAN: iter_parent = &iter_eastasian; break;
555 			case ASIAN: iter_parent = &iter_asian; break;
556 			case MIDDLEEASTERN: iter_parent = &iter_middleeast; break;
557 			case UNICODE: iter_parent = &iter_utf8; break;
558 			case NONE:
559 			default: iter_parent = NULL;
560 		}
561 		gtk_tree_store_append(store, &iter_current, iter_parent);
562 		encoding_string = encodings_to_string(&encodings[i]);
563 		gtk_tree_store_set(store, &iter_current, 0, i, 1, encoding_string, -1);
564 		g_free(encoding_string);
565 	}
566 
567 	gtk_tree_sortable_set_sort_column_id(GTK_TREE_SORTABLE(store), 1, GTK_SORT_ASCENDING);
568 	gtk_tree_sortable_set_sort_func(GTK_TREE_SORTABLE(store), 1, encoding_combo_store_sort_func, NULL, NULL);
569 
570 	return store;
571 }
572 
573 
encodings_encoding_store_get_encoding(GtkTreeStore * store,GtkTreeIter * iter)574 gint encodings_encoding_store_get_encoding(GtkTreeStore *store, GtkTreeIter *iter)
575 {
576 	gint enc;
577 	gtk_tree_model_get(GTK_TREE_MODEL(store), iter, 0, &enc, -1);
578 	return enc;
579 }
580 
581 
encodings_encoding_store_get_iter(GtkTreeStore * store,GtkTreeIter * iter,gint enc)582 gboolean encodings_encoding_store_get_iter(GtkTreeStore *store, GtkTreeIter *iter, gint enc)
583 {
584 	if (gtk_tree_model_get_iter_first(GTK_TREE_MODEL(store), iter))
585 	{
586 		do
587 		{
588 			if (encodings_encoding_store_get_encoding(store, iter) == enc)
589 				return TRUE;
590 		}
591 		while (ui_tree_model_iter_any_next(GTK_TREE_MODEL(store), iter, TRUE));
592 	}
593 	return FALSE;
594 }
595 
596 
/* Cell data function for renderers showing the encoding store: displays the text of
 * column 1 and makes rows that have children (the group headers) insensitive. */
void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout,
											 GtkCellRenderer *cell,
											 GtkTreeModel *tree_model,
											 GtkTreeIter *iter,
											 gpointer data)
{
	gchar *label;
	gboolean is_leaf;

	is_leaf = ! gtk_tree_model_iter_has_child(tree_model, iter);
	gtk_tree_model_get(tree_model, iter, 1, &label, -1);

	g_object_set(cell, "sensitive", is_leaf, "text", label, NULL);

	g_free(label);
}
610 
611 
612 /**
613  *  Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset.
614  *  If @a fast is not set, additional checks to validate the converted string are performed.
615  *
616  *  @param buffer The input string to convert.
617  *  @param size The length of the string, or -1 if the string is nul-terminated.
618  *  @param charset The charset to be used for conversion.
619  *  @param fast @c TRUE to only convert the input and skip extended checks on the converted string.
620  *
621  *  @return If the conversion was successful, a newly allocated nul-terminated string,
622  *    which must be freed with @c g_free(). Otherwise @c NULL.
623  **/
624 GEANY_API_SYMBOL
encodings_convert_to_utf8_from_charset(const gchar * buffer,gssize size,const gchar * charset,gboolean fast)625 gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size,
626 											  const gchar *charset, gboolean fast)
627 {
628 	gchar *utf8_content = NULL;
629 	GError *conv_error = NULL;
630 	gchar* converted_contents = NULL;
631 	gsize bytes_written;
632 
633 	g_return_val_if_fail(buffer != NULL, NULL);
634 	g_return_val_if_fail(charset != NULL, NULL);
635 
636 	converted_contents = g_convert(buffer, size, "UTF-8", charset, NULL,
637 								   &bytes_written, &conv_error);
638 
639 	if (fast)
640 	{
641 		utf8_content = converted_contents;
642 		if (conv_error != NULL) g_error_free(conv_error);
643 	}
644 	else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL))
645 	{
646 		if (conv_error != NULL)
647 		{
648 			geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message);
649 			g_error_free(conv_error);
650 			conv_error = NULL;
651 		}
652 		else
653 			geany_debug("Couldn't convert from %s to UTF-8.", charset);
654 
655 		utf8_content = NULL;
656 		g_free(converted_contents);
657 	}
658 	else
659 	{
660 		geany_debug("Converted from %s to UTF-8.", charset);
661 		utf8_content = converted_contents;
662 	}
663 
664 	return utf8_content;
665 }
666 
667 
/* Runs the precompiled regexps over buffer, in pregs[] order, and returns the charset
 * found by the first one that matches (newly allocated, free with g_free()), or NULL if
 * none matches. */
static gchar *encodings_check_regexes(const gchar *buffer, gsize size)
{
	gchar *charset = NULL;

	for (guint n = 0; charset == NULL && n < G_N_ELEMENTS(pregs); n++)
		charset = regex_match(pregs[n], buffer, size);

	return charset;
}
681 
682 
/* Tries to convert size bytes of buffer from charset into validated UTF-8.
 * A NULL charset is skipped. On success a copy of charset is stored in *used_encoding
 * (if used_encoding is not NULL; a previous value is freed) and the converted text is
 * returned. Otherwise NULL is returned and *used_encoding is left alone. */
static gchar *encodings_try_convert_to_utf8(const gchar *buffer, gssize size,
		const gchar *charset, gchar **used_encoding)
{
	gchar *utf8_content;

	if (G_UNLIKELY(charset == NULL))
		return NULL;

	/* size is a gssize, so it needs the signed format macro */
	geany_debug("Trying to convert %" G_GSSIZE_FORMAT " bytes of data from %s into UTF-8.",
		size, charset);
	utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE);

	if (G_LIKELY(utf8_content != NULL) && used_encoding != NULL)
	{
		if (G_UNLIKELY(*used_encoding != NULL))
		{
			geany_debug("%s:%d", __FILE__, __LINE__);
			g_free(*used_encoding);
		}
		*used_encoding = g_strdup(charset);
	}
	return utf8_content;
}


/* Converts buffer into UTF-8 by trying candidate charsets until one succeeds:
 *  1. suggested_charset, if not NULL (normalized to the table's spelling if possible),
 *  2. the charset of the current locale, if that is not UTF-8,
 *  3. the preferred charset from file_prefs.default_open_encoding, if it is a valid
 *     index other than the "None" entry,
 *  4. every entry of encodings[] in table order, except the "None" entry.
 *
 * buffer:        the data to convert, must not be NULL
 * size:          number of bytes in buffer, or -1 if buffer is nul-terminated
 * used_encoding: optional return location for a newly allocated copy of the charset that
 *                worked
 *
 * Returns the converted text (free with g_free()), or NULL if no charset worked. */
static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size,
		const gchar *suggested_charset, gchar **used_encoding)
{
	const gchar *locale_charset = NULL;
	const gchar *charset;
	gchar *utf8_content;
	gint none_idx;
	gint preferred_charset;

	g_return_val_if_fail(buffer != NULL, NULL);

	if (size == -1)
		size = (gssize) strlen(buffer);

	none_idx = (gint) encodings[GEANY_ENCODING_NONE].idx;

	/* 1. the suggestion */
	if (suggested_charset != NULL)
	{
		charset = encodings_normalize_charset(suggested_charset);
		if (charset == NULL) /* we failed at normalizing suggested encoding, try it as is */
			charset = suggested_charset;

		utf8_content = encodings_try_convert_to_utf8(buffer, size, charset, used_encoding);
		if (utf8_content != NULL)
			return utf8_content;
	}

	/* 2. current locale is not UTF-8, we have to check this charset */
	if (! g_get_charset(&locale_charset))
	{
		utf8_content = encodings_try_convert_to_utf8(buffer, size, locale_charset, used_encoding);
		if (utf8_content != NULL)
			return utf8_content;
	}

	/* 3. the preferred charset, if specified */
	preferred_charset = file_prefs.default_open_encoding;
	if (preferred_charset != none_idx &&
		preferred_charset >= 0 &&
		preferred_charset < GEANY_ENCODINGS_MAX)
	{
		charset = encodings[preferred_charset].charset;
		geany_debug("Using preferred charset: %s", charset);

		utf8_content = encodings_try_convert_to_utf8(buffer, size, charset, used_encoding);
		if (utf8_content != NULL)
			return utf8_content;
	}

	/* 4. everything we know about */
	for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++)
	{
		if (G_UNLIKELY(i == none_idx))
			continue;

		utf8_content = encodings_try_convert_to_utf8(buffer, size, encodings[i].charset,
			used_encoding);
		if (utf8_content != NULL)
			return utf8_content;
	}

	return NULL;
}
770 
771 
772 /**
773  *  Tries to convert @a buffer into UTF-8 encoding and store the detected original encoding in
774  *  @a used_encoding.
775  *
776  *  @param buffer the input string to convert.
777  *  @param size the length of the string, or -1 if the string is nul-terminated.
778  *  @param used_encoding @out @optional return location of the detected encoding of the input string, or @c NULL.
779  *
780  *  @return @nullable If the conversion was successful, a newly allocated nul-terminated string,
781  *    which must be freed with @c g_free(). Otherwise @c NULL.
782  **/
783 GEANY_API_SYMBOL
encodings_convert_to_utf8(const gchar * buffer,gssize size,gchar ** used_encoding)784 gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_encoding)
785 {
786 	gchar *regex_charset;
787 	gchar *utf8;
788 
789 	/* first try to read the encoding from the file content */
790 	regex_charset = encodings_check_regexes(buffer, size);
791 	utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding);
792 	g_free(regex_charset);
793 
794 	return utf8;
795 }
796 
797 
798 /* If there's a BOM, return a corresponding GEANY_ENCODING_UTF_* index,
799  * otherwise GEANY_ENCODING_NONE.
800  * */
encodings_scan_unicode_bom(const gchar * string,gsize len,guint * bom_len)801 GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len)
802 {
803 	if (len >= 3)
804 	{
805 		if (bom_len)
806 			*bom_len = 3;
807 
808 		if ((guchar)string[0] == 0xef && (guchar)string[1] == 0xbb &&
809 			(guchar)string[2] == 0xbf)
810 		{
811 			return GEANY_ENCODING_UTF_8;
812 		}
813 	}
814 	if (len >= 4)
815 	{
816 		if (bom_len)
817 			*bom_len = 4;
818 
819 		if ((guchar)string[0] == 0x00 && (guchar)string[1] == 0x00 &&
820 				 (guchar)string[2] == 0xfe && (guchar)string[3] == 0xff)
821 		{
822 			return GEANY_ENCODING_UTF_32BE; /* Big endian */
823 		}
824 		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe &&
825 				 (guchar)string[2] == 0x00 && (guchar)string[3] == 0x00)
826 		{
827 			return GEANY_ENCODING_UTF_32LE; /* Little endian */
828 		}
829 		if ((string[0] == 0x2b && string[1] == 0x2f && string[2] == 0x76) &&
830 				 (string[3] == 0x38 || string[3] == 0x39 || string[3] == 0x2b || string[3] == 0x2f))
831 		{
832 			 return GEANY_ENCODING_UTF_7;
833 		}
834 	}
835 	if (len >= 2)
836 	{
837 		if (bom_len)
838 			*bom_len = 2;
839 
840 		if ((guchar)string[0] == 0xfe && (guchar)string[1] == 0xff)
841 		{
842 			return GEANY_ENCODING_UTF_16BE; /* Big endian */
843 		}
844 		if ((guchar)string[0] == 0xff && (guchar)string[1] == 0xfe)
845 		{
846 			return GEANY_ENCODING_UTF_16LE; /* Little endian */
847 		}
848 	}
849 	if (bom_len)
850 		*bom_len = 0;
851 	return GEANY_ENCODING_NONE;
852 }
853 
854 
encodings_is_unicode_charset(const gchar * string)855 gboolean encodings_is_unicode_charset(const gchar *string)
856 {
857 	if (string != NULL &&
858 		(strncmp(string, "UTF", 3) == 0 || strncmp(string, "UCS", 3) == 0))
859 	{
860 		return TRUE;
861 	}
862 	return FALSE;
863 }
864 
865 
/* State of a text buffer while its encoding is determined and it is converted to UTF-8. */
typedef struct
{
	gchar		*data;	/* null-terminated data */
	gsize		 size;	/* actual data size */
	gsize		 len;	/* string length of data */
	gchar		*enc;	/* charset name of the data, a g_strdup()'ed string once set */
	gboolean	 bom;	/* whether a BOM was recognized for the data */
	gboolean	 partial;	/* NOTE(review): not used in the code visible here; presumably
						 * marks data that could only be loaded partially - confirm */
} BufferData;
875 
876 
877 /* convert data with the specified encoding */
878 static gboolean
handle_forced_encoding(BufferData * buffer,const gchar * forced_enc)879 handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
880 {
881 	GeanyEncodingIndex enc_idx;
882 
883 	if (utils_str_equal(forced_enc, "UTF-8"))
884 	{
885 		if (! g_utf8_validate(buffer->data, buffer->len, NULL))
886 		{
887 			return FALSE;
888 		}
889 	}
890 	else
891 	{
892 		gchar *converted_text = encodings_convert_to_utf8_from_charset(
893 										buffer->data, buffer->size, forced_enc, FALSE);
894 		if (converted_text == NULL)
895 		{
896 			return FALSE;
897 		}
898 		else
899 		{
900 			SETPTR(buffer->data, converted_text);
901 			buffer->len = strlen(converted_text);
902 		}
903 	}
904 	enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
905 	buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
906 	buffer->enc = g_strdup(forced_enc);
907 	return TRUE;
908 }
909 
910 
911 /* detect encoding and convert to UTF-8 if necessary */
912 static gboolean
handle_encoding(BufferData * buffer,GeanyEncodingIndex enc_idx)913 handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
914 {
915 	g_return_val_if_fail(buffer->enc == NULL, FALSE);
916 	g_return_val_if_fail(buffer->bom == FALSE, FALSE);
917 
918 	if (buffer->size == 0)
919 	{
920 		/* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty
921 		 * e.g. UTF32 file with a BOM(so size is 4, len is 0) */
922 		buffer->enc = g_strdup("UTF-8");
923 	}
924 	else
925 	{
926 		/* first check for a BOM */
927 		if (enc_idx != GEANY_ENCODING_NONE)
928 		{
929 			buffer->enc = g_strdup(encodings[enc_idx].charset);
930 			buffer->bom = TRUE;
931 
932 			if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
933 			{
934 				gchar *converted_text = encodings_convert_to_utf8_from_charset(
935 										buffer->data, buffer->size, buffer->enc, FALSE);
936 				if (converted_text != NULL)
937 				{
938 					SETPTR(buffer->data, converted_text);
939 					buffer->len = strlen(converted_text);
940 				}
941 				else
942 				{
943 					/* there was a problem converting data from BOM encoding type */
944 					SETPTR(buffer->enc, NULL);
945 					buffer->bom = FALSE;
946 				}
947 			}
948 		}
949 
950 		if (buffer->enc == NULL)	/* either there was no BOM or the BOM encoding failed */
951 		{
952 			/* first try to read the encoding from the file content */
953 			gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
954 
955 			/* try UTF-8 first */
956 			if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
957 				(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
958 			{
959 				buffer->enc = g_strdup("UTF-8");
960 			}
961 			else
962 			{
963 				/* detect the encoding */
964 				gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
965 					buffer->size, regex_charset, &buffer->enc);
966 
967 				if (converted_text == NULL)
968 				{
969 					g_free(regex_charset);
970 					return FALSE;
971 				}
972 				SETPTR(buffer->data, converted_text);
973 				buffer->len = strlen(converted_text);
974 			}
975 			g_free(regex_charset);
976 		}
977 	}
978 	return TRUE;
979 }
980 
981 
982 static void
handle_bom(BufferData * buffer)983 handle_bom(BufferData *buffer)
984 {
985 	guint bom_len;
986 
987 	encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
988 	g_return_if_fail(bom_len != 0);
989 
990 	/* use filedata->len here because the contents are already converted into UTF-8 */
991 	buffer->len -= bom_len;
992 	/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
993 	memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
994 	buffer->data = g_realloc(buffer->data, buffer->len + 1);
995 }
996 
997 
998 /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
handle_buffer(BufferData * buffer,const gchar * forced_enc)999 static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
1000 {
1001 	GeanyEncodingIndex tmp_enc_idx;
1002 
1003 	/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
1004 	 * if we have a BOM */
1005 	tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
1006 
1007 	/* check whether the size of the loaded data is equal to the size of the file in the
1008 	 * filesystem file size may be 0 to allow opening files in /proc/ which have typically a
1009 	 * file size of 0 bytes */
1010 	if (buffer->len != buffer->size && buffer->size != 0 && (
1011 		tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
1012 		tmp_enc_idx == GEANY_ENCODING_UTF_7))  /* filter UTF-7/8 where no NULL bytes are allowed */
1013 	{
1014 		buffer->partial = TRUE;
1015 	}
1016 
1017 	/* Determine character encoding and convert to UTF-8 */
1018 	if (forced_enc != NULL)
1019 	{
1020 		/* the encoding should be ignored(requested by user), so open the file "as it is" */
1021 		if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
1022 		{
1023 			buffer->bom = FALSE;
1024 			buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
1025 		}
1026 		else if (! handle_forced_encoding(buffer, forced_enc))
1027 		{
1028 			return FALSE;
1029 		}
1030 	}
1031 	else if (! handle_encoding(buffer, tmp_enc_idx))
1032 	{
1033 		return FALSE;
1034 	}
1035 
1036 	if (buffer->bom)
1037 		handle_bom(buffer);
1038 	return TRUE;
1039 }
1040 
1041 
1042 /*
1043  * Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
1044  * and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
1045  *
1046  * @param buf a pointer to modifiable null-terminated buffer to convert.
1047  *   It may or may not be modified, and should be freed whatever happens.
1048  * @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
1049  *   file size). It will be updated to the new size.
1050  * @param forced_enc forced encoding to use, or @c NULL
1051  * @param used_encoding return location for the actually used encoding, or @c NULL
1052  * @param has_bom return location to store whether the data had a BOM, or @c NULL
1053  * @param partial return location to store whether the conversion may be partial, or @c NULL
1054  *
1055  * @return @C TRUE if the conversion succeeded, @c FALSE otherwise.
1056  */
encodings_convert_to_utf8_auto(gchar ** buf,gsize * size,const gchar * forced_enc,gchar ** used_encoding,gboolean * has_bom,gboolean * partial)1057 gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
1058 		gchar **used_encoding, gboolean *has_bom, gboolean *partial)
1059 {
1060 	BufferData buffer;
1061 
1062 	buffer.data = *buf;
1063 	buffer.size = *size;
1064 	/* use strlen to check for null chars */
1065 	buffer.len = strlen(buffer.data);
1066 	buffer.enc = NULL;
1067 	buffer.bom = FALSE;
1068 	buffer.partial = FALSE;
1069 
1070 	if (! handle_buffer(&buffer, forced_enc))
1071 		return FALSE;
1072 
1073 	*size = buffer.len;
1074 	if (used_encoding)
1075 		*used_encoding = buffer.enc;
1076 	else
1077 		g_free(buffer.enc);
1078 	if (has_bom)
1079 		*has_bom = buffer.bom;
1080 	if (partial)
1081 		*partial = buffer.partial;
1082 
1083 	*buf = buffer.data;
1084 	return TRUE;
1085 }
1086