/**************************************************************************
 *   chars.c  --  This file is part of GNU nano.                          *
 *                                                                        *
 *   Copyright (C) 2001-2011, 2013-2021 Free Software Foundation, Inc.    *
 *   Copyright (C) 2016-2021 Benno Schulenberg                            *
 *                                                                        *
 *   GNU nano is free software: you can redistribute it and/or modify     *
 *   it under the terms of the GNU General Public License as published    *
 *   by the Free Software Foundation, either version 3 of the License,    *
 *   or (at your option) any later version.                               *
 *                                                                        *
 *   GNU nano is distributed in the hope that it will be useful,          *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty          *
 *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.              *
 *   See the GNU General Public License for more details.                 *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program.  If not, see http://www.gnu.org/licenses/.  *
 *                                                                        *
 **************************************************************************/
21 
22 #include "prototypes.h"
23 
24 #include <ctype.h>
25 #include <string.h>
26 
27 #ifdef ENABLE_UTF8
28 #include <wchar.h>
29 #include <wctype.h>
30 
31 static bool use_utf8 = FALSE;
32 		/* Whether we've enabled UTF-8 support. */
33 
34 /* Enable UTF-8 support. */
utf8_init(void)35 void utf8_init(void)
36 {
37 	use_utf8 = TRUE;
38 }
39 
40 /* Is UTF-8 support enabled? */
using_utf8(void)41 bool using_utf8(void)
42 {
43 	return use_utf8;
44 }
45 #endif /* ENABLE_UTF8 */
46 
47 #ifdef ENABLE_SPELLER
/* Return TRUE when the character that starts at c is a letter.
 * A sequence that cannot be converted to a wide character is
 * never a letter. */
bool is_alpha_char(const char *c)
{
#ifdef ENABLE_UTF8
	wchar_t wide;

	return (mbtowide(&wide, c) >= 0 && iswalpha(wide));
#else
	return (isalpha((unsigned char)*c) != 0);
#endif
}
62 #endif /* ENABLE_SPELLER */
63 
/* Return TRUE when the character that starts at c is a letter or a digit.
 * A sequence that cannot be converted to a wide character is neither. */
bool is_alnum_char(const char *c)
{
#ifdef ENABLE_UTF8
	wchar_t wide;

	return (mbtowide(&wide, c) >= 0 && iswalnum(wide));
#else
	return (isalnum((unsigned char)*c) != 0);
#endif
}
78 
/* Return TRUE when the character that starts at c is a space, a tab,
 * or some other blank character. */
bool is_blank_char(const char *c)
{
#ifdef ENABLE_UTF8
	wchar_t wide;

	/* For a plain ASCII byte only space and tab qualify;
	 * no conversion is needed to decide that. */
	if ((signed char)*c >= 0)
		return (*c == ' ' || *c == '\t');

	/* An unconvertible sequence is not a blank. */
	return (mbtowide(&wide, c) >= 0 && iswblank(wide));
#else
	return (isblank((unsigned char)*c) != 0);
#endif
}
96 
97 /* Return TRUE when the given character is a control character. */
is_cntrl_char(const char * c)98 bool is_cntrl_char(const char *c)
99 {
100 #ifdef ENABLE_UTF8
101 	if (use_utf8) {
102 		return ((c[0] & 0xE0) == 0 || c[0] == DEL_CODE ||
103 				((signed char)c[0] == -62 && (signed char)c[1] < -96));
104 	} else
105 #endif
106 		return ((*c & 0x60) == 0 || *c == DEL_CODE);
107 }
108 
/* Return TRUE when the character that starts at c is punctuation.
 * A sequence that cannot be converted to a wide character is not. */
bool is_punct_char(const char *c)
{
#ifdef ENABLE_UTF8
	wchar_t wide;

	return (mbtowide(&wide, c) >= 0 && iswpunct(wide));
#else
	return (ispunct((unsigned char)*c) != 0);
#endif
}
123 
124 /* Return TRUE when the given character is word-forming (it is alphanumeric or
125  * specified in 'wordchars', or it is punctuation when allow_punct is TRUE). */
is_word_char(const char * c,bool allow_punct)126 bool is_word_char(const char *c, bool allow_punct)
127 {
128 	if (*c == '\0')
129 		return FALSE;
130 
131 	if (is_alnum_char(c))
132 		return TRUE;
133 
134 	if (allow_punct && is_punct_char(c))
135 		return TRUE;
136 
137 	if (word_chars != NULL && *word_chars != '\0') {
138 		char symbol[MAXCHARLEN + 1];
139 		int symlen = collect_char(c, symbol);
140 
141 		symbol[symlen] = '\0';
142 		return (strstr(word_chars, symbol) != NULL);
143 	} else
144 		return FALSE;
145 }
146 
147 /* Return the visible representation of control character c. */
control_rep(const signed char c)148 char control_rep(const signed char c)
149 {
150 	if (c == DEL_CODE)
151 		return '?';
152 	else if (c == -97)
153 		return '=';
154 	else if (c < 0)
155 		return c + 224;
156 	else
157 		return c + 64;
158 }
159 
160 /* Return the visible representation of multibyte control character c. */
control_mbrep(const char * c,bool isdata)161 char control_mbrep(const char *c, bool isdata)
162 {
163 	/* An embedded newline is an encoded NUL if it is data. */
164 	if (*c == '\n' && (isdata || as_an_at))
165 		return '@';
166 
167 #ifdef ENABLE_UTF8
168 	if (use_utf8) {
169 		if ((unsigned char)c[0] < 128)
170 			return control_rep(c[0]);
171 		else
172 			return control_rep(c[1]);
173 	} else
174 #endif
175 		return control_rep(*c);
176 }
177 
178 #ifdef ENABLE_UTF8
179 /* Convert the given multibyte sequence c to wide character wc, and return
180  * the number of bytes in the sequence, or -1 for an invalid sequence. */
mbtowide(wchar_t * wc,const char * c)181 int mbtowide(wchar_t *wc, const char *c)
182 {
183 	if ((signed char)*c < 0 && use_utf8) {
184 		unsigned char v1 = (unsigned char)c[0];
185 		unsigned char v2 = (unsigned char)c[1] ^ 0x80;
186 
187 		if (v2 > 0x3F || v1 < 0xC2)
188 			return -1;
189 
190 		if (v1 < 0xE0) {
191 			*wc = (((unsigned int)(v1 & 0x1F) << 6) | (unsigned int)v2);
192 			return 2;
193 		}
194 
195 		unsigned char v3 = (unsigned char)c[2] ^ 0x80;
196 
197 		if (v3 > 0x3F)
198 			return -1;
199 
200 		if (v1 < 0xF0) {
201 			if ((v1 > 0xE0 || v2 >= 0x20) && (v1 != 0xED || v2 < 0x20)) {
202 				*wc = (((unsigned int)(v1 & 0x0F) << 12) |
203 							((unsigned int)v2 << 6) | (unsigned int)v3);
204 				return 3;
205 			} else
206 				return -1;
207 		}
208 
209 		unsigned char v4 = (unsigned char)c[3] ^ 0x80;
210 
211 		if (v4 > 0x3F || v1 > 0xF4)
212 			return -1;
213 
214 		if ((v1 > 0xF0 || v2 >= 0x10) && (v1 != 0xF4 || v2 < 0x10)) {
215 			*wc = (((unsigned int)(v1 & 0x07) << 18) | ((unsigned int)v2 << 12) |
216 							((unsigned int)v3 << 6) | (unsigned int)v4);
217 			return 4;
218 		} else
219 			return -1;
220 	}
221 
222 	*wc = (unsigned int)*c;
223 	return 1;
224 }
225 
226 /* Return TRUE when the given character occupies two cells. */
is_doublewidth(const char * ch)227 bool is_doublewidth(const char *ch)
228 {
229 	wchar_t wc;
230 
231 	/* Only from U+1100 can code points have double width. */
232 	if ((unsigned char)*ch < 0xE1 || !use_utf8)
233 		return FALSE;
234 
235 	if (mbtowide(&wc, ch) < 0)
236 		return FALSE;
237 
238 	return (wcwidth(wc) == 2);
239 }
240 
241 /* Return TRUE when the given character occupies zero cells. */
is_zerowidth(const char * ch)242 bool is_zerowidth(const char *ch)
243 {
244 	wchar_t wc;
245 
246 	/* Only from U+0300 can code points have zero width. */
247 	if ((unsigned char)*ch < 0xCC || !use_utf8)
248 		return FALSE;
249 
250 	if (mbtowide(&wc, ch) < 0)
251 		return FALSE;
252 
253 #if defined(__OpenBSD__)
254 	/* Work around an OpenBSD bug -- see https://sv.gnu.org/bugs/?60393. */
255 	if (wc >= 0xF0000)
256 		return FALSE;
257 #endif
258 
259 	return (wcwidth(wc) == 0);
260 }
261 #endif /* ENABLE_UTF8 */
262 
/* Return the number of bytes in the character that starts at *pointer.
 * The result is always between 1 and 4: an invalid or truncated UTF-8
 * sequence counts as a single byte, and so does every byte when UTF-8
 * is not enabled.  (The terminating NUL also gives 1.) */
int char_length(const char *pointer)
{
#ifdef ENABLE_UTF8
	const unsigned char *bytes = (const unsigned char *)pointer;

	/* ASCII, stray continuation bytes, and the starter bytes 0xC0
	 * and 0xC1 always stand alone. */
	if (!use_utf8 || bytes[0] < 0xC2)
		return 1;

	const unsigned char lead = bytes[0];
	const unsigned char second = bytes[1];

	/* Each next byte is inspected only after the previous one has
	 * proven to be a continuation byte (0x80..0xBF). */
	if ((second ^ 0x80) > 0x3F)
		return 1;

	if (lead < 0xE0)
		return 2;

	if ((bytes[2] ^ 0x80) > 0x3F)
		return 1;

	if (lead < 0xF0) {
		const bool overlong = (lead == 0xE0 && second < 0xA0);
		const bool surrogate = (lead == 0xED && second >= 0xA0);

		return (overlong || surrogate) ? 1 : 3;
	}

	if ((bytes[3] ^ 0x80) > 0x3F)
		return 1;

	if (lead > 0xF4)
		return 1;

	{
		const bool overlong = (lead == 0xF0 && second < 0x90);
		const bool too_high = (lead == 0xF4 && second >= 0x90);

		return (overlong || too_high) ? 1 : 4;
	}
#else
	return 1;
#endif
}
299 
/* Return the number of (multibyte) characters in the given string,
 * not counting the terminating NUL. */
size_t mbstrlen(const char *pointer)
{
	size_t tally;

	for (tally = 0; *pointer != '\0'; tally++)
		pointer += char_length(pointer);

	return tally;
}
312 
/* Copy the bytes of the first character of the given string into thechar,
 * and return how many bytes that were.  No terminating NUL gets added, so
 * the caller must do that when it wants a string. */
int collect_char(const char *string, char *thechar)
{
	const int charlen = char_length(string);
	int index = 0;

	while (index < charlen) {
		thechar[index] = string[index];
		index++;
	}

	return charlen;
}
324 
325 /* Return the length (in bytes) of the character at the start of
326  * the given string, and add this character's width to *column. */
advance_over(const char * string,size_t * column)327 int advance_over(const char *string, size_t *column)
328 {
329 #ifdef ENABLE_UTF8
330 	if ((signed char)*string < 0 && use_utf8) {
331 		/* A UTF-8 upper control code has two bytes and takes two columns. */
332 		if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) {
333 			*column += 2;
334 			return 2;
335 		} else {
336 			wchar_t wc;
337 			int charlen = mbtowide(&wc, string);
338 
339 			if (charlen < 0) {
340 				*column += 1;
341 				return 1;
342 			}
343 
344 			int width = wcwidth(wc);
345 
346 #if defined(__OpenBSD__)
347 			*column += (width < 0 || wc >= 0xF0000) ? 1 : width;
348 #else
349 			*column += (width < 0) ? 1 : width;
350 #endif
351 			return charlen;
352 		}
353 	}
354 #endif
355 
356 	if ((unsigned char)*string < 0x20) {
357 		if (*string == '\t')
358 			*column += tabsize - *column % tabsize;
359 		else
360 			*column += 2;
361 	} else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0)
362 		*column += 2;
363 	else
364 		*column += 1;
365 
366 	return 1;
367 }
368 
/* Return the index in buf of the start of the multibyte character that
 * precedes the one at pos.  For a pos of zero the result is zero. */
size_t step_left(const char *buf, size_t pos)
{
#ifdef ENABLE_UTF8
	if (use_utf8) {
		size_t scan = 0, charlen = 0;

		if (pos >= 4) {
			size_t back;

			/* When none of the preceding four bytes can start
			 * a sequence, settle for the byte right before pos. */
			scan = pos - 1;

			/* Look backward for the nearest byte that is not a
			 * continuation byte (those are 0x80..0xBF). */
			for (back = 1; back <= 4; back++)
				if ((signed char)buf[pos - back] > -65) {
					scan = pos - back;
					break;
				}
		}

		/* Walk forward again until the original position is reached
		 * (or passed); the last step taken tells where the preceding
		 * character began. */
		while (scan < pos) {
			charlen = char_length(buf + scan);
			scan += charlen;
		}

		return scan - charlen;
	}
#endif
	return (pos == 0) ? 0 : pos - 1;
}
407 
/* Return the index in buf of the start of the multibyte character that
 * follows the one at pos: pos plus the byte length of that character. */
size_t step_right(const char *buf, size_t pos)
{
	const char *here = buf + pos;

	return pos + char_length(here);
}
414 
415 /* This function is equivalent to strcasecmp() for multibyte strings. */
mbstrcasecmp(const char * s1,const char * s2)416 int mbstrcasecmp(const char *s1, const char *s2)
417 {
418 	return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE);
419 }
420 
/* Compare at most n characters of two multibyte strings while ignoring
 * case, like strncasecmp().  Return zero when they are equal, and a
 * nonzero value (whose sign tells the ordering) when they differ. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
	if (use_utf8) {
		wchar_t wide1, wide2;

		for (; *s1 != '\0' && *s2 != '\0' && n > 0; n--) {
			/* Two ASCII bytes can be compared without any conversion. */
			if ((signed char)*s1 >= 0 && (signed char)*s2 >= 0) {
				/* Masking with 0x5F folds a lowercase letter onto
				 * its uppercase counterpart. */
				const int upper1 = (*s1 & 0x5F);
				const int upper2 = (*s2 & 0x5F);
				const bool letter1 = ('A' <= upper1 && upper1 <= 'Z');
				const bool letter2 = ('A' <= upper2 && upper2 <= 'Z');

				if (letter1 && letter2) {
					if (upper1 != upper2)
						return (upper1 - upper2);
				} else if (letter1)
					return ((*s1 | 0x20) - *s2);
				else if (letter2)
					return (*s1 - (*s2 | 0x20));
				else if (*s1 != *s2)
					return (*s1 - *s2);

				s1++;
				s2++;
				continue;
			}

			const bool invalid1 = (mbtowide(&wide1, s1) < 0);
			const bool invalid2 = (mbtowide(&wide2, s2) < 0);

			if (invalid1 || invalid2) {
				/* With an invalid sequence involved, compare the raw first
				 * bytes; when those are equal, the invalid one sorts last. */
				if (*s1 != *s2)
					return (unsigned char)*s1 - (unsigned char)*s2;

				if (invalid1 != invalid2)
					return (invalid1 ? 1 : -1);
			} else {
				const int gap = towlower(wide1) - towlower(wide2);

				if (gap != 0)
					return gap;
			}

			s1 += char_length(s1);
			s2 += char_length(s2);
		}

		/* When the limit was not reached, at least one string has ended;
		 * the bytes at the stopping points settle the outcome. */
		return (n > 0) ? ((unsigned char)*s1 - (unsigned char)*s2) : 0;
	}
#endif
	return strncasecmp(s1, s2, n);
}
471 
/* Find the first occurrence of needle in haystack while ignoring case,
 * like strcasestr() but for multibyte strings.  Return a pointer to the
 * match, or NULL when there is none. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
	if (use_utf8) {
		const size_t needle_len = mbstrlen(needle);
		const char *spot;

		/* Try every character position in turn. */
		for (spot = haystack; *spot != '\0'; spot += char_length(spot))
			if (mbstrncasecmp(spot, needle, needle_len) == 0)
				return (char *)spot;

		return NULL;
	}
#endif
	return (char *)strcasestr(haystack, needle);
}
491 
492 /* This function is equivalent to strstr(), except in that it scans the
493  * string in reverse, starting at pointer. */
revstrstr(const char * haystack,const char * needle,const char * pointer)494 char *revstrstr(const char *haystack, const char *needle,
495 		const char *pointer)
496 {
497 	size_t needle_len = strlen(needle);
498 	size_t tail_len = strlen(pointer);
499 
500 	if (tail_len < needle_len)
501 		pointer -= (needle_len - tail_len);
502 
503 	while (pointer >= haystack) {
504 		if (strncmp(pointer, needle, needle_len) == 0)
505 			return (char *)pointer;
506 		pointer--;
507 	}
508 
509 	return NULL;
510 }
511 
/* Search backwards through haystack for needle while ignoring case, like
 * a reversed strcasestr().  The search starts at pointer (which must point
 * into haystack), or at the last position where needle can still fit before
 * the end of the string, whichever comes first.  Return a pointer to the
 * match nearest to that starting position, or NULL when there is no match
 * at or before it. */
char *revstrcasestr(const char *haystack, const char *needle,
		const char *pointer)
{
	size_t needle_len = strlen(needle);
	size_t tail_len = strlen(pointer);
	size_t offset;

	if (pointer < haystack)
		return NULL;

	/* Work with an offset instead of a moving pointer, so that no
	 * pointer to before the start of haystack is ever formed. */
	offset = (size_t)(pointer - haystack);

	/* When the tail is too short to hold needle, back up far enough. */
	if (tail_len < needle_len) {
		size_t shortfall = needle_len - tail_len;

		if (shortfall > offset)
			return NULL;

		offset -= shortfall;
	}

	for (;;) {
		if (strncasecmp(haystack + offset, needle, needle_len) == 0)
			return (char *)(haystack + offset);

		if (offset == 0)
			return NULL;

		offset--;
	}
}
531 
/* Search backwards through haystack for needle while ignoring case, like
 * a reversed strcasestr() for multibyte strings.  The search starts at
 * pointer, which must point into haystack.  Return a pointer to the
 * match, or NULL when there is none at or before the starting position. */
char *mbrevstrcasestr(const char *haystack, const char *needle,
		const char *pointer)
{
#ifdef ENABLE_UTF8
	if (use_utf8) {
		size_t needle_len = mbstrlen(needle);
		size_t tail_len = mbstrlen(pointer);

		if (pointer < haystack)
			return NULL;

		/* When the tail holds fewer characters than needle, back up.
		 * Verify first that there is enough room, so that no pointer
		 * to before the start of haystack is ever formed.  (The step is
		 * counted in characters but taken in bytes, so it is at most as
		 * large as needed and may end up inside a multibyte character.) */
		if (tail_len < needle_len) {
			size_t shortfall = needle_len - tail_len;

			if ((size_t)(pointer - haystack) < shortfall)
				return NULL;

			pointer -= shortfall;
		}

		while (TRUE) {
			if (mbstrncasecmp(pointer, needle, needle_len) == 0)
				return (char *)pointer;

			if (pointer == haystack)
				return NULL;

			pointer = haystack + step_left(haystack, pointer - haystack);
		}
	} else
#endif
		return revstrcasestr(haystack, needle, pointer);
}
561 
562 #if !defined(NANO_TINY) || defined(ENABLE_JUSTIFY)
/* Find the first occurrence in string of the (multibyte) character that
 * chr points at, like strchr() for multibyte strings.  Return a pointer
 * to the match, or NULL when the character does not occur.
 *
 * In UTF-8 mode an invalid byte matches only an invalid byte of the same
 * value, never a valid character that happens to have the same code. */
char *mbstrchr(const char *string, const char *chr)
{
#ifdef ENABLE_UTF8
	if (use_utf8) {
		bool bad_c = FALSE;
		wchar_t ws, wc;

		if (mbtowide(&wc, chr) < 0) {
			wc = (unsigned char)*chr;
			bad_c = TRUE;
		}

		while (*string != '\0') {
			/* Judge the validity of each character afresh, so that one
			 * invalid byte does not taint the characters that follow. */
			bool bad_s = FALSE;
			int symlen = mbtowide(&ws, string);

			if (symlen < 0) {
				ws = (unsigned char)*string;
				bad_s = TRUE;
				/* Step over just the offending byte; adding the -1
				 * would move the scan backwards instead. */
				symlen = 1;
			}

			if (ws == wc && bad_s == bad_c)
				return (char *)string;

			string += symlen;
		}

		return NULL;
	} else
#endif
		return strchr(string, *chr);
}
598 #endif /* !NANO_TINY || ENABLE_JUSTIFY */
599 
600 #ifndef NANO_TINY
/* Search forward through the given string for the first character that
 * also occurs in accept.  Return a pointer to it, or NULL when none of
 * the characters of the string occurs in accept. */
char *mbstrpbrk(const char *string, const char *accept)
{
	const char *spot;

	for (spot = string; *spot != '\0'; spot += char_length(spot))
		if (mbstrchr(accept, spot) != NULL)
			return (char *)spot;

	return NULL;
}
614 
/* Search backwards through the string that starts at head, beginning at
 * pointer, for the first character that also occurs in accept.  Return
 * a pointer to it, or NULL when the head is reached without a match. */
char *mbrevstrpbrk(const char *head, const char *accept, const char *pointer)
{
	size_t index = pointer - head;

	/* When starting on the terminating NUL, first move one character back. */
	if (head[index] == '\0') {
		if (index == 0)
			return NULL;

		index = step_left(head, index);
	}

	while (mbstrchr(accept, head + index) == NULL) {
		/* If we've reached the head of the string, we found nothing. */
		if (index == 0)
			return NULL;

		index = step_left(head, index);
	}

	return (char *)(head + index);
}
636 #endif /* !NANO_TINY */
637 
638 #if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || defined(ENABLE_JUSTIFY))
639 /* Return TRUE if the given string contains at least one blank character. */
has_blank_char(const char * string)640 bool has_blank_char(const char *string)
641 {
642 	while (*string != '\0' && !is_blank_char(string))
643 		string += char_length(string);
644 
645 	return *string;
646 }
647 #endif
648 
649 /* Return TRUE when the given string is empty or consists of only blanks. */
white_string(const char * string)650 bool white_string(const char *string)
651 {
652 	while (*string != '\0' && (is_blank_char(string) || *string == '\r'))
653 		string += char_length(string);
654 
655 	return !*string;
656 }
657