1 #include <string.h>
2 
3 #include "iregex.h"
4 
5 struct _MatchInfo {
6 	const char *valid_string;
7 	GMatchInfo *g_match_info;
8 };
9 
10 static const gchar *
make_valid_utf8(const gchar * text,gboolean * free_ret)11 make_valid_utf8(const gchar *text, gboolean *free_ret)
12 {
13 	GString *str;
14 	const gchar *ptr;
15 	if (g_utf8_validate(text, -1, NULL)) {
16 		if (free_ret)
17 			*free_ret = FALSE;
18 		return text;
19 	}
20 
21 	str = g_string_sized_new(strlen(text) + 12);
22 
23 	ptr = text;
24 	while (*ptr) {
25 		gunichar c = g_utf8_get_char_validated(ptr, -1);
26 		/* the unicode is invalid */
27 		if (c == (gunichar)-1 || c == (gunichar)-2) {
28 			/* encode the byte into PUA-A */
29 			g_string_append_unichar(str, (gunichar) (0xfff00 | (*ptr & 0xff)));
30 			ptr++;
31 		} else {
32 			g_string_append_unichar(str, c);
33 			ptr = g_utf8_next_char(ptr);
34 		}
35 	}
36 
37 	if (free_ret)
38 		*free_ret = TRUE;
39 	return g_string_free(str, FALSE);
40 }
41 
42 Regex *
i_regex_new(const gchar * pattern,GRegexCompileFlags compile_options,GRegexMatchFlags match_options,GError ** error)43 i_regex_new (const gchar *pattern,
44              GRegexCompileFlags compile_options,
45              GRegexMatchFlags match_options,
46              GError **error)
47 {
48 	const gchar *valid_pattern;
49 	gboolean free_valid_pattern;
50 	Regex *ret = NULL;
51 
52 	valid_pattern = make_valid_utf8(pattern, &free_valid_pattern);
53 	ret = g_regex_new(valid_pattern, compile_options, match_options, error);
54 
55 	if (free_valid_pattern)
56 		g_free_not_null((gchar *)valid_pattern);
57 
58 	return ret;
59 }
60 
61 void
i_regex_unref(Regex * regex)62 i_regex_unref (Regex *regex)
63 {
64 	g_regex_unref(regex);
65 }
66 
67 gboolean
i_regex_match(const Regex * regex,const gchar * string,GRegexMatchFlags match_options,MatchInfo ** match_info)68 i_regex_match (const Regex *regex,
69                const gchar *string,
70                GRegexMatchFlags match_options,
71                MatchInfo **match_info)
72 {
73 	gboolean ret;
74 	gboolean free_valid_string;
75 	const gchar *valid_string = make_valid_utf8(string, &free_valid_string);
76 
77 	if (match_info != NULL)
78 		*match_info = g_new0(MatchInfo, 1);
79 
80 	ret = g_regex_match(regex, valid_string, match_options,
81 			    match_info != NULL ? &(*match_info)->g_match_info : NULL);
82 
83 	if (free_valid_string) {
84 		if (match_info != NULL)
85 			(*match_info)->valid_string = valid_string;
86 		else
87 			g_free_not_null((gchar *)valid_string);
88 	}
89 
90 	return ret;
91 }
92 
93 static gsize
strlen_pua_oddly(const char * str)94 strlen_pua_oddly(const char *str)
95 {
96 	const gchar *ptr;
97 	gsize ret = 0;
98 	ptr = str;
99 
100 	while (*ptr) {
101 		const gchar *old;
102 		gunichar c = g_utf8_get_char(ptr);
103 		old = ptr;
104 		ptr = g_utf8_next_char(ptr);
105 
106 		/* it is our PUA encoded byte */
107 		if ((c & 0xfff00) == 0xfff00)
108 			ret++;
109 		else
110 			ret += ptr - old;
111 	}
112 
113 	return ret;
114 }
115 
116 /* new_string should be passed in here from the i_regex_match call.
117    The start_pos and end_pos will then be calculated as if they were on
118    the original string */
119 gboolean
i_match_info_fetch_pos(const MatchInfo * match_info,gint match_num,gint * start_pos,gint * end_pos)120 i_match_info_fetch_pos (const MatchInfo *match_info,
121                         gint match_num,
122                         gint *start_pos,
123                         gint *end_pos)
124 {
125 	gint tmp_start, tmp_end, new_start_pos;
126 	gboolean ret;
127 
128 	if (!match_info->valid_string || (!start_pos && !end_pos))
129 		return g_match_info_fetch_pos(match_info->g_match_info,
130 					      match_num, start_pos, end_pos);
131 
132 	ret = g_match_info_fetch_pos(match_info->g_match_info,
133 				     match_num, &tmp_start, &tmp_end);
134 	if (start_pos || end_pos) {
135 		const gchar *str = match_info->valid_string;
136 		gchar *to_start = g_strndup(str, tmp_start);
137 		new_start_pos = strlen_pua_oddly(to_start);
138 		g_free_not_null(to_start);
139 
140 		if (start_pos)
141 			*start_pos = new_start_pos;
142 
143 		if (end_pos) {
144 			gchar *to_end = g_strndup(str + tmp_start, tmp_end - tmp_start);
145 			*end_pos = new_start_pos + strlen_pua_oddly(to_end);
146 			g_free_not_null(to_end);
147 		}
148 	}
149 	return ret;
150 }
151 
152 gboolean
i_match_info_matches(const MatchInfo * match_info)153 i_match_info_matches (const MatchInfo *match_info)
154 {
155 	g_return_val_if_fail(match_info != NULL, FALSE);
156 
157 	return g_match_info_matches(match_info->g_match_info);
158 }
159 
160 void
i_match_info_free(MatchInfo * match_info)161 i_match_info_free (MatchInfo *match_info)
162 {
163 	g_match_info_free(match_info->g_match_info);
164 	g_free(match_info);
165 }
166