1 // Scintilla source code edit control
2 /** @file WordList.cxx
3  ** Hold a list of words.
4  **/
5 // Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstdlib>
9 #include <cassert>
10 #include <cstring>
11 
12 #include <algorithm>
13 #include <iterator>
14 
15 #include "WordList.h"
16 
17 using namespace Scintilla;
18 
19 /**
20  * Creates an array that points into each word in the string and puts \0 terminators
21  * after each word.
22  */
ArrayFromWordList(char * wordlist,int * len,bool onlyLineEnds=false)23 static char **ArrayFromWordList(char *wordlist, int *len, bool onlyLineEnds = false) {
24 	int prev = '\n';
25 	int words = 0;
26 	// For rapid determination of whether a character is a separator, build
27 	// a look up table.
28 	bool wordSeparator[256] = {};	// Initialise all to false.
29 	wordSeparator[static_cast<unsigned int>('\r')] = true;
30 	wordSeparator[static_cast<unsigned int>('\n')] = true;
31 	if (!onlyLineEnds) {
32 		wordSeparator[static_cast<unsigned int>(' ')] = true;
33 		wordSeparator[static_cast<unsigned int>('\t')] = true;
34 	}
35 	for (int j = 0; wordlist[j]; j++) {
36 		const int curr = static_cast<unsigned char>(wordlist[j]);
37 		if (!wordSeparator[curr] && wordSeparator[prev])
38 			words++;
39 		prev = curr;
40 	}
41 	char **keywords = new char *[words + 1];
42 	int wordsStore = 0;
43 	const size_t slen = strlen(wordlist);
44 	if (words) {
45 		prev = '\0';
46 		for (size_t k = 0; k < slen; k++) {
47 			if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) {
48 				if (!prev) {
49 					keywords[wordsStore] = &wordlist[k];
50 					wordsStore++;
51 				}
52 			} else {
53 				wordlist[k] = '\0';
54 			}
55 			prev = wordlist[k];
56 		}
57 	}
58 	assert(wordsStore < (words + 1));
59 	keywords[wordsStore] = &wordlist[slen];
60 	*len = wordsStore;
61 	return keywords;
62 }
63 
WordList(bool onlyLineEnds_)64 WordList::WordList(bool onlyLineEnds_) :
65 	words(0), list(0), len(0), onlyLineEnds(onlyLineEnds_) {
66 	// Prevent warnings by static analyzers about uninitialized starts.
67 	starts[0] = -1;
68 }
69 
~WordList()70 WordList::~WordList() {
71 	Clear();
72 }
73 
operator bool() const74 WordList::operator bool() const {
75 	return len ? true : false;
76 }
77 
operator !=(const WordList & other) const78 bool WordList::operator!=(const WordList &other) const {
79 	if (len != other.len)
80 		return true;
81 	for (int i=0; i<len; i++) {
82 		if (strcmp(words[i], other.words[i]) != 0)
83 			return true;
84 	}
85 	return false;
86 }
87 
Length() const88 int WordList::Length() const {
89 	return len;
90 }
91 
Clear()92 void WordList::Clear() {
93 	if (words) {
94 		delete []list;
95 		delete []words;
96 	}
97 	words = 0;
98 	list = 0;
99 	len = 0;
100 }
101 
102 #ifdef _MSC_VER
103 
cmpWords(const char * a,const char * b)104 static bool cmpWords(const char *a, const char *b) {
105 	return strcmp(a, b) < 0;
106 }
107 
108 #else
109 
cmpWords(const void * a,const void * b)110 static int cmpWords(const void *a, const void *b) {
111 	return strcmp(*static_cast<const char * const *>(a), *static_cast<const char * const *>(b));
112 }
113 
SortWordList(char ** words,unsigned int len)114 static void SortWordList(char **words, unsigned int len) {
115 	qsort(words, len, sizeof(*words), cmpWords);
116 }
117 
118 #endif
119 
Set(const char * s)120 void WordList::Set(const char *s) {
121 	Clear();
122 	const size_t lenS = strlen(s) + 1;
123 	list = new char[lenS];
124 	memcpy(list, s, lenS);
125 	words = ArrayFromWordList(list, &len, onlyLineEnds);
126 #ifdef _MSC_VER
127 	std::sort(words, words + len, cmpWords);
128 #else
129 	SortWordList(words, len);
130 #endif
131 	std::fill(starts, std::end(starts), -1);
132 	for (int l = len - 1; l >= 0; l--) {
133 		unsigned char indexChar = words[l][0];
134 		starts[indexChar] = l;
135 	}
136 }
137 
138 /** Check whether a string is in the list.
139  * List elements are either exact matches or prefixes.
140  * Prefix elements start with '^' and match all strings that start with the rest of the element
141  * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'.
142  */
InList(const char * s) const143 bool WordList::InList(const char *s) const {
144 	if (0 == words)
145 		return false;
146 	const unsigned char firstChar = s[0];
147 	int j = starts[firstChar];
148 	if (j >= 0) {
149 		while (words[j][0] == firstChar) {
150 			if (s[1] == words[j][1]) {
151 				const char *a = words[j] + 1;
152 				const char *b = s + 1;
153 				while (*a && *a == *b) {
154 					a++;
155 					b++;
156 				}
157 				if (!*a && !*b)
158 					return true;
159 			}
160 			j++;
161 		}
162 	}
163 	j = starts[static_cast<unsigned int>('^')];
164 	if (j >= 0) {
165 		while (words[j][0] == '^') {
166 			const char *a = words[j] + 1;
167 			const char *b = s;
168 			while (*a && *a == *b) {
169 				a++;
170 				b++;
171 			}
172 			if (!*a)
173 				return true;
174 			j++;
175 		}
176 	}
177 	return false;
178 }
179 
180 /** similar to InList, but word s can be a substring of keyword.
181  * eg. the keyword define is defined as def~ine. This means the word must start
182  * with def to be a keyword, but also defi, defin and define are valid.
183  * The marker is ~ in this case.
184  */
InListAbbreviated(const char * s,const char marker) const185 bool WordList::InListAbbreviated(const char *s, const char marker) const {
186 	if (0 == words)
187 		return false;
188 	const unsigned char firstChar = s[0];
189 	int j = starts[firstChar];
190 	if (j >= 0) {
191 		while (words[j][0] == firstChar) {
192 			bool isSubword = false;
193 			int start = 1;
194 			if (words[j][1] == marker) {
195 				isSubword = true;
196 				start++;
197 			}
198 			if (s[1] == words[j][start]) {
199 				const char *a = words[j] + start;
200 				const char *b = s + 1;
201 				while (*a && *a == *b) {
202 					a++;
203 					if (*a == marker) {
204 						isSubword = true;
205 						a++;
206 					}
207 					b++;
208 				}
209 				if ((!*a || isSubword) && !*b)
210 					return true;
211 			}
212 			j++;
213 		}
214 	}
215 	j = starts[static_cast<unsigned int>('^')];
216 	if (j >= 0) {
217 		while (words[j][0] == '^') {
218 			const char *a = words[j] + 1;
219 			const char *b = s;
220 			while (*a && *a == *b) {
221 				a++;
222 				b++;
223 			}
224 			if (!*a)
225 				return true;
226 			j++;
227 		}
228 	}
229 	return false;
230 }
231 
232 /** similar to InListAbbreviated, but word s can be a abridged version of a keyword.
233 * eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of
234 * "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid.
235 * Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid.
236 * The marker is ~ in this case.
237 * No multiple markers check is done and wont work.
238 */
InListAbridged(const char * s,const char marker) const239 bool WordList::InListAbridged(const char *s, const char marker) const {
240 	if (0 == words)
241 		return false;
242 	const unsigned char firstChar = s[0];
243 	int j = starts[firstChar];
244 	if (j >= 0) {
245 		while (words[j][0] == firstChar) {
246 			const char *a = words[j];
247 			const char *b = s;
248 			while (*a && *a == *b) {
249 				a++;
250 				if (*a == marker) {
251 					a++;
252 					const size_t suffixLengthA = strlen(a);
253 					const size_t suffixLengthB = strlen(b);
254 					if (suffixLengthA >= suffixLengthB)
255 						break;
256 					b = b + suffixLengthB - suffixLengthA - 1;
257 				}
258 				b++;
259 			}
260 			if (!*a  && !*b)
261 				return true;
262 			j++;
263 		}
264 	}
265 
266 	j = starts[static_cast<unsigned int>(marker)];
267 	if (j >= 0) {
268 		while (words[j][0] == marker) {
269 			const char *a = words[j] + 1;
270 			const char *b = s;
271 			const size_t suffixLengthA = strlen(a);
272 			const size_t suffixLengthB = strlen(b);
273 			if (suffixLengthA > suffixLengthB) {
274 				j++;
275 				continue;
276 			}
277 			b = b + suffixLengthB - suffixLengthA;
278 
279 			while (*a && *a == *b) {
280 				a++;
281 				b++;
282 			}
283 			if (!*a && !*b)
284 				return true;
285 			j++;
286 		}
287 	}
288 
289 	return false;
290 }
291 
WordAt(int n) const292 const char *WordList::WordAt(int n) const {
293 	return words[n];
294 }
295 
296