1 // Scintilla source code edit control
2 /** @file WordList.cxx
3  ** Hold a list of words.
4  **/
5 // Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstdlib>
9 #include <cassert>
10 #include <cstring>
11 
12 #include <algorithm>
13 #include <iterator>
14 
15 #include "WordList.h"
16 
17 using namespace Scintilla;
18 
19 /**
20  * Creates an array that points into each word in the string and puts \0 terminators
21  * after each word.
22  */
ArrayFromWordList(char * wordlist,size_t slen,int * len,bool onlyLineEnds=false)23 static char **ArrayFromWordList(char *wordlist, size_t slen, int *len, bool onlyLineEnds = false) {
24 	int prev = '\n';
25 	int words = 0;
26 	// For rapid determination of whether a character is a separator, build
27 	// a look up table.
28 	bool wordSeparator[256] = {};	// Initialise all to false.
29 	wordSeparator[static_cast<unsigned int>('\r')] = true;
30 	wordSeparator[static_cast<unsigned int>('\n')] = true;
31 	if (!onlyLineEnds) {
32 		wordSeparator[static_cast<unsigned int>(' ')] = true;
33 		wordSeparator[static_cast<unsigned int>('\t')] = true;
34 	}
35 	for (int j = 0; wordlist[j]; j++) {
36 		const int curr = static_cast<unsigned char>(wordlist[j]);
37 		if (!wordSeparator[curr] && wordSeparator[prev])
38 			words++;
39 		prev = curr;
40 	}
41 	char **keywords = new char *[words + 1];
42 	int wordsStore = 0;
43 	if (words) {
44 		prev = '\0';
45 		for (size_t k = 0; k < slen; k++) {
46 			if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) {
47 				if (!prev) {
48 					keywords[wordsStore] = &wordlist[k];
49 					wordsStore++;
50 				}
51 			} else {
52 				wordlist[k] = '\0';
53 			}
54 			prev = wordlist[k];
55 		}
56 	}
57 	assert(wordsStore < (words + 1));
58 	keywords[wordsStore] = &wordlist[slen];
59 	*len = wordsStore;
60 	return keywords;
61 }
62 
WordList(bool onlyLineEnds_)63 WordList::WordList(bool onlyLineEnds_) :
64 	words(0), list(0), len(0), onlyLineEnds(onlyLineEnds_) {
65 	// Prevent warnings by static analyzers about uninitialized starts.
66 	starts[0] = -1;
67 }
68 
~WordList()69 WordList::~WordList() {
70 	Clear();
71 }
72 
operator bool() const73 WordList::operator bool() const {
74 	return len ? true : false;
75 }
76 
operator !=(const WordList & other) const77 bool WordList::operator!=(const WordList &other) const {
78 	if (len != other.len)
79 		return true;
80 	for (int i=0; i<len; i++) {
81 		if (strcmp(words[i], other.words[i]) != 0)
82 			return true;
83 	}
84 	return false;
85 }
86 
Length() const87 int WordList::Length() const {
88 	return len;
89 }
90 
Clear()91 void WordList::Clear() {
92 	if (words) {
93 		delete []list;
94 		delete []words;
95 	}
96 	words = 0;
97 	list = 0;
98 	len = 0;
99 }
100 
101 #ifdef _MSC_VER
102 
cmpWords(const char * a,const char * b)103 static bool cmpWords(const char *a, const char *b) {
104 	return strcmp(a, b) < 0;
105 }
106 
107 #else
108 
cmpWords(const void * a,const void * b)109 static int cmpWords(const void *a, const void *b) {
110 	return strcmp(*static_cast<const char * const *>(a), *static_cast<const char * const *>(b));
111 }
112 
SortWordList(char ** words,unsigned int len)113 static void SortWordList(char **words, unsigned int len) {
114 	qsort(words, len, sizeof(*words), cmpWords);
115 }
116 
117 #endif
118 
Set(const char * s)119 bool WordList::Set(const char *s) {
120 	const size_t lenS = strlen(s) + 1;
121 	char *listTemp = new char[lenS];
122 	memcpy(listTemp, s, lenS);
123 	int lenTemp = 0;
124 	char **wordsTemp = ArrayFromWordList(listTemp, lenS - 1, &lenTemp, onlyLineEnds);
125 #ifdef _MSC_VER
126 	std::sort(wordsTemp, wordsTemp + lenTemp, cmpWords);
127 #else
128 	SortWordList(wordsTemp, lenTemp);
129 #endif
130 
131 	if (lenTemp == len) {
132 		bool changed = false;
133 		for (int i = 0; i < lenTemp; i++) {
134 			if (strcmp(words[i], wordsTemp[i]) != 0) {
135 				changed = true;
136 				break;
137 			}
138 		}
139 		if (!changed) {
140 			delete []listTemp;
141 			delete []wordsTemp;
142 			return false;
143 		}
144 	}
145 
146 	Clear();
147 	words = wordsTemp;
148 	list = listTemp;
149 	len = lenTemp;
150 	std::fill(starts, std::end(starts), -1);
151 	for (int l = len - 1; l >= 0; l--) {
152 		unsigned char indexChar = words[l][0];
153 		starts[indexChar] = l;
154 	}
155 	return true;
156 }
157 
158 /** Check whether a string is in the list.
159  * List elements are either exact matches or prefixes.
160  * Prefix elements start with '^' and match all strings that start with the rest of the element
161  * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'.
162  */
InList(const char * s) const163 bool WordList::InList(const char *s) const {
164 	if (0 == words)
165 		return false;
166 	const unsigned char firstChar = s[0];
167 	int j = starts[firstChar];
168 	if (j >= 0) {
169 		while (words[j][0] == firstChar) {
170 			if (s[1] == words[j][1]) {
171 				const char *a = words[j] + 1;
172 				const char *b = s + 1;
173 				while (*a && *a == *b) {
174 					a++;
175 					b++;
176 				}
177 				if (!*a && !*b)
178 					return true;
179 			}
180 			j++;
181 		}
182 	}
183 	j = starts[static_cast<unsigned int>('^')];
184 	if (j >= 0) {
185 		while (words[j][0] == '^') {
186 			const char *a = words[j] + 1;
187 			const char *b = s;
188 			while (*a && *a == *b) {
189 				a++;
190 				b++;
191 			}
192 			if (!*a)
193 				return true;
194 			j++;
195 		}
196 	}
197 	return false;
198 }
199 
200 /** similar to InList, but word s can be a substring of keyword.
201  * eg. the keyword define is defined as def~ine. This means the word must start
202  * with def to be a keyword, but also defi, defin and define are valid.
203  * The marker is ~ in this case.
204  */
InListAbbreviated(const char * s,const char marker) const205 bool WordList::InListAbbreviated(const char *s, const char marker) const {
206 	if (0 == words)
207 		return false;
208 	const unsigned char firstChar = s[0];
209 	int j = starts[firstChar];
210 	if (j >= 0) {
211 		while (words[j][0] == firstChar) {
212 			bool isSubword = false;
213 			int start = 1;
214 			if (words[j][1] == marker) {
215 				isSubword = true;
216 				start++;
217 			}
218 			if (s[1] == words[j][start]) {
219 				const char *a = words[j] + start;
220 				const char *b = s + 1;
221 				while (*a && *a == *b) {
222 					a++;
223 					if (*a == marker) {
224 						isSubword = true;
225 						a++;
226 					}
227 					b++;
228 				}
229 				if ((!*a || isSubword) && !*b)
230 					return true;
231 			}
232 			j++;
233 		}
234 	}
235 	j = starts[static_cast<unsigned int>('^')];
236 	if (j >= 0) {
237 		while (words[j][0] == '^') {
238 			const char *a = words[j] + 1;
239 			const char *b = s;
240 			while (*a && *a == *b) {
241 				a++;
242 				b++;
243 			}
244 			if (!*a)
245 				return true;
246 			j++;
247 		}
248 	}
249 	return false;
250 }
251 
252 /** similar to InListAbbreviated, but word s can be a abridged version of a keyword.
253 * eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of
254 * "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid.
255 * Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid.
256 * The marker is ~ in this case.
257 * No multiple markers check is done and wont work.
258 */
InListAbridged(const char * s,const char marker) const259 bool WordList::InListAbridged(const char *s, const char marker) const {
260 	if (0 == words)
261 		return false;
262 	const unsigned char firstChar = s[0];
263 	int j = starts[firstChar];
264 	if (j >= 0) {
265 		while (words[j][0] == firstChar) {
266 			const char *a = words[j];
267 			const char *b = s;
268 			while (*a && *a == *b) {
269 				a++;
270 				if (*a == marker) {
271 					a++;
272 					const size_t suffixLengthA = strlen(a);
273 					const size_t suffixLengthB = strlen(b);
274 					if (suffixLengthA >= suffixLengthB)
275 						break;
276 					b = b + suffixLengthB - suffixLengthA - 1;
277 				}
278 				b++;
279 			}
280 			if (!*a  && !*b)
281 				return true;
282 			j++;
283 		}
284 	}
285 
286 	j = starts[static_cast<unsigned int>(marker)];
287 	if (j >= 0) {
288 		while (words[j][0] == marker) {
289 			const char *a = words[j] + 1;
290 			const char *b = s;
291 			const size_t suffixLengthA = strlen(a);
292 			const size_t suffixLengthB = strlen(b);
293 			if (suffixLengthA > suffixLengthB) {
294 				j++;
295 				continue;
296 			}
297 			b = b + suffixLengthB - suffixLengthA;
298 
299 			while (*a && *a == *b) {
300 				a++;
301 				b++;
302 			}
303 			if (!*a && !*b)
304 				return true;
305 			j++;
306 		}
307 	}
308 
309 	return false;
310 }
311 
WordAt(int n) const312 const char *WordList::WordAt(int n) const {
313 	return words[n];
314 }
315 
316