xref: /openbsd/usr.bin/sort/vsort.c (revision 5bf6c2a6)
1 /*	$OpenBSD: vsort.c,v 1.2 2015/04/01 21:46:38 millert Exp $	*/
2 
3 /*-
4  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/types.h>
31 
32 #include <ctype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 
36 #include "sort.h"
37 #include "vsort.h"
38 
39 static inline bool
isdigit_clocale(wchar_t c)40 isdigit_clocale(wchar_t c)
41 {
42 	return (c >= L'0' && c <= L'9');
43 }
44 
45 static inline bool
isalpha_clocale(wchar_t c)46 isalpha_clocale(wchar_t c)
47 {
48 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
49 }
50 
51 static inline bool
isalnum_clocale(wchar_t c)52 isalnum_clocale(wchar_t c)
53 {
54 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
55 	    (c >= L'0' && c <= L'9'));
56 }
57 
58 /*
59  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
60  * Set length of string before suffix.
61  */
62 static void
find_suffix(bwstring_iterator si,bwstring_iterator se,size_t * len)63 find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
64 {
65 	wchar_t c;
66 	size_t clen;
67 	bool expect_alpha, sfx;
68 
69 	sfx = false;
70 	expect_alpha = false;
71 	*len = 0;
72 	clen = 0;
73 
74 	while ((si < se) && (c = bws_get_iter_value(si))) {
75 		if (expect_alpha) {
76 			expect_alpha = false;
77 			if (!isalpha_clocale(c) && (c != L'~'))
78 				sfx = false;
79 		} else if (c == L'.') {
80 			expect_alpha = true;
81 			if (!sfx) {
82 				sfx = true;
83 				*len = clen;
84 			}
85 		} else if (!isalnum_clocale(c) && (c != L'~'))
86 			sfx = false;
87 
88 		si = bws_iterator_inc(si, 1);
89 		++clen;
90 	}
91 
92 	/* This code must be here to make the implementation compatible
93 	 * with WORDING of GNU sort documentation.
94 	 * But the GNU sort implementation is not following its own
95 	 * documentation.  GNU sort allows empty file extensions
96 	 * (just dot with nothing after); but the regular expression in
97 	 * their documentation does not allow empty file extensions.
98 	 * We chose to make our implementation compatible with GNU sort
99 	 * implementation.  If they will ever fix their bug, this code
100 	 * must be uncommented. Or they may choose to fix the info page,
101 	 * then the code stays commented.
102 	 *
103 	 if (expect_alpha)
104 	 	sfx = false;
105 	 */
106 
107 	if (!sfx)
108 		*len = clen;
109 }
110 
111 static inline int
cmp_chars(wchar_t c1,wchar_t c2)112 cmp_chars(wchar_t c1, wchar_t c2)
113 {
114 	if (c1 == c2)
115 		return 0;
116 
117 	if (c1 == L'~')
118 		return -1;
119 	if (c2 == L'~')
120 		return 1;
121 
122 	if (isdigit_clocale(c1) || !c1)
123 		return (isdigit_clocale(c2) || !c2) ? 0 : -1;
124 
125 	if (isdigit_clocale(c2) || !c2)
126 		return 1;
127 
128 	if (isalpha_clocale(c1))
129 		return isalpha_clocale(c2) ? (int)c1 - (int)c2 : -1;
130 
131 	if (isalpha_clocale(c2))
132 		return 1;
133 
134 	return (int)c1 - (int)c2;
135 }
136 
137 static int
cmpversions(bwstring_iterator si1,bwstring_iterator se1,bwstring_iterator si2,bwstring_iterator se2)138 cmpversions(bwstring_iterator si1, bwstring_iterator se1,
139     bwstring_iterator si2, bwstring_iterator se2)
140 {
141 	int cmp, diff;
142 
143 	while ((si1 < se1) || (si2 < se2)) {
144 		diff = 0;
145 
146 		while (((si1 < se1) &&
147 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
148 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
149 			wchar_t c1, c2;
150 
151 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
152 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
153 
154 			cmp = cmp_chars(c1, c2);
155 			if (cmp)
156 				return cmp;
157 
158 			if (si1 < se1)
159 				si1 = bws_iterator_inc(si1, 1);
160 			if (si2 < se2)
161 				si2 = bws_iterator_inc(si2, 1);
162 		}
163 
164 		while (bws_get_iter_value(si1) == L'0')
165 			si1 = bws_iterator_inc(si1, 1);
166 
167 		while (bws_get_iter_value(si2) == L'0')
168 			si2 = bws_iterator_inc(si2, 1);
169 
170 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
171 		    isdigit_clocale(bws_get_iter_value(si2))) {
172 			if (!diff)
173 				diff = ((int)bws_get_iter_value(si1) -
174 				    (int)bws_get_iter_value(si2));
175 			si1 = bws_iterator_inc(si1, 1);
176 			si2 = bws_iterator_inc(si2, 1);
177 		}
178 
179 		if (isdigit_clocale(bws_get_iter_value(si1)))
180 			return 1;
181 
182 		if (isdigit_clocale(bws_get_iter_value(si2)))
183 			return -1;
184 
185 		if (diff)
186 			return diff;
187 	}
188 
189 	return 0;
190 }
191 
192 /*
193  * Compare two version strings
194  */
195 int
vcmp(struct bwstring * s1,struct bwstring * s2)196 vcmp(struct bwstring *s1, struct bwstring *s2)
197 {
198 	bwstring_iterator si1, si2;
199 	wchar_t c1, c2;
200 	size_t len1, len2, slen1, slen2;
201 	int cmp_bytes, cmp_res;
202 
203 	if (s1 == s2)
204 		return 0;
205 
206 	cmp_bytes = bwscmp(s1, s2, 0);
207 	if (cmp_bytes == 0)
208 		return 0;
209 
210 	len1 = slen1 = BWSLEN(s1);
211 	len2 = slen2 = BWSLEN(s2);
212 
213 	if (slen1 < 1)
214 		return -1;
215 	if (slen2 < 1)
216 		return 1;
217 
218 	si1 = bws_begin(s1);
219 	si2 = bws_begin(s2);
220 
221 	c1 = bws_get_iter_value(si1);
222 	c2 = bws_get_iter_value(si2);
223 
224 	if (c1 == L'.' && (slen1 == 1))
225 		return -1;
226 
227 	if (c2 == L'.' && (slen2 == 1))
228 		return 1;
229 
230 	if (slen1 == 2 && c1 == L'.' &&
231 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
232 		return -1;
233 	if (slen2 == 2 && c2 == L'.' &&
234 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
235 		return 1;
236 
237 	if (c1 == L'.' && c2 != L'.')
238 		return -1;
239 	if (c1 != L'.' && c2 == L'.')
240 		return 1;
241 
242 	if (c1 == L'.' && c2 == L'.') {
243 		si1 = bws_iterator_inc(si1, 1);
244 		si2 = bws_iterator_inc(si2, 1);
245 	}
246 
247 	find_suffix(si1, bws_end(s1), &len1);
248 	find_suffix(si2, bws_end(s2), &len2);
249 
250 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
251 		return cmp_bytes;
252 
253 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
254 	    bws_iterator_inc(si2, len2));
255 
256 	if (cmp_res == 0)
257 	  cmp_res = cmp_bytes;
258 
259 	return cmp_res;
260 }
261