xref: /freebsd/usr.bin/sort/vsort.c (revision 5e3934b1)
1c66bbc91SGabor Kovesdan /*-
24d846d26SWarner Losh  * SPDX-License-Identifier: BSD-2-Clause
31de7b4b8SPedro F. Giffuni  *
4c859c6ddSGabor Kovesdan  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5c66bbc91SGabor Kovesdan  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6c66bbc91SGabor Kovesdan  * All rights reserved.
7c66bbc91SGabor Kovesdan  *
8c66bbc91SGabor Kovesdan  * Redistribution and use in source and binary forms, with or without
9c66bbc91SGabor Kovesdan  * modification, are permitted provided that the following conditions
10c66bbc91SGabor Kovesdan  * are met:
11c66bbc91SGabor Kovesdan  * 1. Redistributions of source code must retain the above copyright
12c66bbc91SGabor Kovesdan  *    notice, this list of conditions and the following disclaimer.
13c66bbc91SGabor Kovesdan  * 2. Redistributions in binary form must reproduce the above copyright
14c66bbc91SGabor Kovesdan  *    notice, this list of conditions and the following disclaimer in the
15c66bbc91SGabor Kovesdan  *    documentation and/or other materials provided with the distribution.
16c66bbc91SGabor Kovesdan  *
17c66bbc91SGabor Kovesdan  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18c66bbc91SGabor Kovesdan  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19c66bbc91SGabor Kovesdan  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20c66bbc91SGabor Kovesdan  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21c66bbc91SGabor Kovesdan  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22c66bbc91SGabor Kovesdan  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23c66bbc91SGabor Kovesdan  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24c66bbc91SGabor Kovesdan  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25c66bbc91SGabor Kovesdan  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26c66bbc91SGabor Kovesdan  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27c66bbc91SGabor Kovesdan  * SUCH DAMAGE.
28c66bbc91SGabor Kovesdan  */
29c66bbc91SGabor Kovesdan 
30c66bbc91SGabor Kovesdan #include <sys/types.h>
31c66bbc91SGabor Kovesdan 
32c66bbc91SGabor Kovesdan #include <ctype.h>
33c66bbc91SGabor Kovesdan #include <stdlib.h>
34c66bbc91SGabor Kovesdan #include <string.h>
35c66bbc91SGabor Kovesdan 
36c66bbc91SGabor Kovesdan #include "sort.h"
37c66bbc91SGabor Kovesdan #include "vsort.h"
38c66bbc91SGabor Kovesdan 
39c66bbc91SGabor Kovesdan static inline bool
isdigit_clocale(wchar_t c)40c66bbc91SGabor Kovesdan isdigit_clocale(wchar_t c)
41c66bbc91SGabor Kovesdan {
42e5f71a07SPedro F. Giffuni 
43c66bbc91SGabor Kovesdan 	return (c >= L'0' && c <= L'9');
44c66bbc91SGabor Kovesdan }
45c66bbc91SGabor Kovesdan 
46c66bbc91SGabor Kovesdan static inline bool
isalpha_clocale(wchar_t c)47c66bbc91SGabor Kovesdan isalpha_clocale(wchar_t c)
48c66bbc91SGabor Kovesdan {
49e5f71a07SPedro F. Giffuni 
50c66bbc91SGabor Kovesdan 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
51c66bbc91SGabor Kovesdan }
52c66bbc91SGabor Kovesdan 
53c66bbc91SGabor Kovesdan static inline bool
isalnum_clocale(wchar_t c)54c66bbc91SGabor Kovesdan isalnum_clocale(wchar_t c)
55c66bbc91SGabor Kovesdan {
56e5f71a07SPedro F. Giffuni 
57c66bbc91SGabor Kovesdan 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
58c66bbc91SGabor Kovesdan 	    (c >= L'0' && c <= L'9'));
59c66bbc91SGabor Kovesdan }
60c66bbc91SGabor Kovesdan 
61c66bbc91SGabor Kovesdan /*
62c66bbc91SGabor Kovesdan  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
63c66bbc91SGabor Kovesdan  * Set length of string before suffix.
64c66bbc91SGabor Kovesdan  */
65c66bbc91SGabor Kovesdan static void
find_suffix(bwstring_iterator si,bwstring_iterator se,size_t * len)66c66bbc91SGabor Kovesdan find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
67c66bbc91SGabor Kovesdan {
68c66bbc91SGabor Kovesdan 	wchar_t c;
69c66bbc91SGabor Kovesdan 	size_t clen;
70c66bbc91SGabor Kovesdan 	bool expect_alpha, sfx;
71c66bbc91SGabor Kovesdan 
72c66bbc91SGabor Kovesdan 	sfx = false;
73c66bbc91SGabor Kovesdan 	expect_alpha = false;
74c66bbc91SGabor Kovesdan 	*len = 0;
75c66bbc91SGabor Kovesdan 	clen = 0;
76c66bbc91SGabor Kovesdan 
77c66bbc91SGabor Kovesdan 	while ((si < se) && (c = bws_get_iter_value(si))) {
78c66bbc91SGabor Kovesdan 		if (expect_alpha) {
79c66bbc91SGabor Kovesdan 			expect_alpha = false;
80c66bbc91SGabor Kovesdan 			if (!isalpha_clocale(c) && (c != L'~'))
81c66bbc91SGabor Kovesdan 				sfx = false;
82c66bbc91SGabor Kovesdan 		} else if (c == L'.') {
83c66bbc91SGabor Kovesdan 			expect_alpha = true;
84c66bbc91SGabor Kovesdan 			if (!sfx) {
85c66bbc91SGabor Kovesdan 				sfx = true;
86c66bbc91SGabor Kovesdan 				*len = clen;
87c66bbc91SGabor Kovesdan 			}
88c66bbc91SGabor Kovesdan 		} else if (!isalnum_clocale(c) && (c != L'~'))
89c66bbc91SGabor Kovesdan 			sfx = false;
90c66bbc91SGabor Kovesdan 
91c66bbc91SGabor Kovesdan 		si = bws_iterator_inc(si, 1);
92c66bbc91SGabor Kovesdan 		++clen;
93c66bbc91SGabor Kovesdan 	}
94c66bbc91SGabor Kovesdan 
95c66bbc91SGabor Kovesdan 	/* This code must be here to make the implementation compatible
96c66bbc91SGabor Kovesdan 	 * with WORDING of GNU sort documentation.
97c66bbc91SGabor Kovesdan 	 * But the GNU sort implementation is not following its own
98c66bbc91SGabor Kovesdan 	 * documentation.  GNU sort allows empty file extensions
99c66bbc91SGabor Kovesdan 	 * (just dot with nothing after); but the regular expression in
100c66bbc91SGabor Kovesdan 	 * their documentation does not allow empty file extensions.
101c66bbc91SGabor Kovesdan 	 * We chose to make our implementation compatible with GNU sort
102c66bbc91SGabor Kovesdan 	 * implementation.  If they will ever fix their bug, this code
103c66bbc91SGabor Kovesdan 	 * must be uncommented. Or they may choose to fix the info page,
104c66bbc91SGabor Kovesdan 	 * then the code stays commented.
105c66bbc91SGabor Kovesdan 	 *
106c66bbc91SGabor Kovesdan 	 if (expect_alpha)
107c66bbc91SGabor Kovesdan 	 	sfx = false;
108c66bbc91SGabor Kovesdan 	 */
109c66bbc91SGabor Kovesdan 
110c66bbc91SGabor Kovesdan 	if (!sfx)
111c66bbc91SGabor Kovesdan 		*len = clen;
112c66bbc91SGabor Kovesdan }
113c66bbc91SGabor Kovesdan 
114c66bbc91SGabor Kovesdan static inline int
cmp_chars(wchar_t c1,wchar_t c2)115c66bbc91SGabor Kovesdan cmp_chars(wchar_t c1, wchar_t c2)
116c66bbc91SGabor Kovesdan {
117e5f71a07SPedro F. Giffuni 
118c66bbc91SGabor Kovesdan 	if (c1 == c2)
119c66bbc91SGabor Kovesdan 		return (0);
120c66bbc91SGabor Kovesdan 
121c66bbc91SGabor Kovesdan 	if (c1 == L'~')
122c66bbc91SGabor Kovesdan 		return (-1);
123c66bbc91SGabor Kovesdan 	if (c2 == L'~')
124c66bbc91SGabor Kovesdan 		return (+1);
125c66bbc91SGabor Kovesdan 
126c66bbc91SGabor Kovesdan 	if (isdigit_clocale(c1) || !c1)
127c66bbc91SGabor Kovesdan 		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
128c66bbc91SGabor Kovesdan 
129c66bbc91SGabor Kovesdan 	if (isdigit_clocale(c2) || !c2)
130c66bbc91SGabor Kovesdan 		return (+1);
131c66bbc91SGabor Kovesdan 
132c66bbc91SGabor Kovesdan 	if (isalpha_clocale(c1))
133c66bbc91SGabor Kovesdan 		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
134c66bbc91SGabor Kovesdan 
135c66bbc91SGabor Kovesdan 	if (isalpha_clocale(c2))
136c66bbc91SGabor Kovesdan 		return (+1);
137c66bbc91SGabor Kovesdan 
138c66bbc91SGabor Kovesdan 	return ((int) c1 - (int) c2);
139c66bbc91SGabor Kovesdan }
140c66bbc91SGabor Kovesdan 
141c66bbc91SGabor Kovesdan static int
cmpversions(bwstring_iterator si1,bwstring_iterator se1,bwstring_iterator si2,bwstring_iterator se2)142c66bbc91SGabor Kovesdan cmpversions(bwstring_iterator si1, bwstring_iterator se1,
143c66bbc91SGabor Kovesdan     bwstring_iterator si2, bwstring_iterator se2)
144c66bbc91SGabor Kovesdan {
145c66bbc91SGabor Kovesdan 	int cmp, diff;
146c66bbc91SGabor Kovesdan 
147c66bbc91SGabor Kovesdan 	while ((si1 < se1) || (si2 < se2)) {
148c66bbc91SGabor Kovesdan 		diff = 0;
149c66bbc91SGabor Kovesdan 
150c66bbc91SGabor Kovesdan 		while (((si1 < se1) &&
151c66bbc91SGabor Kovesdan 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
152c66bbc91SGabor Kovesdan 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
153c66bbc91SGabor Kovesdan 			wchar_t c1, c2;
154c66bbc91SGabor Kovesdan 
155c66bbc91SGabor Kovesdan 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
156c66bbc91SGabor Kovesdan 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
157c66bbc91SGabor Kovesdan 
158c66bbc91SGabor Kovesdan 			cmp = cmp_chars(c1, c2);
159c66bbc91SGabor Kovesdan 			if (cmp)
160c66bbc91SGabor Kovesdan 				return (cmp);
161c66bbc91SGabor Kovesdan 
162c66bbc91SGabor Kovesdan 			if (si1 < se1)
163c66bbc91SGabor Kovesdan 				si1 = bws_iterator_inc(si1, 1);
164c66bbc91SGabor Kovesdan 			if (si2 < se2)
165c66bbc91SGabor Kovesdan 				si2 = bws_iterator_inc(si2, 1);
166c66bbc91SGabor Kovesdan 		}
167c66bbc91SGabor Kovesdan 
168c66bbc91SGabor Kovesdan 		while (bws_get_iter_value(si1) == L'0')
169c66bbc91SGabor Kovesdan 			si1 = bws_iterator_inc(si1, 1);
170c66bbc91SGabor Kovesdan 
171c66bbc91SGabor Kovesdan 		while (bws_get_iter_value(si2) == L'0')
172c66bbc91SGabor Kovesdan 			si2 = bws_iterator_inc(si2, 1);
173c66bbc91SGabor Kovesdan 
174c66bbc91SGabor Kovesdan 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
175c66bbc91SGabor Kovesdan 		    isdigit_clocale(bws_get_iter_value(si2))) {
176c66bbc91SGabor Kovesdan 			if (!diff)
177c66bbc91SGabor Kovesdan 				diff = ((int)bws_get_iter_value(si1) -
178c66bbc91SGabor Kovesdan 				    (int)bws_get_iter_value(si2));
179c66bbc91SGabor Kovesdan 			si1 = bws_iterator_inc(si1, 1);
180c66bbc91SGabor Kovesdan 			si2 = bws_iterator_inc(si2, 1);
181c66bbc91SGabor Kovesdan 		}
182c66bbc91SGabor Kovesdan 
183c66bbc91SGabor Kovesdan 		if (isdigit_clocale(bws_get_iter_value(si1)))
184c66bbc91SGabor Kovesdan 			return (1);
185c66bbc91SGabor Kovesdan 
186c66bbc91SGabor Kovesdan 		if (isdigit_clocale(bws_get_iter_value(si2)))
187c66bbc91SGabor Kovesdan 			return (-1);
188c66bbc91SGabor Kovesdan 
189c66bbc91SGabor Kovesdan 		if (diff)
190c66bbc91SGabor Kovesdan 			return (diff);
191c66bbc91SGabor Kovesdan 	}
192c66bbc91SGabor Kovesdan 
193c66bbc91SGabor Kovesdan 	return (0);
194c66bbc91SGabor Kovesdan }
195c66bbc91SGabor Kovesdan 
196c66bbc91SGabor Kovesdan /*
197c66bbc91SGabor Kovesdan  * Compare two version strings
198c66bbc91SGabor Kovesdan  */
199c66bbc91SGabor Kovesdan int
vcmp(struct bwstring * s1,struct bwstring * s2)200c66bbc91SGabor Kovesdan vcmp(struct bwstring *s1, struct bwstring *s2)
201c66bbc91SGabor Kovesdan {
202c66bbc91SGabor Kovesdan 	bwstring_iterator si1, si2;
203c66bbc91SGabor Kovesdan 	wchar_t c1, c2;
204c66bbc91SGabor Kovesdan 	size_t len1, len2, slen1, slen2;
205c66bbc91SGabor Kovesdan 	int cmp_bytes, cmp_res;
206c66bbc91SGabor Kovesdan 
207c66bbc91SGabor Kovesdan 	if (s1 == s2)
208c66bbc91SGabor Kovesdan 		return (0);
209c66bbc91SGabor Kovesdan 
210c66bbc91SGabor Kovesdan 	cmp_bytes = bwscmp(s1, s2, 0);
211c66bbc91SGabor Kovesdan 	if (cmp_bytes == 0)
212c66bbc91SGabor Kovesdan 		return (0);
213c66bbc91SGabor Kovesdan 
214c66bbc91SGabor Kovesdan 	len1 = slen1 = BWSLEN(s1);
215c66bbc91SGabor Kovesdan 	len2 = slen2 = BWSLEN(s2);
216c66bbc91SGabor Kovesdan 
217c66bbc91SGabor Kovesdan 	if (slen1 < 1)
218c66bbc91SGabor Kovesdan 		return (-1);
219c66bbc91SGabor Kovesdan 	if (slen2 < 1)
220c66bbc91SGabor Kovesdan 		return (+1);
221c66bbc91SGabor Kovesdan 
222c66bbc91SGabor Kovesdan 	si1 = bws_begin(s1);
223c66bbc91SGabor Kovesdan 	si2 = bws_begin(s2);
224c66bbc91SGabor Kovesdan 
225c66bbc91SGabor Kovesdan 	c1 = bws_get_iter_value(si1);
226c66bbc91SGabor Kovesdan 	c2 = bws_get_iter_value(si2);
227c66bbc91SGabor Kovesdan 
228c66bbc91SGabor Kovesdan 	if (c1 == L'.' && (slen1 == 1))
229c66bbc91SGabor Kovesdan 		return (-1);
230c66bbc91SGabor Kovesdan 
231c66bbc91SGabor Kovesdan 	if (c2 == L'.' && (slen2 == 1))
232c66bbc91SGabor Kovesdan 		return (+1);
233c66bbc91SGabor Kovesdan 
234c66bbc91SGabor Kovesdan 	if (slen1 == 2 && c1 == L'.' &&
235c66bbc91SGabor Kovesdan 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
236c66bbc91SGabor Kovesdan 		return (-1);
237c66bbc91SGabor Kovesdan 	if (slen2 == 2 && c2 == L'.' &&
238c66bbc91SGabor Kovesdan 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
239c66bbc91SGabor Kovesdan 		return (+1);
240c66bbc91SGabor Kovesdan 
241c66bbc91SGabor Kovesdan 	if (c1 == L'.' && c2 != L'.')
242c66bbc91SGabor Kovesdan 		return (-1);
243c66bbc91SGabor Kovesdan 	if (c1 != L'.' && c2 == L'.')
244c66bbc91SGabor Kovesdan 		return (+1);
245c66bbc91SGabor Kovesdan 
246c66bbc91SGabor Kovesdan 	if (c1 == L'.' && c2 == L'.') {
247c66bbc91SGabor Kovesdan 		si1 = bws_iterator_inc(si1, 1);
248c66bbc91SGabor Kovesdan 		si2 = bws_iterator_inc(si2, 1);
249c66bbc91SGabor Kovesdan 	}
250c66bbc91SGabor Kovesdan 
251c66bbc91SGabor Kovesdan 	find_suffix(si1, bws_end(s1), &len1);
252c66bbc91SGabor Kovesdan 	find_suffix(si2, bws_end(s2), &len2);
253c66bbc91SGabor Kovesdan 
254c66bbc91SGabor Kovesdan 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
255c66bbc91SGabor Kovesdan 		return (cmp_bytes);
256c66bbc91SGabor Kovesdan 
257c66bbc91SGabor Kovesdan 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
258c66bbc91SGabor Kovesdan 	    bws_iterator_inc(si2, len2));
259c66bbc91SGabor Kovesdan 
260c66bbc91SGabor Kovesdan 	if (cmp_res == 0)
261c66bbc91SGabor Kovesdan 	  cmp_res = cmp_bytes;
262c66bbc91SGabor Kovesdan 
263c66bbc91SGabor Kovesdan 	return (cmp_res);
264c66bbc91SGabor Kovesdan }
265