1 /*
2 * Copyright 2006-2008 The FLWOR Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #pragma once
17 #ifndef ZORBA_UTF8_UTIL_H
18 #define ZORBA_UTF8_UTIL_H
19
#include <algorithm>
#include <cassert>
#include <cstring>
#include <cwchar>
#include <iterator>
#include <string>

#include "ascii_util.h"
#include "cxx_util.h"
#include "string_util.h"
#include "unicode_util.h"
#include "utf8_string.h"
#include "utf8_util_base.h"

#include "zorbatypes/collation_manager.h"
#include "zorbautils/hashfun.h"

#ifdef ZORBA_NO_ICU
# include "diagnostics/assert.h"
#else
# include <unicode/coll.h>
# include <unicode/sortkey.h>
#endif /* ZORBA_NO_ICU */
40
41 namespace zorba {
42 namespace utf8 {
43
44 using ascii::begins_with;
45 using ascii::ends_with;
46 using ascii::is_whitespace;
47 using ascii::normalize_whitespace;
48 using ascii::trim_whitespace;
49
50 ////////// begins/ends_with ///////////////////////////////////////////////////
51
52 /**
53 * Checks whether a string begins with a given prefix.
54 *
55 * @tparam StringType The string type.
56 * @param s The string to check.
57 * @param c The prefix character.
58 * @param n The number of bytes to compare.
59 * @return Returns \c true only if \a s begins with \a c.
60 */
61 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char c)62 bool begins_with( utf8_string<StringType> const &s, char c ) {
63 return ascii::begins_with( *s.get(), c );
64 }
65
66 /**
67 * Checks whether a string begins with a given prefix.
68 *
69 * @tparam StringType The string type.
70 * @param s The string to check.
71 * @param ps The prefix string.
72 * @param n The number of bytes to compare.
73 * @return Returns \c true only if \a s begins with \a ps.
74 */
75 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char const * ps,typename StringType::size_type n)76 bool begins_with( utf8_string<StringType> const &s, char const *ps,
77 typename StringType::size_type n ) {
78 return ascii::begins_with( *s.get(), ps, n );
79 }
80
81 /**
82 * Checks whether a string begins with a given prefix.
83 *
84 * @tparam StringType The string type.
85 * @param s The string to check.
86 * @param ps The prefix string.
87 * @return Returns \c true only if \a s begins with \a ps.
88 */
89 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char const * ps)90 bool begins_with( utf8_string<StringType> const &s, char const *ps ) {
91 return ascii::begins_with( *s.get(), ps );
92 }
93
94 /**
95 * Checks whether a string begins with a given prefix.
96 *
97 * @tparam StringType The string type.
98 * @tparam PrefixStringType The suffix string type.
99 * @param s The string to check.
100 * @param ps The suffix string.
101 * @return Returns \c true only if \a s ends with \a ps.
102 */
103 template<class StringType,class PrefixStringType> inline
begins_with(utf8_string<StringType> const & s,PrefixStringType const & ps)104 bool begins_with( utf8_string<StringType> const &s,
105 PrefixStringType const &ps ) {
106 return ascii::begins_with( *s.get(), ps.data(), ps.size() );
107 }
108
109 /**
110 * Checks whether a string begins with a given prefix.
111 *
112 * @tparam StringType The string type.
113 * @tparam PrefixStringType The suffix string type.
114 * @param s The string to check.
115 * @param ps The suffix string.
116 * @return Returns \c true only if \a s ends with \a ps.
117 */
118 template<class StringType,class PrefixStringType> inline
begins_with(StringType const & s,utf8_string<PrefixStringType> const & ps)119 bool begins_with( StringType const &s,
120 utf8_string<PrefixStringType> const &ps ) {
121 return ascii::begins_with( s, *ps.get() );
122 }
123
124 /**
125 * Checks whether a string begins with a given prefix.
126 *
127 * @tparam StringType The string type.
128 * @tparam PrefixStringType The suffix string type.
129 * @param s The string to check.
130 * @param ps The suffix string.
131 * @return Returns \c true only if \a s ends with \a ps.
132 */
133 template<class StringType,class PrefixStringType> inline
begins_with(utf8_string<StringType> const & s,utf8_string<PrefixStringType> const & ps)134 bool begins_with( utf8_string<StringType> const &s,
135 utf8_string<PrefixStringType> const &ps ) {
136 return ascii::begins_with( *s.get(), *ps.get() );
137 }
138
139 /**
140 * Checks whether a string ends with a given suffix.
141 *
142 * @tparam StringType The string type.
143 * @param s The string to check.
144 * @param c The suffix character.
145 * @param n The number of bytes to compare.
146 * @return Returns \c true only if \a s ends with \a c.
147 */
148 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char c)149 bool ends_with( utf8_string<StringType> const &s, char c ) {
150 return ascii::ends_with( *s.get(), c );
151 }
152
153 /**
154 * Checks whether a string ends with a given suffix.
155 *
156 * @tparam StringType The string type.
157 * @param s The string to check.
158 * @param ps The suffix string.
159 * @param n The number of bytes to compare.
160 * @return Returns \c true only if \a s ends with \a ps.
161 */
162 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char const * ps,typename StringType::size_type n)163 bool ends_with( utf8_string<StringType> const &s, char const *ps,
164 typename StringType::size_type n ) {
165 return ascii::ends_with( *s.get(), ps, n );
166 }
167
168 /**
169 * Checks whether a string ends with a given suffix.
170 *
171 * @tparam StringType The string type.
172 * @param s The string to check.
173 * @param ps The suffix string.
174 * @return Returns \c true only if \a s ends with \a ps.
175 */
176 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char const * ps)177 bool ends_with( utf8_string<StringType> const &s, char const *ps ) {
178 return ascii::ends_with( *s.get(), ps );
179 }
180
181 /**
182 * Checks whether a string ends with a given suffix.
183 *
184 * @tparam StringType The string type.
185 * @tparam SuffixStringType The suffix string type.
186 * @param s The string to check.
187 * @param ps The suffix string.
188 * @return Returns \c true only if \a s ends with \a ps.
189 */
190 template<class StringType,class SuffixStringType> inline
ends_with(utf8_string<StringType> const & s,SuffixStringType const & ps)191 bool ends_with( utf8_string<StringType> const &s,
192 SuffixStringType const &ps ) {
193 return ascii::ends_with( *s.get(), ps.data(), ps.size() );
194 }
195
196 /**
197 * Checks whether a string ends with a given suffix.
198 *
199 * @tparam StringType The string type.
200 * @tparam SuffixStringType The suffix string type.
201 * @param s The string to check.
202 * @param ps The suffix string.
203 * @return Returns \c true only if \a s ends with \a ps.
204 */
205 template<class StringType,class SuffixStringType> inline
ends_with(StringType const & s,utf8_string<SuffixStringType> const & ps)206 bool ends_with( StringType const &s,
207 utf8_string<SuffixStringType> const &ps ) {
208 return ascii::ends_with( s, *ps.get() );
209 }
210
211 /**
212 * Checks whether a string ends with a given suffix.
213 *
214 * @tparam StringType The string type.
215 * @tparam SuffixStringType The suffix string type.
216 * @param s The string to check.
217 * @param ps The suffix string.
218 * @return Returns \c true only if \a s ends with \a ps.
219 */
220 template<class StringType,class SuffixStringType> inline
ends_with(utf8_string<StringType> const & s,utf8_string<SuffixStringType> const & ps)221 bool ends_with( utf8_string<StringType> const &s,
222 utf8_string<SuffixStringType> const &ps ) {
223 return ascii::ends_with( *s.get(), *ps.get() );
224 }
225
226 ////////// Case conversion ////////////////////////////////////////////////////
227
/**
 * Converts a string to lower-case, writing the result to a second string.
 *
 * Only the declaration lives here; the definition is not in this header.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string (which must be different from \a in).  Its
 * contents are overwritten.
 */
template<class InputStringType,class OutputStringType>
void to_lower( InputStringType const &in,
               OutputStringType *out );
239
/**
 * Converts a string to lower-case in place.
 *
 * The conversion is done into a temporary string by the two-argument
 * to_lower(), after which the temporary is assigned back to \a s.
 *
 * @tparam StringType The string type.
 * @param s The string to convert; it receives the lower-case result.
 */
template<class StringType>
inline void to_lower( StringType &s ) {
  StringType lowered;
  to_lower( s, &lowered );
  s = lowered;
}
252
/**
 * Converts a string to upper-case, writing the result to a second string.
 *
 * Only the declaration lives here; the definition is not in this header.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string (which must be different from \a in).  Its
 * contents are overwritten.
 */
template<class InputStringType,class OutputStringType>
void to_upper( InputStringType const &in,
               OutputStringType *out );
264
/**
 * Converts a string to upper-case in place.
 *
 * The conversion is done into a temporary string by the two-argument
 * to_upper(), after which the temporary is assigned back to \a s.
 *
 * @tparam StringType The string type.
 * @param s The string to convert; it receives the upper-case result.
 */
template<class StringType>
inline void to_upper( StringType &s ) {
  StringType uppered;
  to_upper( s, &uppered );
  s = uppered;
}
277
278 ////////// Code-point functions ///////////////////////////////////////////////
279
280 /**
281 * Appends a sequence of Unicode code-points to a string.
282 *
283 * @tparam InputIterator The iterator type.
284 * @param i The begining iterator.
285 * @param j The ending iterator.
286 * @param s A pointer to the string to be appended to.
287 */
288 template<class InputIterator,class StringType> inline
append_codepoints(InputIterator i,InputIterator j,StringType * s)289 void append_codepoints( InputIterator i, InputIterator j, StringType *s ) {
290 typename utf8_stringify<StringType>::type u( *s );
291 std::copy( i, j, std::back_inserter( u ) );
292 }
293
294 /**
295 * Converts a string to a sequence of Uncode code-points.
296 *
297 * @tparam StringType The type of string.
298 * @tparam ContainerType The type of STL container to put the codepoint values.
299 * @param s The string to get the codepoints for.
300 * @param c A pointer to the container to put the codepoint values. The
301 * containers contents are overwritten.
302 */
303 template<class StringType,class ContainerType> inline
to_codepoints(StringType const & s,ContainerType * c)304 void to_codepoints( StringType const &s, ContainerType *c ) {
305 typename utf8_stringify<StringType const>::type const u( s );
306 std::copy( u.begin(), u.end(), std::back_inserter( *c ) );
307 }
308
309 ////////// Encoding conversion ////////////////////////////////////////////////
310
311 #ifndef ZORBA_NO_ICU
312
313 /**
314 * Converts a unicode::char_type array into a UTF-8 encoded string.
315 *
316 * @param in The Unicode characters to convert.
317 * @param in_len The number of unicode characters (not bytes) to convert.
318 * @param out A pointer to a pointer to the starting location for the result.
319 * It is the caller's responsibility to deallocate this.
320 * @param out_len If not \c nullptr, the number of bytes (not characters) of
321 * the UTF-8 string are put here.
322 * @return Returns \c true only if the conversion succeeded.
323 */
324 bool to_string( unicode::char_type const *in, unicode::size_type in_len,
325 storage_type **out, size_type *out_len = nullptr );
326
327 /**
328 * Converts a unicode::char_type array into a UTF-8 encoded string.
329 *
330 * @param in The Unicode characters to convert.
331 * @param out A pointer to a pointer to the starting location for the result.
332 * It is the caller's responsibility to deallocate this.
333 * @param out_len If not \c nullptr, the number of bytes (not characters) of
334 * the UTF-8 string are put here.
335 * @return Returns \c true only if the conversion succeeded.
336 */
337 inline bool to_string( unicode::char_type const *in, storage_type **out,
338 size_type *out_len = nullptr ) {
339 return to_string( in, u_strlen( in ), out, out_len );
340 }
341
342 /**
343 * Converts a unicode::string into a UTF-8 encoded string.
344 *
345 * @param in The unicode::string to convert.
346 * @param out A pointer to a pointer to the starting location for the result.
347 * It is the caller's responsibility to deallocate this.
348 * @param out_len If not \c nullptr, the number of bytes (not characters) of
349 * the UTF-8 string are put here.
350 * @return Returns \c true only if the conversion succeeded.
351 */
352 inline bool to_string( unicode::string const &in, storage_type **out,
353 size_type *out_len = nullptr ) {
354 return to_string( in.getBuffer(), in.length(), out, out_len );
355 }
356
357 /**
358 * Converts a unicode::char_type array into a UTF-8 encoded string.
359 *
360 * @param in The Unicode characters to convert.
361 * @param in_len The number of unicode characters to convert.
362 * @param out A pointer to the result string.
363 * @return Returns \c true only if the conversion succeeded.
364 */
365 template<class StringType>
366 bool to_string( unicode::char_type const *in, size_type in_len,
367 StringType *out );
368
369 /**
370 * Converts a unicode::char_type array into a UTF-8 encoded string.
371 *
372 * @param in The Unicode characters to convert.
373 * @param out A pointer to the result string.
374 * @return Returns \c true only if the conversion succeeded.
375 */
376 template<class StringType> inline
to_string(unicode::char_type const * in,StringType * out)377 bool to_string( unicode::char_type const *in, StringType *out ) {
378 return to_string( in, u_strlen( in ), out );
379 }
380
381 #endif /* ZORBA_NO_ICU */
382
383 /**
384 * Converts a unicode::string into a UTF-8 encoded string.
385 *
386 * @param in The unicode::string to convert.
387 * @param out A pointer to the result string.
388 * @return Returns \c true only if the conversion succeeded.
389 */
390 template<class StringType> inline
to_string(unicode::string const & in,StringType * out)391 bool to_string( unicode::string const &in, StringType *out ) {
392 #ifndef ZORBA_NO_ICU
393 return to_string( in.getBuffer(), in.length(), out );
394 #else
395 *out = in.c_str();
396 return true;
397 #endif /* ZORBA_NO_ICU */
398 }
399
400 #ifndef ZORBA_NO_ICU
401
402 //
403 // On Windows, UChar == wchar_t, so these functions would multiply define those
404 // previously.
405 //
406 #ifndef WIN32
407
408 /**
409 * Converts a wide-character string into a UTF-8 encoded string.
410 *
411 * @param in The wide-character string to convert.
412 * @param in_len The length of the unicode::string.
413 * @param out A pointer to a pointer to the starting location for the result.
414 * It is the caller's responsibility to deallocate this.
415 * @param out_len If not \c nullptr, the number of bytes (not characters) of
416 * the UTF-8 string are put here.
417 * @return Returns \c true only if the conversion succeeded.
418 */
419 bool to_string( wchar_t const *in, size_type in_len, storage_type **out,
420 size_type *out_len = nullptr );
421
422 /**
423 * Converts a wide-character string into a UTF-8 encoded string.
424 *
425 * @param in The wide-character string to convert.
426 * @param out A pointer to a pointer to the starting location for the result.
427 * It is the caller's responsibility to deallocate this.
428 * @param out_len If not \c nullptr, the number of bytes (not characters) of
429 * the UTF-8 string are put here.
430 * @return Returns \c true only if the conversion succeeded.
431 */
432 inline bool to_string( wchar_t const *in, storage_type **out,
433 size_type *out_len = nullptr ) {
434 return to_string( in, std::wcslen( in ), out, out_len );
435 }
436
437 /**
438 * Converts a wide-character string into a UTF-8 encoded string.
439 *
440 * @tparam StringType The type of the result string.
441 * @param in The wide-character string to convert.
442 * @param in_len The length of the unicode::string.
443 * @param out A pointer to the result string.
444 * @return Returns \c true only if the conversion succeeded.
445 */
446 template<class StringType>
447 bool to_string( wchar_t const *in, size_type in_len, StringType *out );
448
/**
 * Converts a wide-character string into a UTF-8 encoded string.
 *
 * @tparam StringType The type of the result string.
 * @param in The wide-character string to convert.
 * @param out A pointer to the result string.
 * @return Returns \c true only if the conversion succeeded.
 */
template<class StringType>
inline bool to_string( wchar_t const *in, StringType *out ) {
  // The input length is measured with std::wcslen(); the overload taking an
  // explicit length does the conversion.
  return to_string( in, std::wcslen( in ), out );
}
461
462 #endif /* WIN32 */
463
/**
 * Converts a std::wstring into a UTF-8 encoded string.
 *
 * @tparam StringType The type of the result string.
 * @param in The wide-character string to convert.
 * @param out A pointer to the result string.
 * @return Returns \c true only if the conversion succeeded.
 */
template<class StringType>
inline bool to_string( std::wstring const &in, StringType *out ) {
  // Hand the string's data() and size() to the overload that takes a
  // character array and an explicit length.
  return to_string( in.data(), in.size(), out );
}
476
477 /**
478 * Converts a UTF-8 encoded string to a wchar_t array.
479 *
480 * @param in The UTF-8 encoded string to convert.
481 * @param in_len The number of bytes (not characters) of \a in.
482 * @param out A pointer to a pointer to the starting location for the result.
483 * It is the caller's responsibility to deallocate this.
484 * @param out_len If not \c nullptr, the number of characters (not bytes) of
485 * the wchar_t string are put here.
486 * @return Returns \c true only if the conversion succeeded.
487 */
488 bool to_wchar_t( storage_type const *in, size_type in_len, wchar_t **out,
489 unicode::size_type *out_len );
490
491 /**
492 * Converts a UTF-8 encoded string to a wchar_t array.
493 *
494 * @param in The UTF-8 encoded string to convert.
495 * @param out A pointer to a pointer to the starting location for the result.
496 * It is the caller's responsibility to deallocate this.
497 * @param out_len If not \c nullptr, the number of characters (not bytes) of
498 * the \c wchar_t string are put here.
499 * @return Returns \c true only if the conversion succeeded.
500 */
to_wchar_t(storage_type const * in,wchar_t ** out,unicode::size_type * out_len)501 inline bool to_wchar_t( storage_type const *in, wchar_t **out,
502 unicode::size_type *out_len ) {
503 return to_wchar_t( in, std::strlen( in ), out, out_len );
504 }
505
506 /**
507 * Converts a UTF-8 encoded string to a wchar_t array.
508 *
509 * @tparam StringType The string type.
510 * @param in The UTF-8 encoded string to convert.
511 * @param out A pointer to a pointer to the starting location for the result.
512 * It is the caller's responsibility to deallocate this.
513 * @param out_len If not \c nullptr, the number of characters (not bytes) of
514 * the \c wchar_t string are put here.
515 * @return Returns \c true only if the conversion succeeded.
516 */
517 template<class StringType> inline
to_wchar_t(StringType const & in,wchar_t ** out,unicode::size_type * out_len)518 bool to_wchar_t( StringType const &in, wchar_t **out,
519 unicode::size_type *out_len ) {
520 return to_wchar_t( in.data(), in.size(), out, out_len );
521 }
522
523 #endif /* ZORBA_NO_ICU */
524
525 ////////// HTML URI ///////////////////////////////////////////////////////////
526
527 /**
528 * A %back_html_uri_insert_iterator can be used to append characters to a string
529 * escaping all non-printing ASCII characters.
530 *
531 * @tparam StringType The string type.
532 */
533 template<class StringType>
534 class back_html_uri_insert_iterator :
535 public
536 ztd::back_insert_iterator_base<
537 StringType, back_html_uri_insert_iterator<StringType>
538 >
539 {
540 typedef ztd::back_insert_iterator_base<
541 StringType, back_html_uri_insert_iterator<StringType>
542 > base_type;
543 public:
544 typedef typename base_type::container_type container_type;
545 typedef typename StringType::value_type value_type;
546
547 /**
548 * Constructs a %back_html_uri_insert_iterator.
549 *
550 * @param s The string to append to.
551 */
back_html_uri_insert_iterator(StringType & s)552 explicit back_html_uri_insert_iterator( StringType &s ) : base_type( s ) {
553 buf_[0] = '%';
554 }
555
556 back_html_uri_insert_iterator& operator=( value_type c );
557
558 private:
559 char buf_[3]; // %xx -- no need for null at end
560 };
561
562 /**
563 * This is a convenience function to create a back_html_uri_insert_iterator.
564 *
565 * @tparam StringType The string type.
566 * @param out The output string.
567 */
568 template<class StringType> inline back_html_uri_insert_iterator<StringType>
back_html_uri_inserter(StringType & out)569 back_html_uri_inserter( StringType &out ) {
570 return back_html_uri_insert_iterator<StringType>( out );
571 }
572
573 /**
574 * Escapes all non-printable ASCII characters
575 *
576 * @tparam InputStringType The input string type.
577 * @tparam OutputStringType The output String type.
578 * @param in The input string.
579 * @param out The output string (which must be different from \a in).
580 */
581 template<class InputStringType,class OutputStringType> inline
to_html_uri(InputStringType const & in,OutputStringType * out)582 void to_html_uri( InputStringType const &in, OutputStringType *out ) {
583 typename utf8_stringify<InputStringType const>::type const u_in( in );
584 typename utf8_stringify<OutputStringType>::type u_out( *out );
585 std::copy( u_in.begin(), u_in.end(), back_html_uri_inserter( u_out ) );
586 }
587
/**
 * Escapes all non-printable ASCII characters in place.
 *
 * The escaped form is built in a temporary string by the two-argument
 * to_html_uri(), after which the temporary is assigned back to \a s.
 *
 * @tparam StringType The string type.
 * @param s The string; it receives the escaped result.
 */
template<class StringType>
inline void to_html_uri( StringType &s ) {
  StringType escaped;
  to_html_uri( s, &escaped );
  s = escaped;
}
600
601 ////////// IRI ////////////////////////////////////////////////////////////////
602
603 /**
604 * A %back_iri_insert_iterator can be used to append characters to a string
605 * ensuring that illegal characters in an IRI. See RFC 3987.
606 *
607 * @tparam StringType The string type.
608 */
609 template<class StringType>
610 class back_iri_insert_iterator :
611 public
612 ztd::back_insert_iterator_base<
613 StringType, back_iri_insert_iterator<StringType>
614 >
615 {
616 typedef ztd::back_insert_iterator_base<
617 StringType, back_iri_insert_iterator<StringType>
618 > base_type;
619 public:
620 typedef typename base_type::container_type container_type;
621 typedef typename StringType::value_type value_type;
622
623 /**
624 * Constructs a %back_iri_insert_iterator.
625 *
626 * @param s The string to append to.
627 */
back_iri_insert_iterator(StringType & s)628 explicit back_iri_insert_iterator( StringType &s ) : base_type( s ) {
629 buf_[0] = '%';
630 }
631
632 back_iri_insert_iterator& operator=( value_type c );
633
634 private:
635 char buf_[3]; // %xx -- no need for null at end
636 };
637
638 /**
639 * This is a convenience function to create a back_iri_insert_iterator.
640 *
641 * @tparam StringType The string type.
642 * @param out The output string.
643 */
644 template<class StringType> inline back_iri_insert_iterator<StringType>
back_iri_inserter(StringType & out)645 back_iri_inserter( StringType &out ) {
646 return back_iri_insert_iterator<StringType>( out );
647 }
648
649 /**
650 * Converts an IRI to a URI.
651 *
652 * @tparam InputStringType The input string type.
653 * @tparam OutputStringType The output String type.
654 * @param in The input string.
655 * @param out The output string (which must be different from \a in).
656 * See RFC 3987.
657 */
658 template<class InputStringType,class OutputStringType> inline
iri_to_uri(InputStringType const & in,OutputStringType * out)659 void iri_to_uri( InputStringType const &in, OutputStringType *out ) {
660 typename utf8_stringify<InputStringType const>::type const u_in( in );
661 typename utf8_stringify<OutputStringType>::type u_out( *out );
662 std::copy( u_in.begin(), u_in.end(), back_iri_inserter( u_out ) );
663 }
664
/**
 * Converts an IRI to a URI in place.  See RFC 3987.
 *
 * The URI is built in a temporary string by the two-argument iri_to_uri(),
 * after which the temporary is assigned back to \a s.
 *
 * @tparam StringType The string type.
 * @param s The string; it receives the converted result.
 */
template<class StringType>
inline void iri_to_uri( StringType &s ) {
  StringType converted;
  iri_to_uri( s, &converted );
  s = converted;
}
678
679 ////////// Unicode normalization //////////////////////////////////////////////
680
681 #ifndef ZORBA_NO_ICU
682 /**
683 * Normalizes the Unicode characters in the string.
684 *
685 * @tparam InputStringType The input string type.
686 * @tparam OutputStringType The output String type.
687 * @param in The input string.
688 * @param out The output String (which may be the same as \a in).
689 */
690 template<class InputStringType,class OutputStringType>
691 bool normalize( InputStringType const &in, unicode::normalization::type n,
692 OutputStringType *out );
693 #endif /* ZORBA_NO_ICU */
694
695 ////////// Whitespace /////////////////////////////////////////////////////////
696
697 /**
698 * Converts sequences of one or more whitespace characters to a single space.
699 * Additionally, all leading and trailing whitespace is removed.
700 *
701 * @tparam InputStringType The input string type.
702 * @tparam OutputStringType The output string type.
703 * @param in The input string.
704 * @param out The output string.
705 */
706 template<class InputStringType,class OutputStringType> inline
normalize_whitespace(utf8_string<InputStringType> const & in,utf8_string<OutputStringType> * out)707 void normalize_whitespace( utf8_string<InputStringType> const &in,
708 utf8_string<OutputStringType> *out ) {
709 ascii::normalize_whitespace( *in.get(), out->get() );
710 }
711
712 /**
713 * Removes all leading and trailing whitespace.
714 *
715 * @tparam InputStringType The input string type.
716 * @tparam OutputStringType The output string type.
717 * @param in The input string.
718 * @param out The output string (which must be different from \a in).
719 */
720 template<class InputStringType,class OutputStringType> inline
trim_whitespace(utf8_string<InputStringType> const & in,utf8_string<OutputStringType> * out)721 void trim_whitespace( utf8_string<InputStringType> const &in,
722 utf8_string<OutputStringType> *out ) {
723 ascii::trim_whitespace( *in.get(), out->get() );
724 }
725
726 /**
727 * Removes all leading and trailing whitespace.
728 *
729 * @tparam StringType The string type.
730 * @param s The string.
731 */
732 template<class StringType> inline
trim_whitespace(utf8_string<StringType> & s)733 void trim_whitespace( utf8_string<StringType> &s ) {
734 ascii::trim_whitespace( *s.get() );
735 }
736
737 ////////// Miscellaneous //////////////////////////////////////////////////////
738
739 /**
740 * Reverses the characters in a string.
741 *
742 * @tparam InputStringType The input string type.
743 * @tparam OutputStringType The output string type.
744 * @param in The input string.
745 * @param out The output string.
746 */
747 template<class InputStringType,class OutputStringType> inline
reverse(InputStringType const & in,OutputStringType * out)748 void reverse( InputStringType const &in, OutputStringType *out ) {
749 typename utf8_stringify<InputStringType const>::type const u_in( in );
750 typename utf8_stringify<OutputStringType>::type u_out( *out );
751 std::reverse_copy( u_in.begin(), u_in.end(), std::back_inserter( u_out ) );
752 }
753
/**
 * Strips all diacritical marks from all characters converting them to their
 * closest ASCII equivalents.
 *
 * Only the declaration lives here; the definition is not in this header.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string.
 * @return Returns \c true only if the strip succeeded.
 */
template<class InputStringType,class OutputStringType>
bool strip_diacritics( InputStringType const &in,
                       OutputStringType *out );
766
767 /**
768 *
769 */
770 template<class StringType1,class StringType2> inline
compare(const StringType1 & s1,const StringType2 & s2,const XQPCollator * collation)771 int compare(const StringType1 &s1, const StringType2 &s2,
772 const XQPCollator* collation)
773 {
774 #ifndef ZORBA_NO_ICU
775 if (collation == NULL || collation->doMemCmp())
776 return s1.compare(s2);
777
778 unicode::string us1;
779 unicode::string us2;
780
781 unicode::to_string(s1, &us1);
782 unicode::to_string(s2, &us2);
783
784 return static_cast<Collator*>( collation->getCollator() )->compare(us1, us2);
785 #else
786 return s1.compare(s2);
787 #endif /* ZORBA_NO_ICU */
788 }
789
790
791 /**
792 *
793 */
794 template<class StringType> inline
795 uint32_t hash(const StringType& s, const XQPCollator* collation = NULL)
796 {
797 #ifndef ZORBA_NO_ICU
798 if (!collation || collation->doMemCmp())
799 #endif
800 {
801 const char* str = s.data();
802 ulong len = (ulong)s.size();
803 uint32_t hash = 5381;
804 ulong i = 0;
805 int c;
806 while (i < len && (c = *str++))
807 {
808 hash = ((hash << 5) + hash) + c;
809 ++i;
810 }
811 return hash;
812 //return hashfun::h32((void*)(s.data()), s.size());
813 }
814
815 #ifndef ZORBA_NO_ICU
816 CollationKey collKey;
817 UErrorCode status = U_ZERO_ERROR;
818
819 unicode::string uni_s;
820 unicode::to_string(s, &uni_s);
821
822 static_cast<Collator*>(collation->getCollator())->
823 getCollationKey(uni_s, collKey, status);
824
825 if(U_FAILURE(status))
826 {
827 assert(false);
828 }
829
830 return collKey.hashCode();
831 #else
832 ZORBA_ASSERT(false);
833 #endif /* ZORBA_NO_ICU */
834 }
835
836 ///////////////////////////////////////////////////////////////////////////////
837
838 } // namespace utf8
839 } // namespace zorba
840
841 #include "utf8_util.tcc"
842
843 #endif /* ZORBA_UTF8_UTIL_H */
844 /*
845 * Local variables:
846 * mode: c++
847 * End:
848 */
849 /* vim:set et sw=2 ts=2: */
850