1 /*
2  * Copyright 2006-2008 The FLWOR Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 #ifndef ZORBA_UTF8_UTIL_H
18 #define ZORBA_UTF8_UTIL_H
19 
20 #include <algorithm>
21 #include <cwchar>
22 #include <string>
23 
24 #include "ascii_util.h"
25 #include "cxx_util.h"
26 #include "string_util.h"
27 #include "unicode_util.h"
28 #include "utf8_string.h"
29 #include "utf8_util_base.h"
30 
31 #include "zorbatypes/collation_manager.h"
32 #include "zorbautils/hashfun.h"
33 
34 #ifdef ZORBA_NO_ICU
35 # include "diagnostics/assert.h"
36 #else
37 # include <unicode/coll.h>
38 # include <unicode/sortkey.h>
39 #endif /* ZORBA_NO_ICU */
40 
41 namespace zorba {
42 namespace utf8 {
43 
44 using ascii::begins_with;
45 using ascii::ends_with;
46 using ascii::is_whitespace;
47 using ascii::normalize_whitespace;
48 using ascii::trim_whitespace;
49 
50 ////////// begins/ends_with ///////////////////////////////////////////////////
51 
52 /**
53  * Checks whether a string begins with a given prefix.
54  *
55  * @tparam StringType The string type.
56  * @param s The string to check.
57  * @param c The prefix character.
58  * @param n The number of bytes to compare.
59  * @return Returns \c true only if \a s begins with \a c.
60  */
61 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char c)62 bool begins_with( utf8_string<StringType> const &s, char c ) {
63   return ascii::begins_with( *s.get(), c );
64 }
65 
66 /**
67  * Checks whether a string begins with a given prefix.
68  *
69  * @tparam StringType The string type.
70  * @param s The string to check.
71  * @param ps The prefix string.
72  * @param n The number of bytes to compare.
73  * @return Returns \c true only if \a s begins with \a ps.
74  */
75 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char const * ps,typename StringType::size_type n)76 bool begins_with( utf8_string<StringType> const &s, char const *ps,
77                   typename StringType::size_type n ) {
78   return ascii::begins_with( *s.get(), ps, n );
79 }
80 
81 /**
82  * Checks whether a string begins with a given prefix.
83  *
84  * @tparam StringType The string type.
85  * @param s The string to check.
86  * @param ps The prefix string.
87  * @return Returns \c true only if \a s begins with \a ps.
88  */
89 template<class StringType> inline
begins_with(utf8_string<StringType> const & s,char const * ps)90 bool begins_with( utf8_string<StringType> const &s, char const *ps ) {
91   return ascii::begins_with( *s.get(), ps );
92 }
93 
94 /**
95  * Checks whether a string begins with a given prefix.
96  *
97  * @tparam StringType The string type.
98  * @tparam PrefixStringType The suffix string type.
99  * @param s The string to check.
100  * @param ps The suffix string.
101  * @return Returns \c true only if \a s ends with \a ps.
102  */
103 template<class StringType,class PrefixStringType> inline
begins_with(utf8_string<StringType> const & s,PrefixStringType const & ps)104 bool begins_with( utf8_string<StringType> const &s,
105                   PrefixStringType const &ps ) {
106   return ascii::begins_with( *s.get(), ps.data(), ps.size() );
107 }
108 
109 /**
110  * Checks whether a string begins with a given prefix.
111  *
112  * @tparam StringType The string type.
113  * @tparam PrefixStringType The suffix string type.
114  * @param s The string to check.
115  * @param ps The suffix string.
116  * @return Returns \c true only if \a s ends with \a ps.
117  */
118 template<class StringType,class PrefixStringType> inline
begins_with(StringType const & s,utf8_string<PrefixStringType> const & ps)119 bool begins_with( StringType const &s,
120                   utf8_string<PrefixStringType> const &ps ) {
121   return ascii::begins_with( s, *ps.get() );
122 }
123 
124 /**
125  * Checks whether a string begins with a given prefix.
126  *
127  * @tparam StringType The string type.
128  * @tparam PrefixStringType The suffix string type.
129  * @param s The string to check.
130  * @param ps The suffix string.
131  * @return Returns \c true only if \a s ends with \a ps.
132  */
133 template<class StringType,class PrefixStringType> inline
begins_with(utf8_string<StringType> const & s,utf8_string<PrefixStringType> const & ps)134 bool begins_with( utf8_string<StringType> const &s,
135                   utf8_string<PrefixStringType> const &ps ) {
136   return ascii::begins_with( *s.get(), *ps.get() );
137 }
138 
139 /**
140  * Checks whether a string ends with a given suffix.
141  *
142  * @tparam StringType The string type.
143  * @param s The string to check.
144  * @param c The suffix character.
145  * @param n The number of bytes to compare.
146  * @return Returns \c true only if \a s ends with \a c.
147  */
148 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char c)149 bool ends_with( utf8_string<StringType> const &s, char c ) {
150   return ascii::ends_with( *s.get(), c );
151 }
152 
153 /**
154  * Checks whether a string ends with a given suffix.
155  *
156  * @tparam StringType The string type.
157  * @param s The string to check.
158  * @param ps The suffix string.
159  * @param n The number of bytes to compare.
160  * @return Returns \c true only if \a s ends with \a ps.
161  */
162 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char const * ps,typename StringType::size_type n)163 bool ends_with( utf8_string<StringType> const &s, char const *ps,
164                 typename StringType::size_type n ) {
165   return ascii::ends_with( *s.get(), ps, n );
166 }
167 
168 /**
169  * Checks whether a string ends with a given suffix.
170  *
171  * @tparam StringType The string type.
172  * @param s The string to check.
173  * @param ps The suffix string.
174  * @return Returns \c true only if \a s ends with \a ps.
175  */
176 template<class StringType> inline
ends_with(utf8_string<StringType> const & s,char const * ps)177 bool ends_with( utf8_string<StringType> const &s, char const *ps ) {
178   return ascii::ends_with( *s.get(), ps );
179 }
180 
181 /**
182  * Checks whether a string ends with a given suffix.
183  *
184  * @tparam StringType The string type.
185  * @tparam SuffixStringType The suffix string type.
186  * @param s The string to check.
187  * @param ps The suffix string.
188  * @return Returns \c true only if \a s ends with \a ps.
189  */
190 template<class StringType,class SuffixStringType> inline
ends_with(utf8_string<StringType> const & s,SuffixStringType const & ps)191 bool ends_with( utf8_string<StringType> const &s,
192                 SuffixStringType const &ps ) {
193   return ascii::ends_with( *s.get(), ps.data(), ps.size() );
194 }
195 
196 /**
197  * Checks whether a string ends with a given suffix.
198  *
199  * @tparam StringType The string type.
200  * @tparam SuffixStringType The suffix string type.
201  * @param s The string to check.
202  * @param ps The suffix string.
203  * @return Returns \c true only if \a s ends with \a ps.
204  */
205 template<class StringType,class SuffixStringType> inline
ends_with(StringType const & s,utf8_string<SuffixStringType> const & ps)206 bool ends_with( StringType const &s,
207                 utf8_string<SuffixStringType> const &ps ) {
208   return ascii::ends_with( s, *ps.get() );
209 }
210 
211 /**
212  * Checks whether a string ends with a given suffix.
213  *
214  * @tparam StringType The string type.
215  * @tparam SuffixStringType The suffix string type.
216  * @param s The string to check.
217  * @param ps The suffix string.
218  * @return Returns \c true only if \a s ends with \a ps.
219  */
220 template<class StringType,class SuffixStringType> inline
ends_with(utf8_string<StringType> const & s,utf8_string<SuffixStringType> const & ps)221 bool ends_with( utf8_string<StringType> const &s,
222                 utf8_string<SuffixStringType> const &ps ) {
223   return ascii::ends_with( *s.get(), *ps.get() );
224 }
225 
226 ////////// Case conversion ////////////////////////////////////////////////////
227 
/**
 * Produces a lower-case copy of a string.
 *
 * Only declared here; the definition is expected to come from the
 * accompanying template implementation file.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string (which must be different from \a in).  Its
 * contents are overwritten.
 */
template<typename InputStringType,typename OutputStringType>
void to_lower( InputStringType const &in,
               OutputStringType *out );
239 
/**
 * Converts a string to lower-case in place.
 *
 * The two-argument to_lower() requires distinct input and output strings,
 * so the result is built in a scratch string and then assigned back.
 *
 * @tparam StringType The string type.
 * @param s The string to convert; it receives the lower-cased result.
 */
template<typename StringType> inline
void to_lower( StringType &s ) {
  StringType lowered;
  to_lower( s, &lowered );
  s = lowered;
}
252 
/**
 * Produces an upper-case copy of a string.
 *
 * Only declared here; the definition is expected to come from the
 * accompanying template implementation file.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string (which must be different from \a in).  Its
 * contents are overwritten.
 */
template<typename InputStringType,typename OutputStringType>
void to_upper( InputStringType const &in,
               OutputStringType *out );
264 
/**
 * Converts a string to upper-case in place.
 *
 * The two-argument to_upper() requires distinct input and output strings,
 * so the result is built in a scratch string and then assigned back.
 *
 * @tparam StringType The string type.
 * @param s The string to convert; it receives the upper-cased result.
 */
template<typename StringType> inline
void to_upper( StringType &s ) {
  StringType uppered;
  to_upper( s, &uppered );
  s = uppered;
}
277 
278 ////////// Code-point functions ///////////////////////////////////////////////
279 
280 /**
281  * Appends a sequence of Unicode code-points to a string.
282  *
283  * @tparam InputIterator The iterator type.
284  * @param i The begining iterator.
285  * @param j The ending iterator.
286  * @param s A pointer to the string to be appended to.
287  */
288 template<class InputIterator,class StringType> inline
append_codepoints(InputIterator i,InputIterator j,StringType * s)289 void append_codepoints( InputIterator i, InputIterator j, StringType *s ) {
290   typename utf8_stringify<StringType>::type u( *s );
291   std::copy( i, j, std::back_inserter( u ) );
292 }
293 
294 /**
295  * Converts a string to a sequence of Uncode code-points.
296  *
297  * @tparam StringType The type of string.
298  * @tparam ContainerType The type of STL container to put the codepoint values.
299  * @param s The string to get the codepoints for.
300  * @param c A pointer to the container to put the codepoint values.  The
301  * containers contents are overwritten.
302  */
303 template<class StringType,class ContainerType> inline
to_codepoints(StringType const & s,ContainerType * c)304 void to_codepoints( StringType const &s, ContainerType *c ) {
305   typename utf8_stringify<StringType const>::type const u( s );
306   std::copy( u.begin(), u.end(), std::back_inserter( *c ) );
307 }
308 
309 ////////// Encoding conversion ////////////////////////////////////////////////
310 
311 #ifndef ZORBA_NO_ICU
312 
313 /**
314  * Converts a unicode::char_type array into a UTF-8 encoded string.
315  *
316  * @param in The Unicode characters to convert.
317  * @param in_len The number of unicode characters (not bytes) to convert.
318  * @param out A pointer to a pointer to the starting location for the result.
319  * It is the caller's responsibility to deallocate this.
320  * @param out_len If not \c nullptr, the number of bytes (not characters) of
321  * the UTF-8 string are put here.
322  * @return Returns \c true only if the conversion succeeded.
323  */
324 bool to_string( unicode::char_type const *in, unicode::size_type in_len,
325                 storage_type **out, size_type *out_len = nullptr );
326 
327 /**
328  * Converts a unicode::char_type array into a UTF-8 encoded string.
329  *
330  * @param in The Unicode characters to convert.
331  * @param out A pointer to a pointer to the starting location for the result.
332  * It is the caller's responsibility to deallocate this.
333  * @param out_len If not \c nullptr, the number of bytes (not characters) of
334  * the UTF-8 string are put here.
335  * @return Returns \c true only if the conversion succeeded.
336  */
337 inline bool to_string( unicode::char_type const *in, storage_type **out,
338                        size_type *out_len = nullptr ) {
339   return to_string( in, u_strlen( in ), out, out_len );
340 }
341 
342 /**
343  * Converts a unicode::string into a UTF-8 encoded string.
344  *
345  * @param in The unicode::string to convert.
346  * @param out A pointer to a pointer to the starting location for the result.
347  * It is the caller's responsibility to deallocate this.
348  * @param out_len If not \c nullptr, the number of bytes (not characters) of
349  * the UTF-8 string are put here.
350  * @return Returns \c true only if the conversion succeeded.
351  */
352 inline bool to_string( unicode::string const &in, storage_type **out,
353                        size_type *out_len = nullptr ) {
354   return to_string( in.getBuffer(), in.length(), out, out_len );
355 }
356 
357 /**
358  * Converts a unicode::char_type array into a UTF-8 encoded string.
359  *
360  * @param in The Unicode characters to convert.
361  * @param in_len The number of unicode characters to convert.
362  * @param out A pointer to the result string.
363  * @return Returns \c true only if the conversion succeeded.
364  */
365 template<class StringType>
366 bool to_string( unicode::char_type const *in, size_type in_len,
367                 StringType *out );
368 
369 /**
370  * Converts a unicode::char_type array into a UTF-8 encoded string.
371  *
372  * @param in The Unicode characters to convert.
373  * @param out A pointer to the result string.
374  * @return Returns \c true only if the conversion succeeded.
375  */
376 template<class StringType> inline
to_string(unicode::char_type const * in,StringType * out)377 bool to_string( unicode::char_type const *in, StringType *out ) {
378   return to_string( in, u_strlen( in ), out );
379 }
380 
381 #endif /* ZORBA_NO_ICU */
382 
383 /**
384  * Converts a unicode::string into a UTF-8 encoded string.
385  *
386  * @param in The unicode::string to convert.
387  * @param out A pointer to the result string.
388  * @return Returns \c true only if the conversion succeeded.
389  */
390 template<class StringType> inline
to_string(unicode::string const & in,StringType * out)391 bool to_string( unicode::string const &in, StringType *out ) {
392 #ifndef ZORBA_NO_ICU
393   return to_string( in.getBuffer(), in.length(), out );
394 #else
395   *out = in.c_str();
396   return true;
397 #endif /* ZORBA_NO_ICU */
398 }
399 
400 #ifndef ZORBA_NO_ICU
401 
402 //
// On Windows, UChar == wchar_t, so these overloads would be duplicate
// definitions of the unicode::char_type ones declared above.
405 //
406 #ifndef WIN32
407 
408 /**
409  * Converts a wide-character string into a UTF-8 encoded string.
410  *
411  * @param in The wide-character string to convert.
412  * @param in_len The length of the unicode::string.
413  * @param out A pointer to a pointer to the starting location for the result.
414  * It is the caller's responsibility to deallocate this.
415  * @param out_len If not \c nullptr, the number of bytes (not characters) of
416  * the UTF-8 string are put here.
417  * @return Returns \c true only if the conversion succeeded.
418  */
419 bool to_string( wchar_t const *in, size_type in_len, storage_type **out,
420                 size_type *out_len = nullptr );
421 
422 /**
423  * Converts a wide-character string into a UTF-8 encoded string.
424  *
425  * @param in The wide-character string to convert.
426  * @param out A pointer to a pointer to the starting location for the result.
427  * It is the caller's responsibility to deallocate this.
428  * @param out_len If not \c nullptr, the number of bytes (not characters) of
429  * the UTF-8 string are put here.
430  * @return Returns \c true only if the conversion succeeded.
431  */
432 inline bool to_string( wchar_t const *in, storage_type **out,
433                        size_type *out_len = nullptr ) {
434   return to_string( in, std::wcslen( in ), out, out_len );
435 }
436 
437 /**
438  * Converts a wide-character string into a UTF-8 encoded string.
439  *
440  * @tparam StringType The type of the result string.
441  * @param in The wide-character string to convert.
442  * @param in_len The length of the unicode::string.
443  * @param out A pointer to the result string.
444  * @return Returns \c true only if the conversion succeeded.
445  */
446 template<class StringType>
447 bool to_string( wchar_t const *in, size_type in_len, StringType *out );
448 
/**
 * Converts a wide-character string into a UTF-8 encoded string object.
 *
 * The input length is obtained with std::wcslen(), so \a in must be
 * null-terminated.
 *
 * @tparam StringType The type of the result string.
 * @param in The wide-character string to convert.
 * @param out A pointer to the result string.
 * @return Returns \c true only if the conversion succeeded.
 */
template<typename StringType> inline
bool to_string( wchar_t const *in, StringType *out ) {
  bool const converted = to_string( in, std::wcslen( in ), out );
  return converted;
}
461 
462 #endif /* WIN32 */
463 
/**
 * Converts a std::wstring into a UTF-8 encoded string object.
 *
 * The wstring's data() pointer and size() are passed to the
 * pointer-and-length overload.
 *
 * @tparam StringType The type of the result string.
 * @param in The wide-character string to convert.
 * @param out A pointer to the result string.
 * @return Returns \c true only if the conversion succeeded.
 */
template<typename StringType> inline
bool to_string( std::wstring const &in, StringType *out ) {
  bool const converted = to_string( in.data(), in.size(), out );
  return converted;
}
476 
477 /**
478  * Converts a UTF-8 encoded string to a wchar_t array.
479  *
480  * @param in The UTF-8 encoded string to convert.
481  * @param in_len The number of bytes (not characters) of \a in.
482  * @param out A pointer to a pointer to the starting location for the result.
483  * It is the caller's responsibility to deallocate this.
484  * @param out_len If not \c nullptr, the number of characters (not bytes) of
485  * the wchar_t string are put here.
486  * @return Returns \c true only if the conversion succeeded.
487  */
488 bool to_wchar_t( storage_type const *in, size_type in_len, wchar_t **out,
489                  unicode::size_type *out_len );
490 
491 /**
492  * Converts a UTF-8 encoded string to a wchar_t array.
493  *
494  * @param in The UTF-8 encoded string to convert.
495  * @param out A pointer to a pointer to the starting location for the result.
496  * It is the caller's responsibility to deallocate this.
497  * @param out_len If not \c nullptr, the number of characters (not bytes) of
498  * the \c wchar_t string are put here.
499  * @return Returns \c true only if the conversion succeeded.
500  */
to_wchar_t(storage_type const * in,wchar_t ** out,unicode::size_type * out_len)501 inline bool to_wchar_t( storage_type const *in, wchar_t **out,
502                         unicode::size_type *out_len ) {
503   return to_wchar_t( in, std::strlen( in ), out, out_len );
504 }
505 
506 /**
507  * Converts a UTF-8 encoded string to a wchar_t array.
508  *
509  * @tparam StringType The string type.
510  * @param in The UTF-8 encoded string to convert.
511  * @param out A pointer to a pointer to the starting location for the result.
512  * It is the caller's responsibility to deallocate this.
513  * @param out_len If not \c nullptr, the number of characters (not bytes) of
514  * the \c wchar_t string are put here.
515  * @return Returns \c true only if the conversion succeeded.
516  */
517 template<class StringType> inline
to_wchar_t(StringType const & in,wchar_t ** out,unicode::size_type * out_len)518 bool to_wchar_t( StringType const &in, wchar_t **out,
519                  unicode::size_type *out_len ) {
520   return to_wchar_t( in.data(), in.size(), out, out_len );
521 }
522 
523 #endif /* ZORBA_NO_ICU */
524 
525 ////////// HTML URI ///////////////////////////////////////////////////////////
526 
527 /**
528  * A %back_html_uri_insert_iterator can be used to append characters to a string
529  * escaping all non-printing ASCII characters.
530  *
531  * @tparam StringType The string type.
532  */
533 template<class StringType>
534 class back_html_uri_insert_iterator :
535   public
536     ztd::back_insert_iterator_base<
537       StringType, back_html_uri_insert_iterator<StringType>
538     >
539 {
540   typedef ztd::back_insert_iterator_base<
541     StringType, back_html_uri_insert_iterator<StringType>
542   > base_type;
543 public:
544   typedef typename base_type::container_type container_type;
545   typedef typename StringType::value_type value_type;
546 
547   /**
548    * Constructs a %back_html_uri_insert_iterator.
549    *
550    * @param s The string to append to.
551    */
back_html_uri_insert_iterator(StringType & s)552   explicit back_html_uri_insert_iterator( StringType &s ) : base_type( s ) {
553     buf_[0] = '%';
554   }
555 
556   back_html_uri_insert_iterator& operator=( value_type c );
557 
558 private:
559   char buf_[3]; // %xx -- no need for null at end
560 };
561 
562 /**
563  * This is a convenience function to create a back_html_uri_insert_iterator.
564  *
565  * @tparam StringType The string type.
566  * @param out The output string.
567  */
568 template<class StringType> inline back_html_uri_insert_iterator<StringType>
back_html_uri_inserter(StringType & out)569 back_html_uri_inserter( StringType &out ) {
570   return back_html_uri_insert_iterator<StringType>( out );
571 }
572 
573 /**
574  * Escapes all non-printable ASCII characters
575  *
576  * @tparam InputStringType The input string type.
577  * @tparam OutputStringType The output String type.
578  * @param in The input string.
579  * @param out The output string (which must be different from \a in).
580  */
581 template<class InputStringType,class OutputStringType> inline
to_html_uri(InputStringType const & in,OutputStringType * out)582 void to_html_uri( InputStringType const &in, OutputStringType *out ) {
583   typename utf8_stringify<InputStringType const>::type const u_in( in );
584   typename utf8_stringify<OutputStringType>::type u_out( *out );
585   std::copy( u_in.begin(), u_in.end(), back_html_uri_inserter( u_out ) );
586 }
587 
/**
 * Escapes all non-printable ASCII characters of a string in place.
 *
 * The two-argument to_html_uri() requires distinct input and output
 * strings, so the result is built in a scratch string and assigned back.
 *
 * @tparam StringType The string type.
 * @param s The string to escape; it receives the escaped result.
 */
template<typename StringType> inline
void to_html_uri( StringType &s ) {
  StringType escaped;
  to_html_uri( s, &escaped );
  s = escaped;
}
600 
601 ////////// IRI ////////////////////////////////////////////////////////////////
602 
603 /**
604  * A %back_iri_insert_iterator can be used to append characters to a string
605  * ensuring that illegal characters in an IRI.  See RFC 3987.
606  *
607  * @tparam StringType The string type.
608  */
609 template<class StringType>
610 class back_iri_insert_iterator :
611   public
612     ztd::back_insert_iterator_base<
613       StringType, back_iri_insert_iterator<StringType>
614     >
615 {
616   typedef ztd::back_insert_iterator_base<
617     StringType, back_iri_insert_iterator<StringType>
618   > base_type;
619 public:
620   typedef typename base_type::container_type container_type;
621   typedef typename StringType::value_type value_type;
622 
623   /**
624    * Constructs a %back_iri_insert_iterator.
625    *
626    * @param s The string to append to.
627    */
back_iri_insert_iterator(StringType & s)628   explicit back_iri_insert_iterator( StringType &s ) : base_type( s ) {
629     buf_[0] = '%';
630   }
631 
632   back_iri_insert_iterator& operator=( value_type c );
633 
634 private:
635   char buf_[3]; // %xx -- no need for null at end
636 };
637 
638 /**
639  * This is a convenience function to create a back_iri_insert_iterator.
640  *
641  * @tparam StringType The string type.
642  * @param out The output string.
643  */
644 template<class StringType> inline back_iri_insert_iterator<StringType>
back_iri_inserter(StringType & out)645 back_iri_inserter( StringType &out ) {
646   return back_iri_insert_iterator<StringType>( out );
647 }
648 
649 /**
650  * Converts an IRI to a URI.
651  *
652  * @tparam InputStringType The input string type.
653  * @tparam OutputStringType The output String type.
654  * @param in The input string.
655  * @param out The output string (which must be different from \a in).
656  * See RFC 3987.
657  */
658 template<class InputStringType,class OutputStringType> inline
iri_to_uri(InputStringType const & in,OutputStringType * out)659 void iri_to_uri( InputStringType const &in, OutputStringType *out ) {
660   typename utf8_stringify<InputStringType const>::type const u_in( in );
661   typename utf8_stringify<OutputStringType>::type u_out( *out );
662   std::copy( u_in.begin(), u_in.end(), back_iri_inserter( u_out ) );
663 }
664 
/**
 * Converts an IRI to a URI in place.  See RFC 3987.
 *
 * The two-argument iri_to_uri() requires distinct input and output strings,
 * so the result is built in a scratch string and assigned back.
 *
 * @tparam StringType The string type.
 * @param s The string to convert; it receives the resulting URI.
 */
template<typename StringType> inline
void iri_to_uri( StringType &s ) {
  StringType uri;
  iri_to_uri( s, &uri );
  s = uri;
}
678 
679 ////////// Unicode normalization //////////////////////////////////////////////
680 
681 #ifndef ZORBA_NO_ICU
682 /**
683  * Normalizes the Unicode characters in the string.
684  *
685  * @tparam InputStringType The input string type.
686  * @tparam OutputStringType The output String type.
687  * @param in The input string.
688  * @param out The output String (which may be the same as \a in).
689  */
690 template<class InputStringType,class OutputStringType>
691 bool normalize( InputStringType const &in, unicode::normalization::type n,
692                 OutputStringType *out );
693 #endif /* ZORBA_NO_ICU */
694 
695 ////////// Whitespace /////////////////////////////////////////////////////////
696 
697 /**
698  * Converts sequences of one or more whitespace characters to a single space.
699  * Additionally, all leading and trailing whitespace is removed.
700  *
701  * @tparam InputStringType The input string type.
702  * @tparam OutputStringType The output string type.
703  * @param in The input string.
704  * @param out The output string.
705  */
706 template<class InputStringType,class OutputStringType> inline
normalize_whitespace(utf8_string<InputStringType> const & in,utf8_string<OutputStringType> * out)707 void normalize_whitespace( utf8_string<InputStringType> const &in,
708                            utf8_string<OutputStringType> *out ) {
709   ascii::normalize_whitespace( *in.get(), out->get() );
710 }
711 
712 /**
713  * Removes all leading and trailing whitespace.
714  *
715  * @tparam InputStringType The input string type.
716  * @tparam OutputStringType The output string type.
717  * @param in The input string.
718  * @param out The output string (which must be different from \a in).
719  */
720 template<class InputStringType,class OutputStringType> inline
trim_whitespace(utf8_string<InputStringType> const & in,utf8_string<OutputStringType> * out)721 void trim_whitespace( utf8_string<InputStringType> const &in,
722                       utf8_string<OutputStringType> *out ) {
723   ascii::trim_whitespace( *in.get(), out->get() );
724 }
725 
726 /**
727  * Removes all leading and trailing whitespace.
728  *
729  * @tparam StringType The string type.
730  * @param s The string.
731  */
732 template<class StringType> inline
trim_whitespace(utf8_string<StringType> & s)733 void trim_whitespace( utf8_string<StringType> &s ) {
734   ascii::trim_whitespace( *s.get() );
735 }
736 
737 ////////// Miscellaneous //////////////////////////////////////////////////////
738 
739 /**
740  * Reverses the characters in a string.
741  *
742  * @tparam InputStringType The input string type.
743  * @tparam OutputStringType The output string type.
744  * @param in The input string.
745  * @param out The output string.
746  */
747 template<class InputStringType,class OutputStringType> inline
reverse(InputStringType const & in,OutputStringType * out)748 void reverse( InputStringType const &in, OutputStringType *out ) {
749   typename utf8_stringify<InputStringType const>::type const u_in( in );
750   typename utf8_stringify<OutputStringType>::type u_out( *out );
751   std::reverse_copy( u_in.begin(), u_in.end(), std::back_inserter( u_out ) );
752 }
753 
/**
 * Strips all diacritical marks from all characters converting them to their
 * closest ASCII equivalents.
 *
 * Only declared here; the definition is expected to come from the
 * accompanying template implementation file.
 *
 * @tparam InputStringType The input string type.
 * @tparam OutputStringType The output string type.
 * @param in The input string.
 * @param out The output string.
 * @return Returns \c true only if the strip succeeded.
 */
template<typename InputStringType,typename OutputStringType>
bool strip_diacritics( InputStringType const &in,
                       OutputStringType *out );
766 
767 /**
768  *
769  */
770 template<class StringType1,class StringType2> inline
compare(const StringType1 & s1,const StringType2 & s2,const XQPCollator * collation)771 int compare(const StringType1 &s1, const StringType2 &s2,
772             const XQPCollator* collation)
773 {
774 #ifndef ZORBA_NO_ICU
775   if (collation == NULL || collation->doMemCmp())
776     return s1.compare(s2);
777 
778   unicode::string us1;
779   unicode::string us2;
780 
781   unicode::to_string(s1, &us1);
782   unicode::to_string(s2, &us2);
783 
784   return static_cast<Collator*>( collation->getCollator() )->compare(us1, us2);
785 #else
786   return s1.compare(s2);
787 #endif /* ZORBA_NO_ICU */
788 }
789 
790 
791 /**
792  *
793  */
794 template<class StringType> inline
795 uint32_t hash(const StringType& s, const XQPCollator* collation = NULL)
796 {
797 #ifndef ZORBA_NO_ICU
798   if (!collation || collation->doMemCmp())
799 #endif
800   {
801     const char* str = s.data();
802     ulong len = (ulong)s.size();
803     uint32_t hash = 5381;
804     ulong i = 0;
805     int c;
806     while (i < len && (c = *str++))
807     {
808       hash = ((hash << 5) + hash) + c;
809       ++i;
810     }
811     return hash;
812     //return hashfun::h32((void*)(s.data()), s.size());
813   }
814 
815 #ifndef ZORBA_NO_ICU
816   CollationKey collKey;
817   UErrorCode status = U_ZERO_ERROR;
818 
819   unicode::string uni_s;
820   unicode::to_string(s, &uni_s);
821 
822   static_cast<Collator*>(collation->getCollator())->
823   getCollationKey(uni_s, collKey, status);
824 
825   if(U_FAILURE(status))
826   {
827     assert(false);
828   }
829 
830   return collKey.hashCode();
831 #else
832   ZORBA_ASSERT(false);
833 #endif /* ZORBA_NO_ICU */
834 }
835 
836 ///////////////////////////////////////////////////////////////////////////////
837 
838 } // namespace utf8
839 } // namespace zorba
840 
841 #include "utf8_util.tcc"
842 
843 #endif /* ZORBA_UTF8_UTIL_H */
844 /*
845  * Local variables:
846  * mode: c++
847  * End:
848  */
849 /* vim:set et sw=2 ts=2: */
850