1 /*
2  * Copyright (c) 2015-2016, Intel Corporation
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  * Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  *  * Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *  * Neither the name of Intel Corporation nor the names of its contributors
13  *    may be used to endorse or promote products derived from this software
14  *    without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "Utf8ComponentClass.h"
30 
31 #include <algorithm>
32 
33 using namespace std;
34 
35 namespace ue2 {
36 
/* Expands to the definition of getUcp<cat>(): a function that walks the
 * ucp_<cat>_def table two entries at a time, hands each pair to
 * CodePointSet::setRange(), and returns the accumulated set. */
#define UCP_FN(cat)                                                     \
CodePointSet getUcp##cat(void) {                                        \
    CodePointSet cps;                                                   \
    u32 idx = 0;                                                        \
    while (idx < ARRAY_LENGTH(ucp_##cat##_def)) {                       \
        cps.setRange(ucp_##cat##_def[idx], ucp_##cat##_def[idx + 1]);   \
        idx += 2;                                                       \
    }                                                                   \
    return cps;                                                         \
}
45 
/** \brief A (code point, case variant) pair.
 *
 * This is the element type searched for in ucp_caseless_def by make_caseless()
 * and flip_case(). Field order matters: both callers build probes with brace
 * initialisation as {base, caseless}. */
struct unicase {
    /** Code point the entry is looked up by. */
    unichar base;
    /** Code point that make_caseless() adds to the set, and flip_case()
     * substitutes, when \a base matches. */
    unichar caseless;
};
50 
51 } // namespace ue2
52 
53 #define UCP_TABLE_DEFINE_FN
54 #include "ucp_table.h"
55 
56 namespace ue2 {
57 
58 static
operator <(const unicase & a,const unicase & b)59 bool operator<(const unicase &a, const unicase &b) {
60     if (a.base < b.base) {
61         return true;
62     }
63 
64     if (a.base > b.base) {
65         return false;
66     }
67 
68     return a.caseless < b.caseless;
69 }
70 
make_caseless(CodePointSet * cps)71 void make_caseless(CodePointSet *cps) {
72     assert(cps);
73     DEBUG_PRINTF("hello\n");
74     // Cheap optimisation: if we are empty or a dot, we're already caseless.
75     if (cps->begin() == cps->end()) {
76         DEBUG_PRINTF("empty\n");
77         return;
78     }
79     if (lower(*cps->begin()) == 0 && upper(*cps->begin()) == MAX_UNICODE) {
80         DEBUG_PRINTF("dot\n");
81         return;
82     }
83 
84     CodePointSet base = *cps;
85 
86     auto uc_begin = begin(ucp_caseless_def);
87     auto uc_end = end(ucp_caseless_def);
88     DEBUG_PRINTF("uc len %zd\n", distance(uc_begin, uc_end));
89 
90     for (const auto &elem : base) {
91         unichar b = lower(elem);
92         unichar e = upper(elem) + 1;
93 
94         for (; b < e; b++) {
95             DEBUG_PRINTF("decasing %x\n", b);
96             unicase test = {b, 0}; /* NUL is not a caseless version of anything,
97                                     * so we are ok */
98             uc_begin = lower_bound(uc_begin, uc_end, test);
99             if (uc_begin == uc_end) {
100                 DEBUG_PRINTF("EOL\n");
101                 return;
102             }
103             while (uc_begin != uc_end && uc_begin->base == b) {
104                 DEBUG_PRINTF("at {%x,%x}\n", uc_begin->base, uc_begin->caseless);
105                 cps->set(uc_begin->caseless);
106                 ++uc_begin;
107             }
108         }
109     }
110 }
111 
112 /** \brief Flip the case of the codepoint in c, if possible.
113  *
114  * Note that this assumes a one-to-one case mapping, which (though not
115  * realistic) is what PCRE does. */
flip_case(unichar * c)116 bool flip_case(unichar *c) {
117     assert(c);
118 
119     const unicase test = { *c, 0 };
120 
121     const auto uc_begin = begin(ucp_caseless_def);
122     const auto uc_end = end(ucp_caseless_def);
123     const auto f = lower_bound(uc_begin, uc_end, test);
124     if (f != uc_end && f->base == *c) {
125         DEBUG_PRINTF("flipped c=%x to %x\n", *c, f->caseless);
126         *c = f->caseless;
127         return true;
128     }
129     return false;
130 }
131 
132 } // namespace ue2
133