1 /*
2  * encode.c
3  */
4 #include <stdio.h>
5 #include "oniguruma.h"
6 
7 static int
search(regex_t * reg,unsigned char * str,unsigned char * end)8 search(regex_t* reg, unsigned char* str, unsigned char* end)
9 {
10   int r;
11   unsigned char *start, *range;
12   OnigRegion *region;
13 
14   region = onig_region_new();
15 
16   start = str;
17   range = end;
18   r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
19   if (r >= 0) {
20     int i;
21 
22     fprintf(stderr, "match at %d  (%s)\n", r,
23             ONIGENC_NAME(onig_get_encoding(reg)));
24     for (i = 0; i < region->num_regs; i++) {
25       fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
26     }
27   }
28   else if (r == ONIG_MISMATCH) {
29     fprintf(stderr, "search fail (%s)\n",
30             ONIGENC_NAME(onig_get_encoding(reg)));
31   }
32   else { /* error */
33     char s[ONIG_MAX_ERROR_MESSAGE_LEN];
34     onig_error_code_to_str((UChar* )s, r);
35     fprintf(stderr, "ERROR: %s\n", s);
36     fprintf(stderr, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
37     onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
38     return -1;
39   }
40 
41   onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
42   return 0;
43 }
44 
45 static int
exec(OnigEncoding enc,OnigOptionType options,char * apattern,char * astr)46 exec(OnigEncoding enc, OnigOptionType options,
47      char* apattern, char* astr)
48 {
49   int r;
50   unsigned char *end;
51   regex_t* reg;
52   OnigErrorInfo einfo;
53   UChar* pattern = (UChar* )apattern;
54   UChar* str     = (UChar* )astr;
55 
56   onig_initialize(&enc, 1);
57 
58   r = onig_new(&reg, pattern,
59                pattern + onigenc_str_bytelen_null(enc, pattern),
60                options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
61   if (r != ONIG_NORMAL) {
62     char s[ONIG_MAX_ERROR_MESSAGE_LEN];
63     onig_error_code_to_str((UChar* )s, r, &einfo);
64     fprintf(stderr, "ERROR: %s\n", s);
65     return -1;
66   }
67 
68   end   = str + onigenc_str_bytelen_null(enc, str);
69   r = search(reg, str, end);
70 
71   onig_free(reg);
72   onig_end();
73   return 0;
74 }
75 
main(int argc,char * argv[])76 extern int main(int argc, char* argv[])
77 {
78   int r;
79   /* ISO 8859-1 test */
80   static unsigned char str[] = { 0xc7, 0xd6, 0xfe, 0xea, 0xe0, 0xe2, 0x00 };
81   static unsigned char pattern[] = { 0xe7, 0xf6, 0xde, '\\', 'w', '+', 0x00 };
82 
83   r = exec(ONIG_ENCODING_SJIS, ONIG_OPTION_NONE,
84            "^a\\p{Hiragana}c$", "a\202\274c");
85 
86   r = exec(ONIG_ENCODING_EUC_JP, ONIG_OPTION_NONE,
87            "^a\\p{Hiragana}c$", "a\244\276c");
88 
89   r = exec(ONIG_ENCODING_CP1251, ONIG_OPTION_IGNORECASE,
90            "aBc", " AbC");
91 
92   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
93            " [a-c\337z] ", "  SS  ");
94   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
95            " [\330-\341] ", "  SS  ");
96 
97   r = exec(ONIG_ENCODING_ISO_8859_2, ONIG_OPTION_IGNORECASE,
98            "\337          ", "          Ss          ");
99   r = exec(ONIG_ENCODING_ISO_8859_2, ONIG_OPTION_IGNORECASE,
100            "SS          ", "          \337          ");
101   r = exec(ONIG_ENCODING_ISO_8859_2, ONIG_OPTION_IGNORECASE,
102            "\\A\\S\\z", "ss");
103 
104   r = exec(ONIG_ENCODING_ISO_8859_2, ONIG_OPTION_IGNORECASE,
105            "[ac]+", "bbbaAaCCC");
106 
107   r = exec(ONIG_ENCODING_ISO_8859_3, ONIG_OPTION_IGNORECASE,
108            "[ac]+", "bbbaAaCCC");
109   r = exec(ONIG_ENCODING_ISO_8859_4, ONIG_OPTION_IGNORECASE,
110            "[ac]+", "bbbaAaCCC");
111   r = exec(ONIG_ENCODING_ISO_8859_5, ONIG_OPTION_IGNORECASE,
112            "[ac]+", "bbbaAaCCC");
113   r = exec(ONIG_ENCODING_ISO_8859_6, ONIG_OPTION_IGNORECASE,
114            "[ac]+", "bbbaAaCCC");
115   r = exec(ONIG_ENCODING_ISO_8859_7, ONIG_OPTION_IGNORECASE,
116            "[ac]+", "bbbaAaCCC");
117   r = exec(ONIG_ENCODING_ISO_8859_8, ONIG_OPTION_IGNORECASE,
118            "[ac]+", "bbbaAaCCC");
119   r = exec(ONIG_ENCODING_ISO_8859_9, ONIG_OPTION_IGNORECASE,
120            "[ac]+", "bbbaAaCCC");
121   r = exec(ONIG_ENCODING_ISO_8859_10, ONIG_OPTION_IGNORECASE,
122            "[ac]+", "bbbaAaCCC");
123   r = exec(ONIG_ENCODING_ISO_8859_11, ONIG_OPTION_IGNORECASE,
124            "[ac]+", "bbbaAaCCC");
125   r = exec(ONIG_ENCODING_ISO_8859_13, ONIG_OPTION_IGNORECASE,
126            "[ac]+", "bbbaAaCCC");
127   r = exec(ONIG_ENCODING_ISO_8859_14, ONIG_OPTION_IGNORECASE,
128            "[ac]+", "bbbaAaCCC");
129   r = exec(ONIG_ENCODING_ISO_8859_15, ONIG_OPTION_IGNORECASE,
130            (char* )pattern, (char* )str);
131   r = exec(ONIG_ENCODING_ISO_8859_16, ONIG_OPTION_IGNORECASE,
132            (char* )pattern, (char* )str);
133 
134   r = exec(ONIG_ENCODING_KOI8_R, ONIG_OPTION_NONE, "a+", "bbbaaaccc");
135   r = exec(ONIG_ENCODING_EUC_TW, ONIG_OPTION_NONE, "b*a+?c+", "bbbaaaccc");
136   r = exec(ONIG_ENCODING_EUC_KR, ONIG_OPTION_NONE, "a+", "bbbaaaccc");
137   r = exec(ONIG_ENCODING_EUC_CN, ONIG_OPTION_NONE, "c+", "bbbaaaccc");
138   r = exec(ONIG_ENCODING_BIG5,   ONIG_OPTION_NONE, "a+", "bbbaaaccc");
139 
140   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
141            "\337", "SS");
142   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
143            "SS", "\337");
144   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
145            "SSb\337ssc", "a\337bSS\337cd");
146   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
147            "[a\337]{0,2}", "aSS");
148   r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,
149            "is", "iss");
150 
151   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE,
152            "\000[\000[\000:\000a\000l\000n\000u\000m\000:\000]\000]\000+\000\000",
153            "\000#\002\120\000a\000Z\012\077\012\076\012\075\000\000");
154   /* 0x0a3d == \012\075 : is not alnum */
155   /* 0x0a3e == \012\076 : is alnum */
156 
157   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE,
158            "\000\\\000d\000+\000\000",
159            "\0003\0001\377\020\377\031\377\032\000\000");
160 
161   r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_IGNORECASE,
162            "(Aa\\d)+", "BaA5Aa0234");
163 
164   r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_NONE,
165            "[[^\\w]]+[^\xee\xef]\xee\xef", "[[^\\w]]+[^\xee\xef]\xee\xef");
166 
167   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
168            "\000[\000\337\000]\000\000", "\000S\000S\000\000");
169 
170   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
171            "\000[\000\337\000]\000\000", "\000s\000S\000\000");
172 
173   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
174            "\000^\000[\000\001\000-\377\375\000]\000$\000\000",
175            "\000s\000S\000\000");
176 
177   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
178            "\000S\000S\000\000",
179            "\000S\000T\000\337\000\000");
180 
181   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
182            "\000S\000T\000S\000S\000\000",
183            "\000S\000t\000s\000S\000\000");
184 
185   {
186     UChar pat[]  = { 0x1f, 0xfc, 0x00, 0x00 };
187     UChar str1[] = { 0x21, 0x26, 0x1f, 0xbe, 0x00, 0x00 };
188     UChar str2[] = { 0x1f, 0xf3, 0x00, 0x00 };
189 
190     r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
191              (char* )pat, (char* )str1);
192 
193     r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
194              (char* )pat, (char* )str2);
195   }
196 
197 #if 0
198   /* You should define USE_UNICODE_CASE_FOLD_TURKISH_AZERI in regenc.h. */
199 
200   set_case_fold(ONIGENC_CASE_FOLD_TURKISH_AZERI);
201 
202   r = exec(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE,
203            "Ii", "\304\261\304\260");
204 
205   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
206            "\000I\000i\000\000", "\001\061\001\060\000\000");
207 
208   r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE,
209            "\001\061\001\060\000\000", "\000I\000i\000\000");
210 
211   set_case_fold(ONIGENC_CASE_FOLD_MIN);
212 #endif
213 
214   return r;
215 }
216