1 /*
2 This file is part of GNU APL, a free implementation of the
3 ISO/IEC Standard 13751, "Programming Language APL, Extended"
4
5 Copyright (C) 2017 Elias Mårtenson
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "PointerCell.hh"
22 #include "Quad_RE.hh"
23 #include "Workspace.hh"
24
25 Quad_RE Quad_RE::_fun;
26 Quad_RE * Quad_RE::fun = &Quad_RE::_fun;
27
28 #include "../config.h" // for HAVE_LIBPCRE2_32 (from ./configure)
29
30 #ifndef HAVE_LIBPCRE2_32
31
32 //-----------------------------------------------------------------------------
33 Token
eval_AXB(Value_P A,Value_P X,Value_P B)34 Quad_RE::eval_AXB(Value_P A, Value_P X, Value_P B)
35 {
36 MORE_ERROR() <<
37 "⎕RE is not available because either no libpcre2 library was found on this\n"
38 "system when GNU APL was compiled, or because it was disabled in ./configure.";
39
40 SYNTAX_ERROR;
41 return Token();
42 }
43
44 #else // HAVE_LIBPCRE2_32
45
46 # include "Regexp.hh"
47
48 //-----------------------------------------------------------------------------
Flags(const UCS_string & flags_string)49 Quad_RE::Flags::Flags(const UCS_string & flags_string)
50 : flags(0),
51 error_on_no_match(false),
52 global(false),
53 result_type(RT_string)
54 {
55 int ofcnt = 0;
56 loop (f, flags_string.size())
57 {
58 const Unicode uni = flags_string[f];
59 switch(uni)
60 {
61 case UNI_SUBSET: result_type = RT_partition; ++ofcnt; break;
62 case UNI_DOWN_ARROW: result_type = RT_pos_len; ++ofcnt; break;
63 case UNI_ASCII_SLASH: result_type = RT_reduce; ++ofcnt; break;
64 case UNI_ASCII_g: global = true; break;
65 case UNI_ASCII_E: error_on_no_match = true; break;
66 case UNI_ASCII_i: flags |= PCRE2_CASELESS; break;
67 case UNI_ASCII_m: flags |= PCRE2_MULTILINE; break;
68 case UNI_ASCII_s: flags |= PCRE2_DOTALL; break;
69 case UNI_ASCII_x: flags |= PCRE2_EXTENDED; break;
70 default:
71 MORE_ERROR() << "Unknown ⎕RE flag: '" << UCS_string(1, uni)
72 << "'. Valid flags are: Eimsx⊂↓/";
73 DOMAIN_ERROR;
74 }
75 }
76
77 if (ofcnt > 1)
78 {
79 MORE_ERROR() << "Multiple ⎕RE output flags: '" << flags_string
80 << "'. The ⎕RE output flags are: ⊂↓/";
81 DOMAIN_ERROR;
82 }
83 }
84 //-----------------------------------------------------------------------------
85 static Value_P
deep_value(int idx,const PCRE2_SIZE * ovector,int count,const int * parents,const int * child_count,const UCS_string * B)86 deep_value(int idx, const PCRE2_SIZE * ovector, int count, const int * parents,
87 const int * child_count, const UCS_string * B)
88 {
89 if (child_count[idx] == 0) // simple RE (no sub-REs)
90 {
91 const PCRE2_SIZE start = ovector[2*idx];
92 const PCRE2_SIZE end = ovector[2*idx + 1];
93 if (B) // string
94 {
95 const UCS_string item(*B, start, end - start);
96 Value_P Z(item, LOC);
97 Z->check_value(LOC);
98 return Z;
99 }
100 else // pos+len
101 {
102 Value_P Z(2, LOC);
103 new (Z->next_ravel()) IntCell(start);
104 new (Z->next_ravel()) IntCell(end - start);
105 Z->check_value(LOC);
106 return Z;
107 }
108 }
109
110 ShapeItem ini = B ? 1 : 2;
111 Value_P Z(ini + child_count[idx], LOC);
112 if (B)
113 {
114 const PCRE2_SIZE start = ovector[2*idx];
115 const PCRE2_SIZE end = ovector[2*idx + 1];
116 const UCS_string item(*B, start, end - start);
117 Value_P sub_value(item, LOC);
118 sub_value->check_value(LOC);
119 new (Z->next_ravel()) PointerCell(sub_value.get(), Z.getref());
120 }
121 else
122 {
123 new (Z->next_ravel()) IntCell(ovector[2*idx]);
124 new (Z->next_ravel()) IntCell(ovector[2*idx + 1] - ovector[2*idx]);
125 }
126
127 loop(ch, count)
128 {
129 if (parents[ch] != idx) continue; // ch is not a child of idx
130 Value_P CH = deep_value(ch, ovector, count, parents, child_count, B);
131 new (Z->next_ravel()) PointerCell(CH.get(), Z.getref());
132 }
133
134 Z->check_value(LOC);
135 return Z;
136 }
137 //-----------------------------------------------------------------------------
138 Value_P
regex_results(const Regexp & A,const Flags & X,const UCS_string & B)139 Quad_RE::regex_results(const Regexp & A, const Flags & X, const UCS_string & B)
140 {
141 ShapeItem B_offset = 0;
142 if (X.get_result_type() == RT_partition || X.get_result_type() == RT_reduce)
143 return partition_result(A, X, B);
144
145 if (!X.get_global()) // first match only
146 {
147 switch(X.get_result_type())
148 {
149 case RT_string: return string_result (A, X, B, B_offset);
150 case RT_pos_len: return index_result (A, X, B, B_offset);
151 default: FIXME;
152 }
153
154 // not reached
155 FIXME;
156 }
157
158 ShapeItem cells_used = 0;
159 ShapeItem cells_allocated = SHORT_VALUE_LENGTH_WANTED;
160 Value_P Z(cells_allocated, LOC);
161 for (;;)
162 {
163 Value_P ZZ;
164 switch(X.get_result_type())
165 {
166 case RT_string:
167 ZZ = string_result(A, X, B, B_offset);
168 break;
169
170 case RT_pos_len:
171 ZZ = index_result(A, X, B, B_offset);
172 break;
173
174 default: FIXME;
175 }
176
177 if (B_offset == -1) break; // no more matches
178 if (cells_used >= cells_allocated) // Z full
179 {
180 cells_allocated += cells_allocated;
181 Value_P Z2(cells_allocated, LOC);
182 loop(u, cells_used)
183 {
184 Cell & cell = Z->get_ravel(u);
185 new (Z2->next_ravel())
186 PointerCell(cell.get_pointer_value().get(), Z2.getref());
187 cell.release(LOC);
188 }
189 Z = Z2;
190 }
191
192 new (Z->next_ravel()) PointerCell(ZZ.get(), Z.getref());
193 ++cells_used;
194 }
195
196 Z->set_shape(Shape(cells_used));
197 Z->check_value(LOC);
198 return Z;
199 }
200 //-----------------------------------------------------------------------------
201 Value_P
partition_result(const Regexp & A,const Flags & X,const UCS_string & B)202 Quad_RE::partition_result(const Regexp & A, const Flags & X,
203 const UCS_string & B)
204 {
205 const PCRE2_SIZE len = B.size();
206 Value_P Z(len, LOC);
207 loop(z, len) new (Z->next_ravel()) IntCell(0);
208
209 PCRE2_SIZE B_offset = 0;
210 const int match_id_inc = X.get_result_type() == RT_partition ? 1 : 0;
211 for (ShapeItem match_id = 1; B_offset < len; match_id += match_id_inc)
212 {
213 RegexpMatch rem(A.get_code(), B, B_offset);
214 if (!rem.is_match()) break;
215
216 const PCRE2_SIZE * ovector = rem.get_ovector();
217 const PCRE2_SIZE start = ovector[0];
218 const PCRE2_SIZE end = ovector[1];
219 for (PCRE2_SIZE b = start; b < end; ++b)
220 new (&Z->get_ravel(b)) IntCell(match_id);
221
222 if (!X.get_global()) break;
223 B_offset = end;
224 }
225
226 Z->check_value(LOC);
227 return Z;
228 }
229 //-----------------------------------------------------------------------------
230 Value_P
string_result(const Regexp & A,const Flags & X,const UCS_string & B,ShapeItem & B_offset)231 Quad_RE::string_result(const Regexp & A, const Flags & X,
232 const UCS_string & B, ShapeItem & B_offset)
233 {
234 RegexpMatch rem(A.get_code(), B, B_offset);
235 if (!rem.is_match())
236 {
237 B_offset = -1;
238 if (X.get_global()) return Value_P();
239 if (!X.get_error_on_no_match()) return Idx0(LOC);
240 MORE_ERROR() << "No match";
241 DOMAIN_ERROR;
242 }
243
244 // rem is a match
245 //
246 const PCRE2_SIZE * ovector = rem.get_ovector();
247 B_offset = ovector[1];
248 if (0 && rem.num_matches() == 1) // simple match
249 {
250 // single string
251 //
252 Value_P Z(2, LOC);
253 const PCRE2_SIZE start = ovector[0];
254 const PCRE2_SIZE end = ovector[1];
255 new (Z->next_ravel()) IntCell(start);
256 new (Z->next_ravel()) IntCell(end - start);
257 Z->check_value(LOC);
258 return Z;
259 }
260
261 const uint32_t ovector_count = rem.get_ovector_count();
262 B_offset = ovector[1];
263 int parents[ovector_count];
264 int ccount[ovector_count];
265
266 loop(o, ovector_count)
267 {
268 parents[o] = -1;
269 ccount[o] = 0;
270 }
271
272 for (int o = ovector_count - 1; o >= 0; --o)
273 {
274 const PCRE2_SIZE ostart = ovector[2*o];
275 for (int p = o - 1; p >= 0; --p)
276 {
277 if (ovector[2*p] <= ostart && ostart < ovector[2*p + 1])
278 {
279 parents[o] = p;
280 ++ccount[p];
281 break;
282 }
283 }
284 }
285
286 return deep_value(0, ovector, ovector_count, parents, ccount, &B);
287 }
288 //-----------------------------------------------------------------------------
289 Value_P
index_result(const Regexp & A,const Flags & X,const UCS_string & B,ShapeItem & B_offset)290 Quad_RE::index_result(const Regexp & A, const Flags & X,
291 const UCS_string & B, ShapeItem & B_offset)
292 {
293 RegexpMatch rem(A.get_code(), B, B_offset);
294 if (!rem.is_match())
295 {
296 B_offset = -1;
297 if (X.get_global()) return Value_P();
298 if (!X.get_error_on_no_match()) return Idx0(LOC);
299 MORE_ERROR() << "No match";
300 DOMAIN_ERROR;
301 }
302
303 // rem is a match
304 //
305 const PCRE2_SIZE * ovector = rem.get_ovector();
306 B_offset = ovector[1];
307 if (rem.num_matches() == 1) // simple match
308 {
309 // single string
310 //
311 Value_P Z(2, LOC);
312 const PCRE2_SIZE start = ovector[0];
313 const PCRE2_SIZE end = ovector[1];
314 new (Z->next_ravel()) IntCell(start);
315 new (Z->next_ravel()) IntCell(end - start);
316 Z->check_value(LOC);
317 return Z;
318 }
319
320 const uint32_t ovector_count = rem.get_ovector_count();
321 int parents[ovector_count];
322 B_offset = ovector[1];
323 int ccount[ovector_count];
324
325 loop(o, ovector_count)
326 {
327 parents[o] = -1;
328 ccount[o] = 0;
329 }
330
331 for (int o = ovector_count - 1; o >= 0; --o)
332 {
333 const PCRE2_SIZE ostart = ovector[2*o];
334 for (int p = o - 1; p >= 0; --p)
335 {
336 if (ovector[2*p] <= ostart && ostart < ovector[2*p + 1])
337 {
338 parents[o] = p;
339 ++ccount[p];
340 break;
341 }
342 }
343 }
344
345 return deep_value(0, ovector, ovector_count, parents, ccount, 0);
346 }
347 //-----------------------------------------------------------------------------
348 Token
eval_AXB(Value_P A,Value_P X,Value_P B)349 Quad_RE::eval_AXB(Value_P A, Value_P X, Value_P B)
350 {
351 if (A->get_rank() > 1) RANK_ERROR;
352 if (X->get_rank() > 1) RANK_ERROR;
353
354 if (!A->is_char_string())
355 {
356 MORE_ERROR() << "left ⎕RE arguments must be a string value";
357 DOMAIN_ERROR;
358 }
359
360 Flags flags(X->get_UCS_ravel());
361 Regexp regexp(A->get_UCS_ravel(), flags.get_compflags());
362
363 const Shape & shape = B->get_shape();
364 if (shape.get_rank() == 0)
365 return Token(TOK_APL_VALUE1, Idx0(LOC));
366
367 if (B->is_char_string())
368 {
369 Value_P Z = regex_results(regexp, flags, B->get_UCS_ravel());
370 Z->check_value(LOC);
371 return Token(TOK_APL_VALUE1, Z);
372 }
373
374 Value_P Z(shape, LOC);
375 for (ShapeItem i = 0 ; i < shape.get_volume() ; i++)
376 {
377 const Cell & cell = B->get_ravel(i);
378 Value_P B_sub = cell.to_value(LOC);
379 if (!B_sub->is_char_string())
380 {
381 MORE_ERROR() << "Cell does not contain a string";
382 DOMAIN_ERROR;
383 }
384
385 Value_P Z_sub = regex_results(regexp, flags, B_sub->get_UCS_ravel());
386 Z_sub->check_value(LOC);
387 new (Z->next_ravel()) PointerCell(Z_sub.get(), Z.getref());
388 }
389
390 Z->check_value(LOC);
391 return Token(TOK_APL_VALUE1, Z);
392 }
393 //-----------------------------------------------------------------------------
394
395 #endif
396