1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2017 Elias Mårtenson
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include "PointerCell.hh"
22 #include "Quad_RE.hh"
23 #include "Workspace.hh"
24 
25 Quad_RE Quad_RE::_fun;
26 Quad_RE * Quad_RE::fun = &Quad_RE::_fun;
27 
28 #include "../config.h"   // for HAVE_LIBPCRE2_32 (from ./configure)
29 
30 #ifndef HAVE_LIBPCRE2_32
31 
32 //-----------------------------------------------------------------------------
33 Token
eval_AXB(Value_P A,Value_P X,Value_P B)34 Quad_RE::eval_AXB(Value_P A, Value_P X, Value_P B)
35 {
36   MORE_ERROR() <<
37 "⎕RE is not available because either no libpcre2 library was found on this\n"
38 "system when GNU APL was compiled, or because it was disabled in ./configure.";
39 
40    SYNTAX_ERROR;
41    return Token();
42 }
43 
44 #else // HAVE_LIBPCRE2_32
45 
46 # include "Regexp.hh"
47 
48 //-----------------------------------------------------------------------------
Flags(const UCS_string & flags_string)49 Quad_RE::Flags::Flags(const UCS_string & flags_string)
50    : flags(0),
51      error_on_no_match(false),
52      global(false),
53      result_type(RT_string)
54 {
55 int ofcnt = 0;
56    loop (f, flags_string.size())
57       {
58         const Unicode uni = flags_string[f];
59         switch(uni)
60            {
61              case UNI_SUBSET:      result_type = RT_partition;  ++ofcnt;  break;
62              case UNI_DOWN_ARROW:  result_type = RT_pos_len;    ++ofcnt;  break;
63              case UNI_ASCII_SLASH: result_type = RT_reduce;     ++ofcnt;  break;
64              case UNI_ASCII_g:     global = true;                         break;
65              case UNI_ASCII_E:     error_on_no_match = true;              break;
66              case UNI_ASCII_i:     flags |= PCRE2_CASELESS;               break;
67              case UNI_ASCII_m:     flags |= PCRE2_MULTILINE;              break;
68              case UNI_ASCII_s:     flags |= PCRE2_DOTALL;                 break;
69              case UNI_ASCII_x:     flags |= PCRE2_EXTENDED;               break;
70              default:
71                 MORE_ERROR() << "Unknown ⎕RE flag: '" << UCS_string(1, uni)
72                              << "'. Valid flags are: Eimsx⊂↓/";
73                 DOMAIN_ERROR;
74            }
75      }
76 
77    if (ofcnt > 1)
78       {
79         MORE_ERROR() << "Multiple ⎕RE output flags: '" << flags_string
80                      << "'. The ⎕RE output flags are: ⊂↓/";
81         DOMAIN_ERROR;
82       }
83 }
84 //-----------------------------------------------------------------------------
85 static Value_P
deep_value(int idx,const PCRE2_SIZE * ovector,int count,const int * parents,const int * child_count,const UCS_string * B)86 deep_value(int idx, const PCRE2_SIZE * ovector, int count, const int * parents,
87            const int * child_count, const UCS_string * B)
88 {
89    if (child_count[idx] == 0)   // simple RE (no sub-REs)
90       {
91         const PCRE2_SIZE start = ovector[2*idx];
92         const PCRE2_SIZE end   = ovector[2*idx + 1];
93         if (B)   // string
94            {
95              const UCS_string item(*B, start, end - start);
96              Value_P Z(item, LOC);
97              Z->check_value(LOC);
98              return Z;
99            }
100         else     // pos+len
101            {
102              Value_P Z(2, LOC);
103              new (Z->next_ravel())   IntCell(start);
104              new (Z->next_ravel())   IntCell(end - start);
105              Z->check_value(LOC);
106              return Z;
107            }
108       }
109 
110 ShapeItem ini = B ? 1 : 2;
111 Value_P Z(ini + child_count[idx], LOC);
112    if (B)
113       {
114         const PCRE2_SIZE start = ovector[2*idx];
115         const PCRE2_SIZE end = ovector[2*idx + 1];
116         const UCS_string item(*B, start, end - start);
117         Value_P sub_value(item, LOC);
118         sub_value->check_value(LOC);
119         new (Z->next_ravel())   PointerCell(sub_value.get(), Z.getref());
120       }
121    else
122       {
123         new (Z->next_ravel())   IntCell(ovector[2*idx]);
124         new (Z->next_ravel())   IntCell(ovector[2*idx + 1] - ovector[2*idx]);
125       }
126 
127    loop(ch, count)
128        {
129          if (parents[ch] != idx)   continue;   // ch is not a child of idx
130          Value_P CH = deep_value(ch, ovector, count, parents, child_count, B);
131          new (Z->next_ravel())   PointerCell(CH.get(), Z.getref());
132        }
133 
134    Z->check_value(LOC);
135    return Z;
136 }
137 //-----------------------------------------------------------------------------
138 Value_P
regex_results(const Regexp & A,const Flags & X,const UCS_string & B)139 Quad_RE::regex_results(const Regexp & A, const Flags & X, const UCS_string & B)
140 {
141 ShapeItem B_offset = 0;
142    if (X.get_result_type() == RT_partition || X.get_result_type() == RT_reduce)
143       return partition_result(A, X, B);
144 
145    if (!X.get_global())   // first match only
146       {
147         switch(X.get_result_type())
148            {
149              case RT_string:    return string_result   (A, X, B, B_offset);
150              case RT_pos_len:   return index_result    (A, X, B, B_offset);
151              default:           FIXME;
152            }
153 
154         // not reached
155         FIXME;
156       }
157 
158 ShapeItem cells_used = 0;
159 ShapeItem cells_allocated = SHORT_VALUE_LENGTH_WANTED;
160 Value_P Z(cells_allocated, LOC);
161    for (;;)
162        {
163          Value_P ZZ;
164          switch(X.get_result_type())
165             {
166               case RT_string:
167                    ZZ = string_result(A, X, B, B_offset);
168                    break;
169 
170               case RT_pos_len:
171                    ZZ = index_result(A, X, B, B_offset);
172                    break;
173 
174               default:           FIXME;
175             }
176 
177          if (B_offset == -1)   break;   // no more matches
178          if (cells_used >= cells_allocated)   // Z full
179             {
180               cells_allocated += cells_allocated;
181               Value_P Z2(cells_allocated, LOC);
182               loop(u, cells_used)
183                   {
184                     Cell & cell = Z->get_ravel(u);
185                     new (Z2->next_ravel())
186                         PointerCell(cell.get_pointer_value().get(), Z2.getref());
187                     cell.release(LOC);
188                   }
189               Z = Z2;
190             }
191 
192          new (Z->next_ravel())   PointerCell(ZZ.get(), Z.getref());
193          ++cells_used;
194        }
195 
196    Z->set_shape(Shape(cells_used));
197    Z->check_value(LOC);
198    return Z;
199 }
200 //-----------------------------------------------------------------------------
201 Value_P
partition_result(const Regexp & A,const Flags & X,const UCS_string & B)202 Quad_RE::partition_result(const Regexp & A, const Flags & X,
203                           const UCS_string & B)
204 {
205 const PCRE2_SIZE len = B.size();
206 Value_P Z(len, LOC);
207    loop(z, len)   new (Z->next_ravel())   IntCell(0);
208 
209 PCRE2_SIZE B_offset = 0;
210 const int match_id_inc = X.get_result_type() == RT_partition ? 1 : 0;
211    for (ShapeItem match_id = 1; B_offset < len; match_id += match_id_inc)
212        {
213          RegexpMatch rem(A.get_code(), B, B_offset);
214          if (!rem.is_match())   break;
215 
216          const PCRE2_SIZE * ovector = rem.get_ovector();
217          const PCRE2_SIZE start = ovector[0];
218          const PCRE2_SIZE end   = ovector[1];
219          for (PCRE2_SIZE b = start; b < end; ++b)
220              new (&Z->get_ravel(b))   IntCell(match_id);
221 
222          if (!X.get_global())   break;
223          B_offset = end;
224        }
225 
226    Z->check_value(LOC);
227    return Z;
228 }
229 //-----------------------------------------------------------------------------
230 Value_P
string_result(const Regexp & A,const Flags & X,const UCS_string & B,ShapeItem & B_offset)231 Quad_RE::string_result(const Regexp & A, const Flags & X,
232                        const UCS_string & B, ShapeItem & B_offset)
233 {
234 RegexpMatch rem(A.get_code(), B, B_offset);
235    if (!rem.is_match())
236       {
237         B_offset = -1;
238         if (X.get_global())               return Value_P();
239         if (!X.get_error_on_no_match())   return Idx0(LOC);
240         MORE_ERROR() << "No match";
241         DOMAIN_ERROR;
242       }
243 
244    // rem is a match
245    //
246 const PCRE2_SIZE * ovector = rem.get_ovector();
247    B_offset = ovector[1];
248    if (0 && rem.num_matches() == 1)   // simple match
249       {
250         // single string
251         //
252         Value_P Z(2, LOC);
253         const PCRE2_SIZE start = ovector[0];
254         const PCRE2_SIZE end   = ovector[1];
255         new (Z->next_ravel())   IntCell(start);
256         new (Z->next_ravel())   IntCell(end - start);
257         Z->check_value(LOC);
258         return Z;
259       }
260 
261 const uint32_t ovector_count = rem.get_ovector_count();
262    B_offset = ovector[1];
263 int parents[ovector_count];
264 int ccount[ovector_count];
265 
266    loop(o, ovector_count)
267        {
268          parents[o] = -1;
269          ccount[o]  = 0;
270        }
271 
272    for (int o = ovector_count - 1; o >= 0; --o)
273        {
274          const PCRE2_SIZE ostart = ovector[2*o];
275          for (int p = o - 1; p >= 0; --p)
276              {
277                if (ovector[2*p] <= ostart && ostart < ovector[2*p + 1])
278                   {
279                     parents[o] = p;
280                     ++ccount[p];
281                     break;
282                   }
283              }
284        }
285 
286    return deep_value(0, ovector, ovector_count, parents, ccount, &B);
287 }
288 //-----------------------------------------------------------------------------
289 Value_P
index_result(const Regexp & A,const Flags & X,const UCS_string & B,ShapeItem & B_offset)290 Quad_RE::index_result(const Regexp & A, const Flags & X,
291                       const UCS_string & B, ShapeItem & B_offset)
292 {
293 RegexpMatch rem(A.get_code(), B, B_offset);
294    if (!rem.is_match())
295       {
296         B_offset = -1;
297         if (X.get_global())               return Value_P();
298         if (!X.get_error_on_no_match())   return Idx0(LOC);
299         MORE_ERROR() << "No match";
300         DOMAIN_ERROR;
301       }
302 
303    // rem is a match
304    //
305 const PCRE2_SIZE * ovector = rem.get_ovector();
306    B_offset = ovector[1];
307    if (rem.num_matches() == 1)   // simple match
308       {
309         // single string
310         //
311         Value_P Z(2, LOC);
312         const PCRE2_SIZE start = ovector[0];
313         const PCRE2_SIZE end   = ovector[1];
314         new (Z->next_ravel())   IntCell(start);
315         new (Z->next_ravel())   IntCell(end - start);
316         Z->check_value(LOC);
317         return Z;
318       }
319 
320 const uint32_t ovector_count = rem.get_ovector_count();
321 int parents[ovector_count];
322    B_offset = ovector[1];
323 int ccount[ovector_count];
324 
325    loop(o, ovector_count)
326        {
327          parents[o] = -1;
328          ccount[o]  = 0;
329        }
330 
331    for (int o = ovector_count - 1; o >= 0; --o)
332        {
333          const PCRE2_SIZE ostart = ovector[2*o];
334          for (int p = o - 1; p >= 0; --p)
335              {
336                if (ovector[2*p] <= ostart && ostart < ovector[2*p + 1])
337                   {
338                     parents[o] = p;
339                     ++ccount[p];
340                     break;
341                   }
342              }
343        }
344 
345    return deep_value(0, ovector, ovector_count, parents, ccount, 0);
346 }
347 //-----------------------------------------------------------------------------
348 Token
eval_AXB(Value_P A,Value_P X,Value_P B)349 Quad_RE::eval_AXB(Value_P A, Value_P X, Value_P B)
350 {
351    if (A->get_rank() > 1)   RANK_ERROR;
352    if (X->get_rank() > 1)   RANK_ERROR;
353 
354    if (!A->is_char_string())
355       {
356         MORE_ERROR() << "left ⎕RE arguments must be a string value";
357         DOMAIN_ERROR;
358       }
359 
360 Flags flags(X->get_UCS_ravel());
361 Regexp regexp(A->get_UCS_ravel(), flags.get_compflags());
362 
363 const Shape & shape = B->get_shape();
364     if (shape.get_rank() == 0)
365         return Token(TOK_APL_VALUE1, Idx0(LOC));
366 
367     if (B->is_char_string())
368        {
369          Value_P Z = regex_results(regexp, flags, B->get_UCS_ravel());
370          Z->check_value(LOC);
371          return Token(TOK_APL_VALUE1, Z);
372        }
373 
374 Value_P Z(shape, LOC);
375    for (ShapeItem i = 0 ; i < shape.get_volume() ; i++)
376        {
377          const Cell & cell = B->get_ravel(i);
378          Value_P B_sub = cell.to_value(LOC);
379          if (!B_sub->is_char_string())
380             {
381               MORE_ERROR() << "Cell does not contain a string";
382               DOMAIN_ERROR;
383             }
384 
385          Value_P Z_sub = regex_results(regexp, flags, B_sub->get_UCS_ravel());
386          Z_sub->check_value(LOC);
387          new (Z->next_ravel()) PointerCell(Z_sub.get(), Z.getref());
388        }
389 
390    Z->check_value(LOC);
391    return Token(TOK_APL_VALUE1, Z);
392 }
393 //-----------------------------------------------------------------------------
394 
395 #endif
396