1/*
2** string.ops
3*/
4
5BEGIN_OPS_PREAMBLE
6
7#include "parrot/encoding.h"
8#include "parrot/namealias.h"
9#if PARROT_HAS_ICU
10#  include <unicode/uchar.h>
11#endif
12
13END_OPS_PREAMBLE
14
15=head1 NAME
16
17string.ops - String Opcodes
18
19=head1 DESCRIPTION
20
21Operations that work on strings, whether constructing, modifying
22or examining them.
23
24When making changes to any ops file, run C<make bootstrap-ops> to regenerate
25all generated ops files.
26
27=over 4
28
29=cut
30
31
32=item B<ord>(out INT, in STR)
33
34The codepoint in the current character set of the first character of string $2
35is returned in integer $1.
36If $2 is empty, an exception is thrown.
37
38=item B<ord>(out INT, in STR, in INT)
39
40The codepoint in the current character set of the character at integer index $3
41of string $2 is returned in integer $1.
42If $2 is empty, an exception is thrown.
43If $3 is greater than the length of $2, an exception is thrown.
If $3 is less than zero but greater than the negative of the length of $2,
45counts backwards through $2, such that -1 is the last character,
46-2 is the second-to-last character, and so on.
47If $3 is less than the negative of the length of $2, an exception is thrown.
48
49=cut
50
51inline op ord(out INT, in STR)  {
52    $1 = STRING_ord(interp, $2, 0);
53}
54
55inline op ord(out INT, in STR, in INT)  {
56    $1 = STRING_ord(interp, $2, $3);
57}
58
59
60=item B<chr>(out STR, in INT)
61
62The character specified by codepoint integer $2 is returned in string $1.
63
For characters > 0xff a utf8-encoded string is returned,
for characters between 0x7f and 0xff a latin1-encoded string is returned,
and for characters below 0x7f an ascii-encoded string is returned.
67
68=cut
69
70inline op chr(out STR, in INT)  {
71    STRING * const s = Parrot_str_chr(interp, (UINTVAL)$2);
72    $1 = s;
73}
74
75
76=item B<chopn>(out STR, in STR, in INT)
77
78Remove n characters specified by integer $3 from the tail of string $2,
79and returns the characters not chopped in string $1.
80If $3 is negative, cut the string after -$3 characters.
81
82=cut
83
84inline op chopn(out STR, in STR, in INT)  {
85    $1 = Parrot_str_chopn(interp, $2, $3);
86}
87
88
89=item B<concat>(invar PMC, in STR)
90
91=item B<concat>(invar PMC, invar PMC)
92
93Modify string $1 in place, appending string $2.
94
95=item B<concat>(out STR, in STR, in STR)
96
97=item B<concat>(invar PMC, invar PMC, in STR)
98
99=item B<concat>(invar PMC, invar PMC, invar PMC)
100
101Append string $3 to string $2 and place the result into string $1.
102
103=cut
104
105inline op concat(invar PMC, invar PMC)  {
106    VTABLE_i_concatenate(interp, $1, $2);
107}
108
109inline op concat(invar PMC, in STR)  {
110    VTABLE_i_concatenate_str(interp, $1, $2);
111}
112
113inline op concat(out STR, in STR, in STR) :base_mem {
114    $1 = Parrot_str_concat(interp, $2, $3);
115}
116
117inline op concat(invar PMC, invar PMC, in STR)  {
118    $1 = VTABLE_concatenate_str(interp, $2, $3, $1);
119}
120
121inline op concat(invar PMC, invar PMC, invar PMC)  {
122    $1 = VTABLE_concatenate(interp, $2, $3, $1);
123}
124
125=item B<repeat>(out STR, in STR, in INT)
126
127=item B<repeat>(invar PMC, invar PMC, in INT)
128
129=item B<repeat>(invar PMC, invar PMC, invar PMC)
130
131Repeat string $2 integer $3 times and return result in string $1.
132The C<PMC> versions are MMD operations.
133
134=cut
135
136inline op repeat(out STR, in STR, in INT) :base_mem {
137    if ($3 < 0) {
138        opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL,
139                EXCEPTION_NEG_REPEAT,
140                "Cannot repeat with negative arg");
141        goto ADDRESS(handler);
142    }
143    $1 = Parrot_str_repeat(interp, $2, (UINTVAL)$3);
144}
145
146inline op repeat(invar PMC, invar PMC, in INT)  {
147    if ($3 < 0) {
148        opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL,
149                EXCEPTION_NEG_REPEAT,
150                "Cannot repeat with negative arg");
151        goto ADDRESS(handler);
152    }
153    $1 = VTABLE_repeat_int(interp, $2, $3, $1);
154}
155
156inline op repeat(invar PMC, invar PMC, invar PMC)  {
157    if (VTABLE_get_integer(interp, $3) < 0) {
158        opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL,
159                EXCEPTION_NEG_REPEAT,
160                "Cannot repeat with negative arg");
161        goto ADDRESS(handler);
162    }
163    $1 = VTABLE_repeat(interp, $2, $3, $1);
164}
165
166=item B<repeat>(invar PMC, in INT)
167
168=item B<repeat>(invar PMC, invar PMC)
169
Repeat string $1 integer $2 times and store the result back in string $1.
171The C<PMC> versions are MMD operations.
172
173=cut
174
175inline op repeat(invar PMC, in INT)  {
176    VTABLE_i_repeat_int(interp, $1, $2);
177}
178
179inline op repeat(invar PMC, invar PMC)  {
180    VTABLE_i_repeat(interp, $1, $2);
181}
182
183
184=item B<length>(out INT, in STR)
185
186Calculate the length (in characters) of string $2 and return as integer $1.
187If $2 is NULL or zero length, zero is returned.
188
189=item B<bytelength>(out INT, in STR)
190
191Calculate the length (in bytes) of string $2 and return as integer $1.
192If $2 is NULL or zero length, zero is returned.
193
194=cut
195
196inline op length(out INT, in STR) :base_mem {
197    $1 = Parrot_str_length(interp, $2);
198}
199
200inline op bytelength(out INT, in STR) :base_mem {
201    $1 = Parrot_str_byte_length(interp, $2);
202}
203
204
205=item B<pin>(inout STR)
206
207Make the memory in string $1 immobile. This memory will I<not> be moved
208by the Garbage Collector, and may be safely passed to external libraries.
209(Well, as long as they don't free it) Pinning a string will move the contents.
210
211$1 should be unpinned if it is used after pinning is no longer necessary.
212
213=cut
214
215op pin(inout STR) :base_mem {
216    Parrot_str_pin(interp, $1);
217}
218
219
220=item B<unpin>(inout STR)
221
222Make the memory in string $1 movable again.
223This will make the memory in $1 move.
224
225=cut
226
227op unpin(inout STR) :base_mem {
228    Parrot_str_unpin(interp, $1);
229}
230
231
232=item B<substr>(out STR, in STR, in INT)
233
234=item B<substr>(out STR, in STR, in INT, in INT)
235
236=item B<substr>(out STR, invar PMC, in INT, in INT)
237
238Set $1 to the portion of $2 starting at (zero-based) character position
239$3 and having length $4. If no length ($4) is provided, it is equivalent to
240passing in the length of $2.
241
242=item B<replace>(out STR, in STR, in INT, in INT, in STR)
243
244Replace part of $2 starting from $3 of length $4 with $5. If the length of $5 is
245different from the length specified in $4, then $2 will grow or shrink
246accordingly. If $3 is one character position larger than the length of
247$2, then $5 is appended to $2 (and the empty string is returned);
248this is essentially the same as
249
250  concat $2, $5
251
252Finally, if $3 is negative, then it is taken to count backwards from
253the end of the string (ie an offset of -1 corresponds to the last
254character).
255
256New $1 string returned.
257
258=cut
259
260inline op substr(out STR, in STR, in INT)  {
261    const INTVAL len = Parrot_str_byte_length(interp, $2);
262    $1 = STRING_substr(interp, $2, $3, len);
263}
264
265inline op substr(out STR, in STR, in INT, in INT)  {
266    $1 = STRING_substr(interp, $2, $3, $4);
267}
268
269inline op substr(out STR, invar PMC, in INT, in INT)  {
270    $1 = VTABLE_substr(interp, $2, $3, $4);
271}
272
273inline op replace(out STR, in STR, in INT, in INT, in STR)  {
274    $1 = Parrot_str_replace(interp, $2, $3, $4, $5);
275}
276
277
278=item B<index>(out INT, in STR, in STR)
279
280=item B<index>(out INT, in STR, in STR, in INT)
281
282The B<index> function searches for a substring within target string, but
283without the wildcard-like behavior of a full regular-expression pattern match.
284It returns the position of the first occurrence of substring $3
285in target string $2 at or after zero-based position $4.
286If $4 is omitted, B<index> starts searching from the beginning of the string.
287The return value is based at "0".
288If the string is null, or the substring is not found or is null,
289B<index> returns "-1".
290
291=item B<rindex>(out INT, in STR, in STR)
292
293=item B<rindex>(out INT, in STR, in STR, in INT)
294
Search the string for the last instance of the substring from the end. If
$4 is provided, a match will not be found after $4.
297
298=cut
299
300inline op index(out INT, in STR, in STR)  {
301    $1 = ($2 && $3) ? STRING_index(interp, $2, $3, 0) : -1;
302}
303
304inline op index(out INT, in STR, in STR, in INT)  {
305    $1 = ($2 && $3) ? STRING_index(interp, $2, $3, $4) : -1;
306}
307
308inline op rindex(out INT, in STR, in STR) {
309    STRING * const str = $2;
310    $1 = Parrot_str_find_reverse_index(interp, str, $3, STRING_length(str));
311}
312
313inline op rindex(out INT, in STR, in STR, in INT) {
314    STRING * const str = $2;
315    $1 = Parrot_str_find_reverse_index(interp, str, $3, $4);
316}
317
318=item B<sprintf>(out STR, in STR, invar PMC)
319
320=item B<sprintf>(out PMC, invar PMC, invar PMC)
321
322Sets $1 to the result of calling C<Parrot_psprintf> with the
323given format ($2) and arguments ($3, which should be an ordered
324aggregate PMC).
325
326The result is quite similar to using the system C<sprintf>, but is
327protected against buffer overflows and the like.  There are some
328differences, especially concerning sizes (which are largely ignored);
329see F<misc.c> for details.
330
331=cut
332
333inline op sprintf(out STR, in STR, invar PMC)  {
334    $1=Parrot_psprintf(interp, $2, $3);
335}
336
337inline op sprintf(out PMC, invar PMC, invar PMC)  {
338    VTABLE_set_string_native(interp, $1,
339        Parrot_psprintf(interp, VTABLE_get_string(interp, $2), $3));
340}
341
342
343=item B<new>(out STR)
344
345Allocate a new empty string.
346
347=cut
348
349inline op new(out STR) :base_mem {
350    $1 = Parrot_str_new_noinit(interp, 0);
351}
352
353
354=item B<stringinfo>(out INT, in STR, in INT)
355
356Extract some information about string $2 and store it in $1.
357If a null string is passed, $1 is always set to 0.
358If an invalid $3 is passed, an exception is thrown.
359Possible values for $3 are:
360
361=over 4
362
363=item 1 The location of the string buffer header.
364
365=item 2 The location of the start of the string.
366
367=item 3 The length of the string buffer (in bytes).
368
369=item 4 The flags attached to the string (if any).
370
371=item 5 The amount of the string buffer used (in bytes).
372
373=item 6 The length of the string (in characters).
374
375=back
376
377=cut
378
379inline op stringinfo(out INT, in STR, in INT)  {
380    if ($2 == NULL)
381        $1 = 0;
382    else {
383        switch ($3) {
384          case STRINGINFO_HEADER:
385            $1 = PTR2UINTVAL($2);
386            break;
387          case STRINGINFO_STRSTART:
388            $1 = PTR2UINTVAL($2->strstart);
389            break;
390          case STRINGINFO_BUFLEN:
391            $1 = Buffer_buflen($2);
392            break;
393          case STRINGINFO_FLAGS:
394            $1 = PObj_get_FLAGS($2);
395            break;
396          case STRINGINFO_BUFUSED:
397            $1 = $2->bufused;
398            break;
399          case STRINGINFO_STRLEN:
400            $1 = $2->strlen;
401            break;
402          default:
403            {
404                opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL,
405                    EXCEPTION_INVALID_OPERATION,
406                    "stringinfo: unknown info type: %d", $3);
407                goto ADDRESS(handler);
408            }
409        }
410    }
411}
412
413
414=item B<upcase>(out STR, in STR)
415
416Uppercase $2 and put the result in $1
417
418=cut
419
420inline op upcase(out STR, in STR)  {
421    $1 = Parrot_str_upcase(interp, $2);
422}
423
424=item B<downcase>(out STR, in STR)
425
426Downcase $2 and put the result in $1
427
428=cut
429
430inline op downcase(out STR, in STR)  {
431    $1 = Parrot_str_downcase(interp, $2);
432}
433
434=item B<titlecase>(out STR, in STR)
435
436Titlecase $2 and put the result in $1
437
438=cut
439
440inline op titlecase(out STR, in STR)  {
441    $1 = Parrot_str_titlecase(interp, $2);
442}
443
444
445=item B<join>(out STR, in STR, invar PMC)
446
447Create a new string $1 by joining array elements from array $3
448with string $2.
449
450=item B<split>(out PMC, in STR, in STR)
451
452Create a new Array PMC $1 by splitting the string $3 into pieces
453delimited by the string $2. If $2 does not appear in $3, then return $3
454as the sole element of the Array PMC. Will return empty strings for
455delimiters at the beginning and end of $3
456
457Note: the string $2 is just a string. If you want a perl-ish split
458on regular expression, use C<PGE::Util>'s split from the standard library.
459
460=cut
461
462op join(out STR, in STR, invar PMC)  {
463    $1 = Parrot_str_join(interp, $2, $3);
464}
465
466op split(out PMC, in STR, in STR)  {
467    $1 = Parrot_str_split(interp, $2, $3);
468}
469
470
471=item B<encoding>(out INT, in STR)
472
473Return the encoding number $1 of string $2.
474
475=item B<encodingname>(out STR, in INT)
476
477Return the name $1 of encoding number $2.
478If encoding number $2 is not found, name $1 is set to null.
479
480=item B<find_encoding>(out INT, in STR)
481
482Return the encoding number of the encoding named $2. If the encoding doesn't
483exist, throw an exception.
484
485=item B<trans_encoding>(out STR, in STR, in INT)
486
487Create a string $1 from $2 with the specified encoding.
488
489Both functions may throw an exception on information loss.
490
491=cut
492
493op encoding(out INT, in STR)  {
494    $1 = Parrot_encoding_number_of_str(interp, $2);
495}
496
497op encodingname(out STR, in INT)  {
498    $1 = Parrot_encoding_name(interp, $2);
499}
500
501op find_encoding(out INT, in STR)  {
502    const INTVAL n = Parrot_encoding_number(interp, $2);
503    if (n < 0) {
504        opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL,
505                EXCEPTION_INVALID_ENCODING,
506                "encoding '%Ss' not found", $2);
507        goto ADDRESS(handler);
508    }
509    $1 = n;
510}
511
512op trans_encoding(out STR, in STR, in INT) {
513    $1 = Parrot_str_change_encoding(interp, $2, $3);
514}
515
516
517=item B<is_cclass>(out INT, in INT, in STR, in INT)
518
519Set $1 to 1 if the codepoint of $3 at position $4 is in
520the character class(es) given by $2.
521
522=cut
523
524inline op is_cclass(out INT, in INT, in STR, in INT) {
525    $1 = Parrot_str_is_cclass(interp, $2, $3, $4);
526}
527
528
529=item B<find_cclass>(out INT, in INT, in STR, in INT, in INT)
530
531Set $1 to the offset of the first codepoint matching
532the character class(es) given by $2 in string $3, starting
533at offset $4 for up to $5 codepoints.  If no matching
534character is found, set $1 to (offset + count).
535
536=cut
537
538inline op find_cclass(out INT, in INT, in STR, in INT, in INT) {
539    $1 = Parrot_str_find_cclass(interp, $2, $3, $4, $5);
540}
541
542
543=item B<find_not_cclass>(out INT, in INT, in STR, in INT, in INT)
544
545Set $1 to the offset of the first codepoint not matching
546the character class(es) given by $2 in string $3, starting
547at offset $4 for up to $5 codepoints.  If the substring
548consists entirely of matching characters, set $1 to (offset + count).
549
550=cut
551
552inline op find_not_cclass(out INT, in INT, in STR, in INT, in INT) {
553    $1 = Parrot_str_find_not_cclass(interp, $2, $3, $4, $5);
554}
555
556
557=item B<escape>(out STR, invar STR)
558
559Escape all non-ascii chars to backslashed escape sequences. A
560string with charset I<ascii> is created as result.
561
562=item B<compose>(out STR, in STR)
563
564Compose (normalize) a string.
565
566=cut
567
568op escape(out STR, invar STR) {
569    $1 = Parrot_str_escape(interp, $2);
570}
571
572op compose(out STR, in STR) {
573    $1 = Parrot_str_compose(interp, $2);
574}
575
576
577=item B<find_codepoint>(out INT, in STR)
578
579Set $1 to the codepoint with the name given in $2, or -1 if there is none.
580
With ICU, many more name aliases are found; without ICU, currently only
the names of control characters are found.
583
584=cut
585
586op find_codepoint(out INT, in STR) {
587    $1 = Parrot_str_internal_find_codepoint(interp, $2);
588}
589
590=back
591
592=head1 COPYRIGHT
593
594Copyright (C) 2001-2011, Parrot Foundation.
595
596=head1 LICENSE
597
598This program is free software. It is subject to the same license
599as the Parrot interpreter itself.
600
601=cut
602
603/*
604 * Local variables:
605 *   c-file-style: "parrot"
606 * End:
607 * vim: expandtab shiftwidth=4 cinoptions='\:2=2' :
608 */
609