1/* 2** string.ops 3*/ 4 5BEGIN_OPS_PREAMBLE 6 7#include "parrot/encoding.h" 8#include "parrot/namealias.h" 9#if PARROT_HAS_ICU 10# include <unicode/uchar.h> 11#endif 12 13END_OPS_PREAMBLE 14 15=head1 NAME 16 17string.ops - String Opcodes 18 19=head1 DESCRIPTION 20 21Operations that work on strings, whether constructing, modifying 22or examining them. 23 24When making changes to any ops file, run C<make bootstrap-ops> to regenerate 25all generated ops files. 26 27=over 4 28 29=cut 30 31 32=item B<ord>(out INT, in STR) 33 34The codepoint in the current character set of the first character of string $2 35is returned in integer $1. 36If $2 is empty, an exception is thrown. 37 38=item B<ord>(out INT, in STR, in INT) 39 40The codepoint in the current character set of the character at integer index $3 41of string $2 is returned in integer $1. 42If $2 is empty, an exception is thrown. 43If $3 is greater than the length of $2, an exception is thrown. 44If $3 is less then zero but greater than the negative of the length of $2, 45counts backwards through $2, such that -1 is the last character, 46-2 is the second-to-last character, and so on. 47If $3 is less than the negative of the length of $2, an exception is thrown. 48 49=cut 50 51inline op ord(out INT, in STR) { 52 $1 = STRING_ord(interp, $2, 0); 53} 54 55inline op ord(out INT, in STR, in INT) { 56 $1 = STRING_ord(interp, $2, $3); 57} 58 59 60=item B<chr>(out STR, in INT) 61 62The character specified by codepoint integer $2 is returned in string $1. 63 64For characters > 0xff an utf8 encoded string is returned, 65for characters between 0x7f and 0xff a latin1 encoded string is returned, 66for characters below 0x7f an ascii encoded string. 67 68=cut 69 70inline op chr(out STR, in INT) { 71 STRING * const s = Parrot_str_chr(interp, (UINTVAL)$2); 72 $1 = s; 73} 74 75 76=item B<chopn>(out STR, in STR, in INT) 77 78Remove n characters specified by integer $3 from the tail of string $2, 79and returns the characters not chopped in string $1. 80If $3 is negative, cut the string after -$3 characters. 81 82=cut 83 84inline op chopn(out STR, in STR, in INT) { 85 $1 = Parrot_str_chopn(interp, $2, $3); 86} 87 88 89=item B<concat>(invar PMC, in STR) 90 91=item B<concat>(invar PMC, invar PMC) 92 93Modify string $1 in place, appending string $2. 94 95=item B<concat>(out STR, in STR, in STR) 96 97=item B<concat>(invar PMC, invar PMC, in STR) 98 99=item B<concat>(invar PMC, invar PMC, invar PMC) 100 101Append string $3 to string $2 and place the result into string $1. 102 103=cut 104 105inline op concat(invar PMC, invar PMC) { 106 VTABLE_i_concatenate(interp, $1, $2); 107} 108 109inline op concat(invar PMC, in STR) { 110 VTABLE_i_concatenate_str(interp, $1, $2); 111} 112 113inline op concat(out STR, in STR, in STR) :base_mem { 114 $1 = Parrot_str_concat(interp, $2, $3); 115} 116 117inline op concat(invar PMC, invar PMC, in STR) { 118 $1 = VTABLE_concatenate_str(interp, $2, $3, $1); 119} 120 121inline op concat(invar PMC, invar PMC, invar PMC) { 122 $1 = VTABLE_concatenate(interp, $2, $3, $1); 123} 124 125=item B<repeat>(out STR, in STR, in INT) 126 127=item B<repeat>(invar PMC, invar PMC, in INT) 128 129=item B<repeat>(invar PMC, invar PMC, invar PMC) 130 131Repeat string $2 integer $3 times and return result in string $1. 132The C<PMC> versions are MMD operations. 133 134=cut 135 136inline op repeat(out STR, in STR, in INT) :base_mem { 137 if ($3 < 0) { 138 opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL, 139 EXCEPTION_NEG_REPEAT, 140 "Cannot repeat with negative arg"); 141 goto ADDRESS(handler); 142 } 143 $1 = Parrot_str_repeat(interp, $2, (UINTVAL)$3); 144} 145 146inline op repeat(invar PMC, invar PMC, in INT) { 147 if ($3 < 0) { 148 opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL, 149 EXCEPTION_NEG_REPEAT, 150 "Cannot repeat with negative arg"); 151 goto ADDRESS(handler); 152 } 153 $1 = VTABLE_repeat_int(interp, $2, $3, $1); 154} 155 156inline op repeat(invar PMC, invar PMC, invar PMC) { 157 if (VTABLE_get_integer(interp, $3) < 0) { 158 opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL, 159 EXCEPTION_NEG_REPEAT, 160 "Cannot repeat with negative arg"); 161 goto ADDRESS(handler); 162 } 163 $1 = VTABLE_repeat(interp, $2, $3, $1); 164} 165 166=item B<repeat>(invar PMC, in INT) 167 168=item B<repeat>(invar PMC, invar PMC) 169 170Repeat string $1 number $2 times and return result in string $1. 171The C<PMC> versions are MMD operations. 172 173=cut 174 175inline op repeat(invar PMC, in INT) { 176 VTABLE_i_repeat_int(interp, $1, $2); 177} 178 179inline op repeat(invar PMC, invar PMC) { 180 VTABLE_i_repeat(interp, $1, $2); 181} 182 183 184=item B<length>(out INT, in STR) 185 186Calculate the length (in characters) of string $2 and return as integer $1. 187If $2 is NULL or zero length, zero is returned. 188 189=item B<bytelength>(out INT, in STR) 190 191Calculate the length (in bytes) of string $2 and return as integer $1. 192If $2 is NULL or zero length, zero is returned. 193 194=cut 195 196inline op length(out INT, in STR) :base_mem { 197 $1 = Parrot_str_length(interp, $2); 198} 199 200inline op bytelength(out INT, in STR) :base_mem { 201 $1 = Parrot_str_byte_length(interp, $2); 202} 203 204 205=item B<pin>(inout STR) 206 207Make the memory in string $1 immobile. This memory will I<not> be moved 208by the Garbage Collector, and may be safely passed to external libraries. 209(Well, as long as they don't free it) Pinning a string will move the contents. 210 211$1 should be unpinned if it is used after pinning is no longer necessary. 212 213=cut 214 215op pin(inout STR) :base_mem { 216 Parrot_str_pin(interp, $1); 217} 218 219 220=item B<unpin>(inout STR) 221 222Make the memory in string $1 movable again. 223This will make the memory in $1 move. 224 225=cut 226 227op unpin(inout STR) :base_mem { 228 Parrot_str_unpin(interp, $1); 229} 230 231 232=item B<substr>(out STR, in STR, in INT) 233 234=item B<substr>(out STR, in STR, in INT, in INT) 235 236=item B<substr>(out STR, invar PMC, in INT, in INT) 237 238Set $1 to the portion of $2 starting at (zero-based) character position 239$3 and having length $4. If no length ($4) is provided, it is equivalent to 240passing in the length of $2. 241 242=item B<replace>(out STR, in STR, in INT, in INT, in STR) 243 244Replace part of $2 starting from $3 of length $4 with $5. If the length of $5 is 245different from the length specified in $4, then $2 will grow or shrink 246accordingly. If $3 is one character position larger than the length of 247$2, then $5 is appended to $2 (and the empty string is returned); 248this is essentially the same as 249 250 concat $2, $5 251 252Finally, if $3 is negative, then it is taken to count backwards from 253the end of the string (ie an offset of -1 corresponds to the last 254character). 255 256New $1 string returned. 257 258=cut 259 260inline op substr(out STR, in STR, in INT) { 261 const INTVAL len = Parrot_str_byte_length(interp, $2); 262 $1 = STRING_substr(interp, $2, $3, len); 263} 264 265inline op substr(out STR, in STR, in INT, in INT) { 266 $1 = STRING_substr(interp, $2, $3, $4); 267} 268 269inline op substr(out STR, invar PMC, in INT, in INT) { 270 $1 = VTABLE_substr(interp, $2, $3, $4); 271} 272 273inline op replace(out STR, in STR, in INT, in INT, in STR) { 274 $1 = Parrot_str_replace(interp, $2, $3, $4, $5); 275} 276 277 278=item B<index>(out INT, in STR, in STR) 279 280=item B<index>(out INT, in STR, in STR, in INT) 281 282The B<index> function searches for a substring within target string, but 283without the wildcard-like behavior of a full regular-expression pattern match. 284It returns the position of the first occurrence of substring $3 285in target string $2 at or after zero-based position $4. 286If $4 is omitted, B<index> starts searching from the beginning of the string. 287The return value is based at "0". 288If the string is null, or the substring is not found or is null, 289B<index> returns "-1". 290 291=item B<rindex>(out INT, in STR, in STR) 292 293=item B<rindex>(out INT, in STR, in STR, in INT) 294 295Search the string for the last instance of the substring from the end. If 296Provided, a match will not be found after $4. 297 298=cut 299 300inline op index(out INT, in STR, in STR) { 301 $1 = ($2 && $3) ? STRING_index(interp, $2, $3, 0) : -1; 302} 303 304inline op index(out INT, in STR, in STR, in INT) { 305 $1 = ($2 && $3) ? STRING_index(interp, $2, $3, $4) : -1; 306} 307 308inline op rindex(out INT, in STR, in STR) { 309 STRING * const str = $2; 310 $1 = Parrot_str_find_reverse_index(interp, str, $3, STRING_length(str)); 311} 312 313inline op rindex(out INT, in STR, in STR, in INT) { 314 STRING * const str = $2; 315 $1 = Parrot_str_find_reverse_index(interp, str, $3, $4); 316} 317 318=item B<sprintf>(out STR, in STR, invar PMC) 319 320=item B<sprintf>(out PMC, invar PMC, invar PMC) 321 322Sets $1 to the result of calling C<Parrot_psprintf> with the 323given format ($2) and arguments ($3, which should be an ordered 324aggregate PMC). 325 326The result is quite similar to using the system C<sprintf>, but is 327protected against buffer overflows and the like. There are some 328differences, especially concerning sizes (which are largely ignored); 329see F<misc.c> for details. 330 331=cut 332 333inline op sprintf(out STR, in STR, invar PMC) { 334 $1=Parrot_psprintf(interp, $2, $3); 335} 336 337inline op sprintf(out PMC, invar PMC, invar PMC) { 338 VTABLE_set_string_native(interp, $1, 339 Parrot_psprintf(interp, VTABLE_get_string(interp, $2), $3)); 340} 341 342 343=item B<new>(out STR) 344 345Allocate a new empty string. 346 347=cut 348 349inline op new(out STR) :base_mem { 350 $1 = Parrot_str_new_noinit(interp, 0); 351} 352 353 354=item B<stringinfo>(out INT, in STR, in INT) 355 356Extract some information about string $2 and store it in $1. 357If a null string is passed, $1 is always set to 0. 358If an invalid $3 is passed, an exception is thrown. 359Possible values for $3 are: 360 361=over 4 362 363=item 1 The location of the string buffer header. 364 365=item 2 The location of the start of the string. 366 367=item 3 The length of the string buffer (in bytes). 368 369=item 4 The flags attached to the string (if any). 370 371=item 5 The amount of the string buffer used (in bytes). 372 373=item 6 The length of the string (in characters). 374 375=back 376 377=cut 378 379inline op stringinfo(out INT, in STR, in INT) { 380 if ($2 == NULL) 381 $1 = 0; 382 else { 383 switch ($3) { 384 case STRINGINFO_HEADER: 385 $1 = PTR2UINTVAL($2); 386 break; 387 case STRINGINFO_STRSTART: 388 $1 = PTR2UINTVAL($2->strstart); 389 break; 390 case STRINGINFO_BUFLEN: 391 $1 = Buffer_buflen($2); 392 break; 393 case STRINGINFO_FLAGS: 394 $1 = PObj_get_FLAGS($2); 395 break; 396 case STRINGINFO_BUFUSED: 397 $1 = $2->bufused; 398 break; 399 case STRINGINFO_STRLEN: 400 $1 = $2->strlen; 401 break; 402 default: 403 { 404 opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL, 405 EXCEPTION_INVALID_OPERATION, 406 "stringinfo: unknown info type: %d", $3); 407 goto ADDRESS(handler); 408 } 409 } 410 } 411} 412 413 414=item B<upcase>(out STR, in STR) 415 416Uppercase $2 and put the result in $1 417 418=cut 419 420inline op upcase(out STR, in STR) { 421 $1 = Parrot_str_upcase(interp, $2); 422} 423 424=item B<downcase>(out STR, in STR) 425 426Downcase $2 and put the result in $1 427 428=cut 429 430inline op downcase(out STR, in STR) { 431 $1 = Parrot_str_downcase(interp, $2); 432} 433 434=item B<titlecase>(out STR, in STR) 435 436Titlecase $2 and put the result in $1 437 438=cut 439 440inline op titlecase(out STR, in STR) { 441 $1 = Parrot_str_titlecase(interp, $2); 442} 443 444 445=item B<join>(out STR, in STR, invar PMC) 446 447Create a new string $1 by joining array elements from array $3 448with string $2. 449 450=item B<split>(out PMC, in STR, in STR) 451 452Create a new Array PMC $1 by splitting the string $3 into pieces 453delimited by the string $2. If $2 does not appear in $3, then return $3 454as the sole element of the Array PMC. Will return empty strings for 455delimiters at the beginning and end of $3 456 457Note: the string $2 is just a string. If you want a perl-ish split 458on regular expression, use C<PGE::Util>'s split from the standard library. 459 460=cut 461 462op join(out STR, in STR, invar PMC) { 463 $1 = Parrot_str_join(interp, $2, $3); 464} 465 466op split(out PMC, in STR, in STR) { 467 $1 = Parrot_str_split(interp, $2, $3); 468} 469 470 471=item B<encoding>(out INT, in STR) 472 473Return the encoding number $1 of string $2. 474 475=item B<encodingname>(out STR, in INT) 476 477Return the name $1 of encoding number $2. 478If encoding number $2 is not found, name $1 is set to null. 479 480=item B<find_encoding>(out INT, in STR) 481 482Return the encoding number of the encoding named $2. If the encoding doesn't 483exist, throw an exception. 484 485=item B<trans_encoding>(out STR, in STR, in INT) 486 487Create a string $1 from $2 with the specified encoding. 488 489Both functions may throw an exception on information loss. 490 491=cut 492 493op encoding(out INT, in STR) { 494 $1 = Parrot_encoding_number_of_str(interp, $2); 495} 496 497op encodingname(out STR, in INT) { 498 $1 = Parrot_encoding_name(interp, $2); 499} 500 501op find_encoding(out INT, in STR) { 502 const INTVAL n = Parrot_encoding_number(interp, $2); 503 if (n < 0) { 504 opcode_t * const handler = Parrot_ex_throw_from_op_args(interp, NULL, 505 EXCEPTION_INVALID_ENCODING, 506 "encoding '%Ss' not found", $2); 507 goto ADDRESS(handler); 508 } 509 $1 = n; 510} 511 512op trans_encoding(out STR, in STR, in INT) { 513 $1 = Parrot_str_change_encoding(interp, $2, $3); 514} 515 516 517=item B<is_cclass>(out INT, in INT, in STR, in INT) 518 519Set $1 to 1 if the codepoint of $3 at position $4 is in 520the character class(es) given by $2. 521 522=cut 523 524inline op is_cclass(out INT, in INT, in STR, in INT) { 525 $1 = Parrot_str_is_cclass(interp, $2, $3, $4); 526} 527 528 529=item B<find_cclass>(out INT, in INT, in STR, in INT, in INT) 530 531Set $1 to the offset of the first codepoint matching 532the character class(es) given by $2 in string $3, starting 533at offset $4 for up to $5 codepoints. If no matching 534character is found, set $1 to (offset + count). 535 536=cut 537 538inline op find_cclass(out INT, in INT, in STR, in INT, in INT) { 539 $1 = Parrot_str_find_cclass(interp, $2, $3, $4, $5); 540} 541 542 543=item B<find_not_cclass>(out INT, in INT, in STR, in INT, in INT) 544 545Set $1 to the offset of the first codepoint not matching 546the character class(es) given by $2 in string $3, starting 547at offset $4 for up to $5 codepoints. If the substring 548consists entirely of matching characters, set $1 to (offset + count). 549 550=cut 551 552inline op find_not_cclass(out INT, in INT, in STR, in INT, in INT) { 553 $1 = Parrot_str_find_not_cclass(interp, $2, $3, $4, $5); 554} 555 556 557=item B<escape>(out STR, invar STR) 558 559Escape all non-ascii chars to backslashed escape sequences. A 560string with charset I<ascii> is created as result. 561 562=item B<compose>(out STR, in STR) 563 564Compose (normalize) a string. 565 566=cut 567 568op escape(out STR, invar STR) { 569 $1 = Parrot_str_escape(interp, $2); 570} 571 572op compose(out STR, in STR) { 573 $1 = Parrot_str_compose(interp, $2); 574} 575 576 577=item B<find_codepoint>(out INT, in STR) 578 579Set $1 to the codepoint with the name given in $2, or -1 if there is none. 580 581With ICU many more name aliases are found, but without currently only 582for control characters. 583 584=cut 585 586op find_codepoint(out INT, in STR) { 587 $1 = Parrot_str_internal_find_codepoint(interp, $2); 588} 589 590=back 591 592=head1 COPYRIGHT 593 594Copyright (C) 2001-2011, Parrot Foundation. 595 596=head1 LICENSE 597 598This program is free software. It is subject to the same license 599as the Parrot interpreter itself. 600 601=cut 602 603/* 604 * Local variables: 605 * c-file-style: "parrot" 606 * End: 607 * vim: expandtab shiftwidth=4 cinoptions='\:2=2' : 608 */ 609