1 /*  Part of SWI-Prolog
2 
3     Author:        Jan Wielemaker and Anjo Anjewierden
4     E-mail:        J.Wielemaker@vu.nl
5     WWW:           http://www.swi-prolog.org
6     Copyright (c)  2011-2020, University of Amsterdam
7                               VU University Amsterdam
8     All rights reserved.
9 
10     Redistribution and use in source and binary forms, with or without
11     modification, are permitted provided that the following conditions
12     are met:
13 
14     1. Redistributions of source code must retain the above copyright
15        notice, this list of conditions and the following disclaimer.
16 
17     2. Redistributions in binary form must reproduce the above copyright
18        notice, this list of conditions and the following disclaimer in
19        the documentation and/or other materials provided with the
20        distribution.
21 
22     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33     POSSIBILITY OF SUCH DAMAGE.
34 */
35 
36 #include "pl-incl.h"
37 #include "pl-arith.h"
38 #include "pl-ctype.h"
39 #include "pl-utf8.h"
40 #include "pl-codelist.h"
41 #include <errno.h>
42 #include <stdio.h>
43 #if HAVE_LIMITS_H
44 #include <limits.h>			/* solaris compatibility */
45 #endif
46 
47 #undef LD
48 #define LD LOCAL_LD
49 
50 
51 		 /*******************************
52 		 *	UNIFIED TEXT STUFF	*
53 		 *******************************/
54 
55 static inline size_t
bufsize_text(PL_chars_t * text,size_t len)56 bufsize_text(PL_chars_t *text, size_t len)
57 { size_t unit;
58 
59   switch(text->encoding)
60   { case ENC_ISO_LATIN_1:
61     case ENC_ASCII:
62     case ENC_UTF8:
63     case ENC_ANSI:
64       unit = sizeof(char);
65       break;
66     case ENC_WCHAR:
67       unit = sizeof(pl_wchar_t);
68       break;
69     default:
70       assert(0);
71       unit = sizeof(char);		/*NOTREACHED*/
72   }
73 
74   return len*unit;
75 }
76 
77 
78 int
PL_save_text(PL_chars_t * text,int flags)79 PL_save_text(PL_chars_t *text, int flags)
80 { if ( (flags & BUF_MALLOC) && text->storage != PL_CHARS_MALLOC )
81   { size_t bl = bufsize_text(text, text->length+1);
82     void *new = PL_malloc(bl);
83 
84     if ( new )
85     { memcpy(new, text->text.t, bl);
86       text->text.t = new;
87       text->storage = PL_CHARS_MALLOC;
88     } else
89     { return FALSE;
90     }
91   } else if ( text->storage == PL_CHARS_LOCAL )
92   { Buffer b = findBuffer(BUF_STACK);
93     size_t bl = bufsize_text(text, text->length+1);
94 
95     addMultipleBuffer(b, text->text.t, bl, char);
96     text->text.t = baseBuffer(b, char);
97 
98     text->storage = PL_CHARS_RING;
99   } else if ( text->storage == PL_CHARS_MALLOC )
100   { Buffer b = findBuffer(BUF_STACK);
101     size_t bl = bufsize_text(text, text->length+1);
102 
103     addMultipleBuffer(b, text->text.t, bl, char);
104     PL_free_text(text);
105     text->text.t = baseBuffer(b, char);
106 
107     text->storage = PL_CHARS_RING;
108   }
109 
110   return TRUE;
111 }
112 
113 
114 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
115 PL_from_stack_text() moves a string from  the   stack,  so  it won't get
116 corrupted if GC/shift comes along.
117 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
118 
119 static int
PL_from_stack_text(PL_chars_t * text,int flags)120 PL_from_stack_text(PL_chars_t *text, int flags)
121 { if ( !(flags&BUF_ALLOW_STACK) )
122   { if ( text->storage == PL_CHARS_STACK )
123     { size_t bl = bufsize_text(text, text->length+1);
124 
125       if ( bl < sizeof(text->buf) )
126       { memcpy(text->buf, text->text.t, bl);
127 	text->text.t = text->buf;
128 	text->storage = PL_CHARS_LOCAL;
129       } else
130       { Buffer b = findBuffer(BUF_STACK);
131 
132 	addMultipleBuffer(b, text->text.t, bl, char);
133 	text->text.t = baseBuffer(b, char);
134 	text->storage = PL_CHARS_RING;
135       }
136     }
137   }
138 
139   return TRUE;
140 }
141 
142 
143 #define INT64_DIGITS 20
144 
145 static char *
ui64toa(uint64_t val,char * out)146 ui64toa(uint64_t val, char *out)
147 { char tmpBuf[INT64_DIGITS + 1];
148   char *ptrOrg = tmpBuf + INT64_DIGITS;
149   char *ptr = ptrOrg;
150   size_t nbDigs;
151 
152   do
153   { int rem = val % 10;
154 
155     *--ptr = rem + '0';
156     val /= 10;
157   } while ( val );
158 
159   nbDigs = ptrOrg - ptr;
160   memcpy(out, ptr, nbDigs);
161   out += nbDigs;
162   *out = '\0';
163 
164   return out;				/* points to the END */
165 };
166 
167 
168 static char *
i64toa(int64_t val,char * out)169 i64toa(int64_t val, char *out)
170 { if ( val < 0 )
171   { *out++ = '-';
172     val = -(uint64_t)val;
173   }
174 
175   return ui64toa((uint64_t)val, out);
176 }
177 
178 
179 int
PL_get_text__LD(term_t l,PL_chars_t * text,int flags ARG_LD)180 PL_get_text__LD(term_t l, PL_chars_t *text, int flags ARG_LD)
181 { word w = valHandle(l);
182 
183   if ( (flags & CVT_ATOM) && isAtom(w) )
184   { if ( isNil(w) && (flags&CVT_LIST) )
185       goto case_list;
186     if ( !get_atom_text(w, text) )
187       goto maybe_write;
188   } else if ( (flags & CVT_STRING) && isString(w) )
189   { if ( !get_string_text(w, text PASS_LD) )
190       goto maybe_write;
191     if ( !PL_from_stack_text(text, flags) )
192       return FALSE;			/* no memory */
193   } else if ( ((flags&CVT_RATIONAL) && isRational(w)) ||
194 	      ((flags&CVT_INTEGER)  && isInteger(w)) )
195   { number n;
196 
197     PL_get_number(l, &n);
198     switch(n.type)
199     { case V_INTEGER:
200       { char *ep = i64toa(n.value.i, text->buf);
201 
202         text->text.t    = text->buf;
203 	text->length    = ep-text->text.t;
204 	text->storage   = PL_CHARS_LOCAL;
205 	break;
206       }
207 #ifdef O_GMP
208       case V_MPZ:
209       { size_t sz = mpz_sizeinbase(n.value.mpz, 10) + 2;
210 	Buffer b  = findBuffer(BUF_STACK);
211 
212 	if ( !growBuffer(b, sz) )
213 	  outOfCore();
214 	mpz_get_str(b->base, 10, n.value.mpz);
215 	b->top = b->base + strlen(b->base);
216 	text->text.t  = baseBuffer(b, char);
217 	text->length  = entriesBuffer(b, char);
218 	text->storage = PL_CHARS_RING;
219 
220 	break;
221       }
222       case V_MPQ:
223       { size_t sz = ( mpz_sizeinbase(mpq_numref(n.value.mpq), 10) +
224 		      mpz_sizeinbase(mpq_denref(n.value.mpq), 10) + 4 );
225 	Buffer b  = findBuffer(BUF_STACK);
226 
227 	if ( !growBuffer(b, sz) )
228 	  outOfCore();
229 	mpz_get_str(b->base, 10, mpq_numref(n.value.mpq));
230 	b->top = b->base + strlen(b->base);
231 	*b->top++ = 'r';			/* '/' under some condition? */
232 	mpz_get_str(b->top, 10, mpq_denref(n.value.mpq));
233 	b->top += strlen(b->top);
234 	text->text.t  = baseBuffer(b, char);
235 	text->length  = entriesBuffer(b, char);
236 	text->storage = PL_CHARS_RING;
237 
238 	break;
239       }
240 #endif
241       default:
242 	assert(0);
243     }
244     text->encoding  = ENC_ISO_LATIN_1;
245     text->canonical = TRUE;
246   } else if ( (flags & CVT_FLOAT) && isFloat(w) )
247   { format_float(valFloat(w), text->buf);
248     text->text.t    = text->buf;
249     text->length    = strlen(text->text.t);
250     text->encoding  = ENC_ISO_LATIN_1;
251     text->storage   = PL_CHARS_LOCAL;
252     text->canonical = TRUE;
253   } else if ( (flags & CVT_LIST) )
254   { Buffer b;
255     CVT_result result;
256 
257   case_list:
258     if ( (b = codes_or_chars_to_buffer(l, BUF_STACK, FALSE, &result)) )
259     { text->length = entriesBuffer(b, char);
260       addBuffer(b, EOS, char);
261       text->text.t = baseBuffer(b, char);
262       text->encoding = ENC_ISO_LATIN_1;
263     } else if ( result.status == CVT_wide &&
264 		(b = codes_or_chars_to_buffer(l, BUF_STACK, TRUE, &result)) )
265     { text->length = entriesBuffer(b, pl_wchar_t);
266       addBuffer(b, EOS, pl_wchar_t);
267       text->text.w = baseBuffer(b, pl_wchar_t);
268       text->encoding = ENC_WCHAR;
269     } else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
270     { goto case_write;
271     } else
272     { if ( (flags & CVT_VARNOFAIL) && result.status == CVT_partial )
273 	return 2;
274 
275       if ( (flags & CVT_EXCEPTION) )
276       { switch(result.status)
277 	{ case CVT_partial:
278 	    return PL_error(NULL, 0, NULL, ERR_INSTANTIATION);
279 	  case CVT_nolist:
280 	    goto error;
281 	  case CVT_nocode:
282 	  case CVT_nochar:
283 	  { term_t culprit = PL_new_term_ref();
284 	    atom_t type;
285 
286 	    *valTermRef(culprit) = result.culprit;
287 	    if ( result.status == CVT_nocode )
288 	      type = ATOM_character_code;
289 	    else
290 	      type = ATOM_character;
291 
292 	    return PL_error(NULL, 0, NULL, ERR_TYPE, type, culprit);
293 	  }
294 	  case CVT_representation:
295 	    return PL_representation_error("character_code");
296 	  default:
297 	    break;
298 	}
299       }
300       goto error;
301     }
302 
303     text->storage   = PL_CHARS_RING;
304     text->canonical = TRUE;
305   } else if ( (flags & CVT_VARIABLE) && isVar(w) )
306   { text->text.t   = varName(l, text->buf);
307     text->length   = strlen(text->text.t);
308     text->encoding = ENC_ISO_LATIN_1;
309     text->storage  = PL_CHARS_LOCAL;
310     text->canonical = TRUE;
311   } else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
312   { IOENC encodings[3];
313     IOENC *enc;
314     char *r;
315     int wflags;
316 
317   case_write:
318     if ( (flags&REP_UTF8) )
319     { encodings[0] = ENC_UTF8;
320       encodings[1] = ENC_UNKNOWN;
321     } else
322     { encodings[0] = ENC_ISO_LATIN_1;
323       encodings[1] = ENC_WCHAR;
324       encodings[2] = ENC_UNKNOWN;
325     }
326 
327     if ( (flags&CVT_WRITEQ) == CVT_WRITEQ )
328       wflags = PL_WRT_QUOTED|PL_WRT_NUMBERVARS;
329     else if ( (flags&CVT_WRITE_CANONICAL) )
330       wflags = PL_WRT_QUOTED|PL_WRT_IGNOREOPS|PL_WRT_NUMBERVARS;
331     else
332       wflags = PL_WRT_NUMBERVARS;
333 
334     for(enc = encodings; *enc != ENC_UNKNOWN; enc++)
335     { size_t size;
336       IOSTREAM *fd;
337 
338       r = text->buf;
339       size = sizeof(text->buf);
340       fd = Sopenmem(&r, &size, "w");
341       fd->encoding = *enc;
342       if ( PL_write_term(fd, l, 1200, wflags) &&
343 	   Sputcode(EOS, fd) >= 0 &&
344 	   Sflush(fd) >= 0 )
345       { text->encoding = *enc;
346 	text->storage = (r == text->buf ? PL_CHARS_LOCAL : PL_CHARS_MALLOC);
347 	text->canonical = TRUE;
348 
349 	if ( *enc == ENC_WCHAR )
350 	{ text->length = (size/sizeof(pl_wchar_t))-1;
351 	  text->text.w = (pl_wchar_t *)r;
352 	} else
353 	{ text->length = size-1;
354 	  text->text.t = r;
355 	}
356 
357 	Sclose(fd);
358 
359 	goto out;
360       } else
361       { Sclose(fd);
362 	if ( *enc == ENC_ISO_LATIN_1 && enc[1] != ENC_UNKNOWN )
363 	  PL_clear_exception();
364 
365 	if ( r != text->buf )
366 	  Sfree(r);
367       }
368     }
369 
370     goto error;
371   } else
372   { goto error;
373   }
374 
375 out:
376   return TRUE;
377 
378 maybe_write:
379   if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
380     goto case_write;
381 
382 error:
383   if ( canBind(w) && (flags & CVT_VARNOFAIL) )
384     return 2;
385 
386   if ( (flags & CVT_EXCEPTION) )
387   { atom_t expected;
388 
389     if ( (flags & CVT_LIST) && !(flags&(CVT_ATOM|CVT_NUMBER)) )
390       expected = ATOM_list;		/* List and/or string object */
391     else if ( flags & CVT_LIST )
392       expected = ATOM_text;
393     else if ( (flags & CVT_ATOM) && w == ATOM_nil )
394       expected = ATOM_atom;		/* [] \== '[]' */
395     else if ( flags & CVT_NUMBER )
396       expected = ATOM_atomic;
397     else
398       expected = ATOM_atom;
399 
400     return PL_error(NULL, 0, NULL, ERR_TYPE, expected, l);
401   }
402 
403   fail;
404 }
405 
406 
407 atom_t
textToAtom(PL_chars_t * text)408 textToAtom(PL_chars_t *text)
409 { GET_LD
410   atom_t a;
411 
412   PL_STRINGS_MARK();
413   if ( PL_canonicalise_text(text) )
414   { if ( text->encoding == ENC_ISO_LATIN_1 )
415       a = lookupAtom(text->text.t, text->length);
416     else
417       a = lookupUCSAtom(text->text.w, text->length);
418   } else
419   { a = 0;
420   }
421   PL_STRINGS_RELEASE();
422 
423   return a;
424 }
425 
426 
427 word
textToString(PL_chars_t * text)428 textToString(PL_chars_t *text)
429 { GET_LD
430   atom_t a;
431 
432   PL_STRINGS_MARK();
433   if ( PL_canonicalise_text(text) )
434   { if ( text->encoding == ENC_ISO_LATIN_1 )
435       a = globalString(text->length, text->text.t);
436     else
437       a = globalWString(text->length, text->text.w);
438   } else
439   { a = 0;
440   }
441   PL_STRINGS_RELEASE();
442 
443   return a;
444 }
445 
446 
447 static size_t
globalSpaceRequirement(PL_chars_t * text)448 globalSpaceRequirement(PL_chars_t *text)
449 { size_t len;
450 
451   if ( text->encoding == ENC_ISO_LATIN_1 )
452   { len = text->length+1;
453   } else
454   { len = (text->length+1)*sizeof(pl_wchar_t);
455   }
456 
457   return 2 + (len+sizeof(word))/sizeof(word);
458 }
459 
460 
461 
462 static int
unify_text(term_t term,term_t tail,PL_chars_t * text,int type ARG_LD)463 unify_text(term_t term, term_t tail, PL_chars_t *text, int type ARG_LD)
464 { switch(type)
465   { case PL_ATOM:
466     { atom_t a = textToAtom(text);
467 
468       if ( a )
469       { int rval = _PL_unify_atomic(term, a);
470 
471 	PL_unregister_atom(a);
472 	return rval;
473       }
474       return FALSE;
475     }
476     case PL_STRING:
477     { word w;
478 
479       if ( PL_canonicalise_text(text) )
480       { if ( hasGlobalSpace(globalSpaceRequirement(text)) ||
481 	     PL_from_stack_text(text, 0) )
482 	{ if ( (w = textToString(text)) )
483 	    return _PL_unify_atomic(term, w);
484 	}
485       }
486 
487       return FALSE;
488     }
489     case PL_CODE_LIST:
490     case PL_CHAR_LIST:
491     { if ( !hasGlobalSpace(text->length*3+1) &&
492 	   !PL_from_stack_text(text, 0) )
493 	return FALSE;
494 
495       if ( text->length == 0 )
496       { if ( tail )
497 	{ PL_put_term(tail, term);
498 	  return TRUE;
499 	} else
500 	{ return PL_unify_nil(term);
501 	}
502       } else
503       { term_t l = PL_new_term_ref();
504 	Word p0, p;
505 
506 	switch(text->encoding)
507 	{ case ENC_ISO_LATIN_1:
508 	  { const unsigned char *s = (const unsigned char *)text->text.t;
509 	    const unsigned char *e = &s[text->length];
510 
511             if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
512 	      return FALSE;
513 
514             if ( type == PL_CODE_LIST ) {
515               for( ; s < e; s++)
516                 p = EXTEND_SEQ_CODES(p, *s);
517             } else {
518               for( ; s < e; s++)
519                 p = EXTEND_SEQ_CHARS(p, *s);
520             }
521 	    break;
522 	  }
523 	  case ENC_WCHAR:
524 	  { const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
525 	    const pl_wchar_t *e = &s[text->length];
526 
527             if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
528 	      return FALSE;
529 
530             if ( type == PL_CODE_LIST ) {
531               for( ; s < e; s++)
532                 p = EXTEND_SEQ_CODES(p, *s);
533             } else {
534               for( ; s < e; s++)
535                 p = EXTEND_SEQ_CHARS(p, *s);
536             }
537 	    break;
538 	  }
539 	  case ENC_UTF8:
540 	  { const char *s = text->text.t;
541 	    const char *e = &s[text->length];
542 	    size_t len = utf8_strlen(s, text->length);
543 
544             if ( !(p0 = p = INIT_SEQ_STRING(len)) )
545 	      return FALSE;
546 
547             if ( type == PL_CODE_LIST ) {
548               while (s < e) {
549                 int chr;
550 
551                 s = utf8_get_char(s, &chr);
552                 p = EXTEND_SEQ_CODES(p, chr);
553               }
554             } else {
555               while (s < e) {
556                 int chr;
557 
558                 s = utf8_get_char(s, &chr);
559                 p = EXTEND_SEQ_CHARS(p, chr);
560               }
561             }
562 	    break;
563 	  }
564 	  case ENC_ANSI:
565 	  { const char *s = text->text.t;
566 	    size_t rc, n = text->length;
567 	    size_t len = 0;
568 	    mbstate_t mbs;
569 	    wchar_t wc;
570 
571 	    memset(&mbs, 0, sizeof(mbs));
572 	    while( n > 0 )
573 	    { if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0 )
574 		return PL_error(NULL, 0, "cannot represent text in current locale",
575 				ERR_REPRESENTATION, ATOM_encoding);
576 
577 	      len++;
578 	      n -= rc;
579 	      s += rc;
580 	    }
581 
582             if ( !(p0 = p = INIT_SEQ_STRING(len)) )
583 	      return FALSE;
584 
585 	    n = text->length;
586 	    s = text->text.t;
587 	    memset(&mbs, 0, sizeof(mbs));
588 	    while(n > 0)
589 	    { rc = mbrtowc(&wc, s, n, &mbs);
590 
591 	      if ( type == PL_CODE_LIST )
592 		p = EXTEND_SEQ_CODES(p, wc);
593 	      else
594 		p = EXTEND_SEQ_CHARS(p, wc);
595 
596 	      s += rc;
597 	      n -= rc;
598 	    }
599 	    break;
600 	  }
601 	  default:
602 	  { assert(0);
603 
604 	    return FALSE;
605 	  }
606 	}
607 
608 	return CLOSE_SEQ_STRING(p, p0, tail, term, l );
609       }
610     }
611     default:
612     { assert(0);
613 
614       return FALSE;
615     }
616   }
617 }
618 
619 
620 int
PL_unify_text(term_t term,term_t tail,PL_chars_t * text,int type)621 PL_unify_text(term_t term, term_t tail, PL_chars_t *text, int type)
622 { GET_LD
623   int rc;
624 
625   PL_STRINGS_MARK();
626   rc = unify_text(term, tail, text, type PASS_LD);
627   PL_STRINGS_RELEASE();
628   return rc;
629 }
630 
631 
632 int
PL_unify_text_range(term_t term,PL_chars_t * text,size_t offset,size_t len,int type)633 PL_unify_text_range(term_t term, PL_chars_t *text,
634 		    size_t offset, size_t len, int type)
635 { if ( offset == 0 && len == text->length )
636   { return PL_unify_text(term, 0, text, type);
637   } else
638   { PL_chars_t sub;
639     int rc;
640 
641     if ( offset > text->length || offset + len > text->length )
642       return FALSE;
643 
644     if ( len == 1 && type == PL_ATOM )
645     { GET_LD
646       int c;
647 
648       if ( text->encoding == ENC_ISO_LATIN_1 )
649 	c = text->text.t[offset]&0xff;
650       else
651 	c = text->text.w[offset];
652 
653       return PL_unify_atom(term, codeToAtom(c));
654     }
655 
656     sub.length = len;
657     sub.storage = text->storage == PL_CHARS_STACK ? PL_CHARS_STACK : PL_CHARS_HEAP;
658     if ( text->encoding == ENC_ISO_LATIN_1 )
659     { sub.text.t   = text->text.t+offset;
660       sub.encoding = ENC_ISO_LATIN_1;
661       sub.canonical = TRUE;
662     } else
663     { sub.text.w   = text->text.w+offset;
664       sub.encoding = ENC_WCHAR;
665       sub.canonical = FALSE;
666     }
667 
668     rc = PL_unify_text(term, 0, &sub, type);
669 
670     PL_free_text(&sub);
671 
672     return rc;
673   }
674 }
675 
676 
677 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
678 int PL_promote_text(PL_chars_t *text)
679 
680 Promote a text to USC if it is currently 8-bit text.
681 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
682 
683 int
PL_promote_text(PL_chars_t * text)684 PL_promote_text(PL_chars_t *text)
685 { if ( text->encoding != ENC_WCHAR )
686   { if ( text->storage == PL_CHARS_MALLOC )
687     { pl_wchar_t *new = PL_malloc(sizeof(pl_wchar_t)*(text->length+1));
688       pl_wchar_t *t = new;
689       const unsigned char *s = (const unsigned char *)text->text.t;
690       const unsigned char *e = &s[text->length];
691 
692       while(s<e)
693       { *t++ = *s++;
694       }
695       *t = EOS;
696 
697       PL_free(text->text.t);
698       text->text.w = new;
699 
700       text->encoding = ENC_WCHAR;
701     } else if ( text->storage == PL_CHARS_LOCAL &&
702 	        (text->length+1)*sizeof(pl_wchar_t) < sizeof(text->buf) )
703     { unsigned char buf[sizeof(text->buf)];
704       unsigned char *f = buf;
705       unsigned char *e = &buf[text->length];
706       pl_wchar_t *t = (pl_wchar_t*)text->buf;
707 
708       memcpy(buf, text->buf, text->length*sizeof(char));
709       while(f<e)
710       { *t++ = *f++;
711       }
712       *t = EOS;
713       text->encoding = ENC_WCHAR;
714     } else
715     { Buffer b = findBuffer(BUF_STACK);
716       const unsigned char *s = (const unsigned char *)text->text.t;
717       const unsigned char *e = &s[text->length];
718 
719       for( ; s<e; s++)
720 	addBuffer(b, *s, pl_wchar_t);
721       addBuffer(b, EOS, pl_wchar_t);
722 
723       text->text.w   = baseBuffer(b, pl_wchar_t);
724       text->encoding = ENC_WCHAR;
725       text->storage  = PL_CHARS_RING;
726     }
727   }
728 
729   succeed;
730 }
731 
732 
733 static int
PL_demote_text(PL_chars_t * text,int flags)734 PL_demote_text(PL_chars_t *text, int flags)
735 { if ( text->encoding != ENC_ISO_LATIN_1 )
736   { if ( text->storage == PL_CHARS_MALLOC )
737     { char *new = PL_malloc(sizeof(char)*(text->length+1));
738       char *t = new;
739       const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
740       const pl_wchar_t *e = &s[text->length];
741 
742       while(s<e)
743       { if ( *s > 0xff )
744 	{ PL_free(new);
745 	reperr:
746 	  if ( (flags&CVT_EXCEPTION) )
747 	    PL_error(NULL, 0, "cannot represent text as ISO latin 1",
748 		     ERR_REPRESENTATION, ATOM_encoding);
749 	  return FALSE;
750 	}
751 	*t++ = *s++ & 0xff;
752       }
753       *t = EOS;
754 
755       PL_free(text->text.t);
756       text->text.t = new;
757 
758       text->encoding = ENC_ISO_LATIN_1;
759     } else if ( text->storage == PL_CHARS_LOCAL )
760     { pl_wchar_t buf[sizeof(text->buf)/sizeof(pl_wchar_t)];
761       pl_wchar_t *f = buf;
762       pl_wchar_t *e = &buf[text->length];
763       char *t = text->buf;
764 
765       memcpy(buf, text->buf, text->length*sizeof(pl_wchar_t));
766       while(f<e)
767       { if ( *f > 0xff )
768 	  goto reperr;
769 	*t++ = *f++ & 0xff;
770       }
771       *t = EOS;
772       text->encoding = ENC_ISO_LATIN_1;
773     } else
774     { Buffer b = findBuffer(BUF_STACK);
775       const pl_wchar_t *s = (const pl_wchar_t*)text->text.w;
776       const pl_wchar_t *e = &s[text->length];
777 
778       for( ; s<e; s++)
779       { if ( *s > 0xff )
780 	{ unfindBuffer(b, BUF_STACK);
781 	  goto reperr;
782 	}
783 	addBuffer(b, *s&0xff, char);
784       }
785       addBuffer(b, EOS, char);
786 
787       text->text.t   = baseBuffer(b, char);
788       text->storage  = PL_CHARS_RING;
789       text->encoding = ENC_ISO_LATIN_1;
790     }
791   }
792 
793   succeed;
794 }
795 
796 
797 static int
can_demote(PL_chars_t * text)798 can_demote(PL_chars_t *text)
799 { if ( text->encoding != ENC_ISO_LATIN_1 )
800   { const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
801     const pl_wchar_t *e = &w[text->length];
802 
803     for(; w<e; w++)
804     { if ( *w > 0xff )
805 	return FALSE;
806     }
807   }
808 
809   return TRUE;
810 }
811 
812 
813 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
814 Convert text to 8-bit according to flags.   May hold REP_UTF8 to convert
815 to UTF-8, REP_MB to convert to locale 8-bit representation or nothing to
816 convert to ISO Latin-1. This predicate can   fail  of the text cannot be
817 represented.
818 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
819 
820 static int
wctobuffer(wchar_t c,mbstate_t * mbs,Buffer buf)821 wctobuffer(wchar_t c, mbstate_t *mbs, Buffer buf)
822 { char b[PL_MB_LEN_MAX];
823   size_t n;
824 
825   if ( (n=wcrtomb(b, c, mbs)) != (size_t)-1 )
826   { size_t i;
827 
828     for(i=0; i<n; i++)
829       addBuffer(buf, b[i], char);
830 
831     return TRUE;
832   }
833 
834   return FALSE;				/* cannot represent */
835 }
836 
837 
838 static void
utf8tobuffer(wchar_t c,Buffer buf)839 utf8tobuffer(wchar_t c, Buffer buf)
840 { if ( c <= 0x7f )
841   { addBuffer(buf, (char)c, char);
842   } else
843   { char b[6];
844     char *e = b;
845     const char *s;
846 
847     e = utf8_put_char(e, c);
848     for(s=b; s<e; s++)
849       addBuffer(buf, *s, char);
850   }
851 }
852 
853 
854 int
PL_mb_text(PL_chars_t * text,int flags)855 PL_mb_text(PL_chars_t *text, int flags)
856 { int norep = -1;
857   IOENC target = ((flags&REP_UTF8) ? ENC_UTF8 :
858 		  (flags&REP_MB)   ? ENC_ANSI : ENC_ISO_LATIN_1);
859 
860   if ( text->encoding != target )
861   { Buffer b = findBuffer(BUF_STACK);
862 
863     switch(text->encoding)
864     { case ENC_ISO_LATIN_1:
865       { const unsigned char *s = (const unsigned char*)text->text.t;
866 	const unsigned char *e = &s[text->length];
867 
868 	if ( target == ENC_UTF8 )
869 	{ for( ; s<e; s++)
870 	  { utf8tobuffer(*s, b);
871 	  }
872 	  addBuffer(b, 0, char);
873 	} else /* if ( target == ENC_MB ) */
874 	{ mbstate_t mbs;
875 
876 	  memset(&mbs, 0, sizeof(mbs));
877 	  for( ; s<e; s++)
878 	  { if ( !wctobuffer(*s, &mbs, b) )
879 	    { unfindBuffer(b, BUF_STACK);
880 	      norep = *s;
881 	      goto rep_error;
882 	    }
883 	  }
884 	  wctobuffer(0, &mbs, b);
885 	}
886 
887         break;
888       }
889       case ENC_WCHAR:
890       { if ( target == ENC_ISO_LATIN_1 )
891 	{ return PL_demote_text(text, flags);
892 	} else
893 	{ const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
894 	  const pl_wchar_t *e = &w[text->length];
895 
896 	  if ( target == ENC_UTF8 )
897 	  { for( ; w<e; w++)
898 	    { utf8tobuffer(*w, b);
899 	    }
900 	    addBuffer(b, 0, char);
901 	  } else /* if ( target == ENC_MB ) */
902 	  { mbstate_t mbs;
903 
904 	    memset(&mbs, 0, sizeof(mbs));
905 	    for( ; w<e; w++)
906 	    { if ( !wctobuffer(*w, &mbs, b) )
907 	      { unfindBuffer(b, BUF_STACK);
908 		norep = *w;
909 		goto rep_error;
910 	      }
911 	    }
912 	    wctobuffer(0, &mbs, b);
913 	  }
914 	}
915 	break;
916       }
917       default:
918       { assert(0);
919 	fail;
920       }
921     }
922 
923     PL_free_text(text);
924 
925     text->length   = sizeOfBuffer(b)-1;
926     text->text.t   = baseBuffer(b, char);
927     text->encoding = target;
928     text->storage  = PL_CHARS_RING;
929   }
930 
931   succeed;
932 
933 rep_error:
934   if ( (flags & CVT_EXCEPTION) )
935   { char msg[128];
936 
937     sprintf(msg,
938 	    "Cannot represent char U%04x using %s encoding",
939 	    norep,
940 	    target == ENC_ISO_LATIN_1 ? "ISO Latin-1" : "current locale");
941 
942     return PL_error(NULL, 0, msg, ERR_REPRESENTATION, ATOM_encoding);
943   }
944 
945   fail;
946 }
947 
948 
949 static void
flip_shorts(unsigned char * s,size_t len)950 flip_shorts(unsigned char *s, size_t len)
951 { unsigned char *e = s+len;
952 
953   for(; s<e; s+=2)
954   { unsigned char t = s[0];
955     s[0] = s[1];
956     s[1] = t;
957   }
958 }
959 
960 
961 static int
native_byte_order(IOENC enc)962 native_byte_order(IOENC enc)
963 {
964 #ifdef WORDS_BIGENDIAN
965   return enc == ENC_UNICODE_BE;
966 #else
967   return enc == ENC_UNICODE_LE;
968 #endif
969 }
970 
971 
972 int
PL_canonicalise_text(PL_chars_t * text)973 PL_canonicalise_text(PL_chars_t *text)
974 { if ( !text->canonical )
975   { switch(text->encoding )
976     { case ENC_OCTET:
977 	text->encoding = ENC_ISO_LATIN_1;
978       case ENC_ISO_LATIN_1:
979         text->canonical = TRUE;
980 	break;				/* nothing to do */
981       case ENC_WCHAR:
982       { const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
983 	const pl_wchar_t *e = &w[text->length];
984 
985 	for(; w<e; w++)
986 	{ if ( *w > 0xff )
987 	    return TRUE;
988 	}
989 
990 	return PL_demote_text(text, 0);
991       }
992       case ENC_UNICODE_LE:		/* assume text->length is in bytes */
993       case ENC_UNICODE_BE:
994       {
995 #if SIZEOF_WCHAR_T == 2
996         assert(text->length%2 == 0);
997 	if ( !native_byte_order(text->encoding) )
998 	{ if ( text->storage == PL_CHARS_HEAP )
999 	    PL_save_text(text, BUF_MALLOC);
1000 	  flip_shorts((unsigned char*)text->text.t, text->length);
1001 	}
1002 	text->encoding = ENC_WCHAR;
1003 	return TRUE;
1004 #else /*SIZEOF_WCHAR_T!=2*/
1005 	size_t len = text->length/sizeof(short);
1006 	const unsigned short *w = (const unsigned short *)text->text.t;
1007 	const unsigned short *e = &w[len];
1008 	int wide = FALSE;
1009 
1010 	assert(text->length%2 == 0);
1011 	if ( !native_byte_order(text->encoding) )
1012 	{ if ( text->storage == PL_CHARS_HEAP )
1013 	    PL_save_text(text, BUF_MALLOC);
1014 	  flip_shorts((unsigned char*)text->text.t, text->length);
1015 	}
1016 
1017 	for(; w<e; w++)
1018 	{ if ( *w > 0xff )
1019 	  { wide = TRUE;
1020 	    break;
1021 	  }
1022 	}
1023 	w = (const unsigned short*)text->text.t;
1024 
1025 	if ( wide )
1026 	{ pl_wchar_t *t, *to = PL_malloc(sizeof(pl_wchar_t)*(len+1));
1027 
1028 	  for(t=to; w<e; )
1029 	  { *t++ = *w++;
1030 	  }
1031 	  *t = EOS;
1032 
1033 	  text->length = len;
1034 	  text->encoding = ENC_WCHAR;
1035 	  if ( text->storage == PL_CHARS_MALLOC )
1036 	    PL_free(text->text.t);
1037 	  else
1038 	    text->storage  = PL_CHARS_MALLOC;
1039 
1040 	  text->text.w = to;
1041 	} else
1042 	{ unsigned char *t, *to = PL_malloc(len+1);
1043 
1044 	  for(t=to; w<e; )
1045 	    *t++ = (unsigned char)*w++;
1046 	  *t = EOS;
1047 
1048 	  text->length = len;
1049 	  text->encoding = ENC_ISO_LATIN_1;
1050 	  if ( text->storage == PL_CHARS_MALLOC )
1051 	    PL_free(text->text.t);
1052 	  else
1053 	    text->storage = PL_CHARS_MALLOC;
1054 
1055 	  text->text.t = (char*)to;
1056 	}
1057 
1058 	succeed;
1059 #endif /*SIZEOF_WCHAR_T==2*/
1060       }
1061       case ENC_UTF8:
1062       { const char *s = text->text.t;
1063 	const char *e = &s[text->length];
1064 
1065 	while(s<e && !(*s & 0x80))
1066 	  s++;
1067 	if ( s == e )
1068 	{ text->encoding  = ENC_ISO_LATIN_1;
1069 	  text->canonical = TRUE;
1070 	} else
1071 	{ int chr;
1072 	  int wide = FALSE;
1073 	  size_t len = s - text->text.t;
1074 
1075 	  while(s<e)
1076 	  { s = utf8_get_char(s, &chr);
1077 	    if ( chr > 0xff )		/* requires wide characters */
1078 	      wide = TRUE;
1079 	    len++;
1080 	  }
1081 
1082 	  s = (const char *)text->text.t;
1083 	  text->length = len;
1084 
1085 	  if ( wide )
1086 	  { pl_wchar_t *t, *to = PL_malloc(sizeof(pl_wchar_t)*(len+1));
1087 
1088 	    for(t=to; s<e; )
1089 	    { s = utf8_get_char(s, &chr);
1090 	      *t++ = chr;
1091 	    }
1092 	    *t = EOS;
1093 
1094 	    text->encoding = ENC_WCHAR;
1095 	    if ( text->storage == PL_CHARS_MALLOC )
1096 	      PL_free(text->text.t);
1097 	    text->text.w  = to;
1098 	    text->storage = PL_CHARS_MALLOC;
1099 	  } else
1100 	  { char *t, *to = PL_malloc(len+1);
1101 
1102 	    for(t=to; s<e;)
1103 	    { s = utf8_get_char(s, &chr);
1104 	      *t++ = chr;
1105 	    }
1106 	    *t = EOS;
1107 
1108 	    text->encoding = ENC_ISO_LATIN_1;
1109 	    if ( text->storage == PL_CHARS_MALLOC )
1110 	      PL_free(text->text.t);
1111 	    text->text.t  = to;
1112 	    text->storage = PL_CHARS_MALLOC;
1113 	  }
1114 
1115 	  text->canonical = TRUE;
1116 	}
1117 
1118 	succeed;
1119       }
1120       case ENC_ANSI:
1121       { mbstate_t mbs;
1122 	size_t len = 0;
1123 	int iso = TRUE;
1124 	char *s = text->text.t;
1125 	size_t rc, n = text->length;
1126 	wchar_t wc;
1127 
1128 	memset(&mbs, 0, sizeof(mbs));
1129 	while( n > 0 )
1130 	{ if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0)
1131 	    return FALSE;		/* encoding error */
1132 
1133 	  if ( wc > 0xff )
1134 	    iso = FALSE;
1135 	  len++;
1136 	  n -= rc;
1137 	  s += rc;
1138 	}
1139 
1140 	if ( n == 0 )
1141 	{ const char *from = text->text.t;
1142 	  void *do_free;
1143 
1144 	  n = text->length;
1145 	  memset(&mbs, 0, sizeof(mbs));
1146 
1147 	  if ( text->storage == PL_CHARS_MALLOC )
1148 	    do_free = text->text.t;
1149 	  else
1150 	    do_free = NULL;
1151 
1152 	  if ( iso )
1153 	  { char *to;
1154 
1155 	    text->encoding = ENC_ISO_LATIN_1;
1156 	    if ( len+1 < sizeof(text->buf) )
1157 	    { text->text.t = text->buf;
1158 	      text->storage = PL_CHARS_LOCAL;
1159 	    } else
1160 	    { text->text.t = PL_malloc(len+1);
1161 	      text->storage = PL_CHARS_MALLOC;
1162 	    }
1163 
1164 	    to = text->text.t;
1165 	    while( n > 0 )
1166 	    { rc = mbrtowc(&wc, from, n, &mbs);
1167 
1168 	      *to++ = (char)wc;
1169 	      n -= rc;
1170 	      from += rc;
1171 	    }
1172 	    *to = EOS;
1173 	  } else
1174 	  { wchar_t *to;
1175 	    char b2[sizeof(text->buf)];
1176 
1177 	    text->encoding = ENC_WCHAR;
1178 	    if ( len+1 < sizeof(text->buf)/sizeof(wchar_t) )
1179 	    { if ( text->text.t == text->buf )
1180 	      { memcpy(b2, text->buf, sizeof(text->buf));
1181 		from = b2;
1182 	      }
1183 	      text->text.w = (wchar_t*)text->buf;
1184 	    } else
1185 	    { text->text.w = PL_malloc((len+1)*sizeof(wchar_t));
1186 	      text->storage = PL_CHARS_MALLOC;
1187 	    }
1188 
1189 	    to = text->text.w;
1190 	    while( n > 0 )
1191 	    { rc = mbrtowc(&wc, from, n, &mbs);
1192 
1193 	      *to++ = wc;
1194 	      n -= rc;
1195 	      from += rc;
1196 	    }
1197 	    *to = EOS;
1198 	  }
1199 
1200 	  text->length = len;
1201 	  text->canonical = TRUE;
1202 	  if ( do_free )
1203 	    PL_free(do_free);
1204 
1205 	  succeed;
1206 	}
1207 
1208 	fail;
1209       }
1210       default:
1211 	assert(0);
1212     }
1213   }
1214 
1215   succeed;
1216 }
1217 
1218 
1219 void
PL_free_text(PL_chars_t * text)1220 PL_free_text(PL_chars_t *text)
1221 { if ( text->storage == PL_CHARS_MALLOC && text->text.t )
1222     PL_free(text->text.t);
1223 }
1224 
1225 
1226 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1227 Recode a text to the given   encoding. Currrenly only supports re-coding
1228 to UTF-8 for ENC_ASCII, ENC_ISO_LATIN_1, ENC_WCHAR and ENC_ANSI.
1229 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
1230 
1231 int
PL_text_recode(PL_chars_t * text,IOENC encoding)1232 PL_text_recode(PL_chars_t *text, IOENC encoding)
1233 { if ( text->encoding != encoding )
1234   { switch(encoding)
1235     { case ENC_UTF8:
1236       { Buffer b;
1237 
1238 	switch(text->encoding)
1239 	{ case ENC_ASCII:
1240 	    text->encoding = ENC_UTF8;
1241 	    break;
1242 	  case ENC_ISO_LATIN_1:
1243 	  { const unsigned char *s = (const unsigned char *)text->text.t;
1244 	    const unsigned char *e = &s[text->length];
1245 
1246 	    for( ; s<e; s++)
1247 	    { if ( *s&0x80 )
1248 	      { s = (const unsigned char *)text->text.t;
1249 		goto convert_utf8;
1250 	      }
1251 	    }
1252 					/* ASCII; nothing to do */
1253 	    text->encoding = ENC_UTF8;
1254 	    break;
1255 
1256 	  convert_utf8:
1257 	    b = findBuffer(BUF_STACK);
1258 	    for( ; s<e; s++)
1259 	      utf8tobuffer(*s, b);
1260 	  swap_to_utf8:
1261 	    PL_free_text(text);
1262             text->length   = entriesBuffer(b, char);
1263 	    addBuffer(b, EOS, char);
1264 	    text->text.t   = baseBuffer(b, char);
1265 	    text->encoding = ENC_UTF8;
1266 	    text->storage  = PL_CHARS_RING;
1267 
1268 	    break;
1269 	  }
1270 	  case ENC_WCHAR:
1271 	  { const pl_wchar_t *s = text->text.w;
1272 	    const pl_wchar_t *e = &s[text->length];
1273 
1274 	    b = findBuffer(BUF_STACK);
1275 	    for( ; s<e; s++)
1276 	      utf8tobuffer(*s, b);
1277 	    goto swap_to_utf8;
1278 	  }
1279 	  case ENC_ANSI:
1280 	  { mbstate_t mbs;
1281 	    size_t rc, n = text->length;
1282 	    wchar_t wc;
1283 	    const char *s = (const char *)text->text.t;
1284 
1285 	    b = findBuffer(BUF_STACK);
1286 	    memset(&mbs, 0, sizeof(mbs));
1287 	    while( n > 0 )
1288 	    { if ( (rc=mbrtowc(&wc, s, n, &mbs)) == (size_t)-1 || rc == 0)
1289 		return FALSE;		/* encoding error */
1290 
1291 	      utf8tobuffer(wc, b);
1292 	      n -= rc;
1293 	      s += rc;
1294 	    }
1295 	    if ( n == 0 )
1296 	      goto swap_to_utf8;
1297 
1298 	    return FALSE;
1299 	  }
1300 	  default:
1301 	    assert(0);
1302 	    return FALSE;
1303 	}
1304 	return TRUE;
1305 	default:
1306 	  assert(0);
1307 	  return FALSE;
1308       }
1309     }
1310   } else
1311     return TRUE;
1312 }
1313 
1314 
1315 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1316 PL_cmp_text(PL_chars_t *t1, size_t o1,
1317 	    PL_chars_t *t2, size_t o2,
1318 	    size_t len)
1319 
1320 Compares two substrings of two text representations.
1321 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
1322 
1323 int
PL_cmp_text(PL_chars_t * t1,size_t o1,PL_chars_t * t2,size_t o2,size_t len)1324 PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2,
1325 	    size_t len)
1326 { ssize_t l = len;
1327   int ifeq = 0;
1328 
1329   if ( l > (ssize_t)(t1->length - o1) )
1330   { l = t1->length - o1;
1331     ifeq = CMP_LESS;				/* first is short */
1332   }
1333   if ( l > (ssize_t)(t2->length - o2) )
1334   { l = t2->length - o2;
1335     if ( ifeq == 0 )
1336       ifeq = CMP_GREATER;
1337   }
1338 
1339   if ( l == 0 )					/* too long offsets */
1340     return ifeq;
1341 
1342   if ( t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_ISO_LATIN_1 )
1343   { const unsigned char *s = (const unsigned char *)t1->text.t+o1;
1344     const unsigned char *q = (const unsigned char *)t2->text.t+o2;
1345 
1346     for(; l-- > 0 && *s == *q; s++, q++ )
1347       ;
1348     if ( l < 0 )
1349       return ifeq;
1350     else
1351       return *s > *q ? CMP_GREATER : CMP_LESS;
1352   } else if ( t1->encoding == ENC_WCHAR && t2->encoding == ENC_WCHAR )
1353   { const pl_wchar_t *s = t1->text.w+o1;
1354     const pl_wchar_t *q = t2->text.w+o2;
1355 
1356     for(; l-- > 0 && *s == *q; s++, q++ )
1357       ;
1358     if ( l < 0 )
1359       return ifeq;
1360     else
1361       return *s > *q ? CMP_GREATER : CMP_LESS;
1362   } else if ( t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_WCHAR )
1363   { const unsigned char *s = (const unsigned char *)t1->text.t+o1;
1364     const pl_wchar_t *q = t2->text.w+o2;
1365 
1366     for(; l-- > 0 && *s == *q; s++, q++ )
1367       ;
1368     if ( l < 0 )
1369       return ifeq;
1370     else
1371       return *s > *q ? CMP_GREATER : CMP_LESS;
1372   } else
1373   { const pl_wchar_t *s = t1->text.w+o1;
1374     const unsigned char *q = (const unsigned char *)t2->text.t+o2;
1375 
1376     for(; l-- > 0 && *s == *q; s++, q++ )
1377       ;
1378     if ( l < 0 )
1379       return ifeq;
1380     else
1381       return *s > *q ? CMP_GREATER : CMP_LESS;
1382   }
1383 }
1384 
1385 
1386 int
PL_concat_text(int n,PL_chars_t ** text,PL_chars_t * result)1387 PL_concat_text(int n, PL_chars_t **text, PL_chars_t *result)
1388 { size_t total_length = 0;
1389   int latin = TRUE;
1390   int i;
1391 
1392   for(i=0; i<n; i++)
1393   { if ( latin && !can_demote(text[i]) )
1394       latin = FALSE;
1395     total_length += text[i]->length;
1396   }
1397 
1398   result->canonical = TRUE;
1399   result->length = total_length;
1400 
1401   if ( latin )
1402   { char *to;
1403 
1404     result->encoding = ENC_ISO_LATIN_1;
1405     if ( total_length+1 < sizeof(result->buf) )
1406     { result->text.t = result->buf;
1407       result->storage = PL_CHARS_LOCAL;
1408     } else
1409     { result->text.t = PL_malloc(total_length+1);
1410       result->storage = PL_CHARS_MALLOC;
1411     }
1412 
1413     for(to=result->text.t, i=0; i<n; i++)
1414     { memcpy(to, text[i]->text.t, text[i]->length);
1415       to += text[i]->length;
1416     }
1417     *to = EOS;
1418   } else
1419   { pl_wchar_t *to;
1420 
1421     result->encoding = ENC_WCHAR;
1422     if ( total_length+1 < sizeof(result->buf)/sizeof(pl_wchar_t) )
1423     { result->text.w = (pl_wchar_t*)result->buf;
1424       result->storage = PL_CHARS_LOCAL;
1425     } else
1426     { result->text.w = PL_malloc((total_length+1)*sizeof(pl_wchar_t));
1427       result->storage = PL_CHARS_MALLOC;
1428     }
1429 
1430     for(to=result->text.w, i=0; i<n; i++)
1431     { if ( text[i]->encoding == ENC_WCHAR )
1432       { memcpy(to, text[i]->text.w, text[i]->length*sizeof(pl_wchar_t));
1433 	to += text[i]->length;
1434       } else
1435       { const unsigned char *f = (const unsigned char *)text[i]->text.t;
1436 	const unsigned char *e = &f[text[i]->length];
1437 
1438 	while(f<e)
1439 	  *to++ = *f++;
1440       }
1441     }
1442     assert((size_t)(to-result->text.w) == total_length);
1443     *to = EOS;
1444   }
1445 
1446   return TRUE;
1447 }
1448 
1449 
1450 IOSTREAM *
Sopen_text(PL_chars_t * txt,const char * mode)1451 Sopen_text(PL_chars_t *txt, const char *mode)
1452 { IOSTREAM *stream;
1453 
1454   if ( !streq(mode, "r") )
1455   { errno = EINVAL;
1456     return NULL;
1457   }
1458 
1459   stream = Sopen_string(NULL,
1460 			txt->text.t,
1461 			bufsize_text(txt, txt->length),
1462 			mode);
1463   stream->encoding = txt->encoding;
1464 
1465   return stream;
1466 }
1467