1 /*
2  * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
3  *
4  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
6  *
7  * Permission is hereby granted to use or copy this program
8  * for any purpose,  provided the above notices are retained on all copies.
9  * Permission to modify the code and to distribute modified code is granted,
10  * provided the above notices are retained, and a notice that the code was
11  * modified is included with the above copyright notice.
12  */
13 
14 /*
15  * These are functions on cords that do not need to understand their
16  * implementation.  They serve also serve as example client code for
17  * cord_basics.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #ifndef CORD_BUILD
24 # define CORD_BUILD
25 #endif
26 
27 # include <stdio.h>
28 # include <string.h>
29 # include <stdlib.h>
30 # include <stdarg.h>
31 
32 # include "cord.h"
33 # include "ec.h"
34 
35 # define I_HIDE_POINTERS    /* So we get access to allocation lock. */
36                 /* We use this for lazy file reading,   */
37                 /* so that we remain independent        */
38                 /* of the threads primitives.           */
39 # include "gc.h"
40 
41 /* For now we assume that pointer reads and writes are atomic,  */
42 /* i.e. another thread always sees the state before or after    */
43 /* a write.  This might be false on a Motorola M68K with        */
44 /* pointers that are not 32-bit aligned.  But there probably    */
45 /* aren't too many threads packages running on those.           */
46 # define ATOMIC_WRITE(x,y) (x) = (y)
47 # define ATOMIC_READ(x) (*(x))
48 
49 /* The standard says these are in stdio.h, but they aren't always: */
50 # ifndef SEEK_SET
51 #   define SEEK_SET 0
52 # endif
53 # ifndef SEEK_END
54 #   define SEEK_END 2
55 # endif
56 
57 # define BUFSZ 2048     /* Size of stack allocated buffers when */
58                         /* we want large buffers.               */
59 
60 typedef void (* oom_fn)(void);
61 
62 # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
63                          ABORT("Out of memory"); }
64 # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
65 
66 #if GC_GNUC_PREREQ(3, 4)
67 # define CORD_ATTR_UNUSED __attribute__((__unused__))
68 #else
69 # define CORD_ATTR_UNUSED /* empty */
70 #endif
71 
CORD_cat_char(CORD x,char c)72 CORD CORD_cat_char(CORD x, char c)
73 {
74     char * string;
75 
76     if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
77     string = (char *)GC_MALLOC_ATOMIC(2);
78     if (string == 0) OUT_OF_MEMORY;
79     string[0] = c;
80     string[1] = '\0';
81     return(CORD_cat_char_star(x, string, 1));
82 }
83 
CORD_catn(int nargs,...)84 CORD CORD_catn(int nargs, ...)
85 {
86     CORD result = CORD_EMPTY;
87     va_list args;
88     int i;
89 
90     va_start(args, nargs);
91     for (i = 0; i < nargs; i++) {
92         CORD next = va_arg(args, CORD);
93         result = CORD_cat(result, next);
94     }
95     va_end(args);
96     return(result);
97 }
98 
99 typedef struct {
100     size_t len;
101     size_t count;
102     char * buf;
103 } CORD_fill_data;
104 
CORD_fill_proc(char c,void * client_data)105 int CORD_fill_proc(char c, void * client_data)
106 {
107     CORD_fill_data * d = (CORD_fill_data *)client_data;
108     size_t count = d -> count;
109 
110     (d -> buf)[count] = c;
111     d -> count = ++count;
112     if (count >= d -> len) {
113         return(1);
114     } else {
115         return(0);
116     }
117 }
118 
CORD_batched_fill_proc(const char * s,void * client_data)119 int CORD_batched_fill_proc(const char * s, void * client_data)
120 {
121     CORD_fill_data * d = (CORD_fill_data *)client_data;
122     size_t count = d -> count;
123     size_t max = d -> len;
124     char * buf = d -> buf;
125     const char * t = s;
126 
127     while((buf[count] = *t++) != '\0') {
128         count++;
129         if (count >= max) {
130             d -> count = count;
131             return(1);
132         }
133     }
134     d -> count = count;
135     return(0);
136 }
137 
138 /* Fill buf with len characters starting at i.  */
139 /* Assumes len characters are available in buf. */
140 /* Return 1 if buf is filled fully (and len is  */
141 /* non-zero), 0 otherwise.                      */
CORD_fill_buf(CORD x,size_t i,size_t len,char * buf)142 int CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
143 {
144     CORD_fill_data fd;
145 
146     fd.len = len;
147     fd.buf = buf;
148     fd.count = 0;
149     return CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
150 }
151 
CORD_cmp(CORD x,CORD y)152 int CORD_cmp(CORD x, CORD y)
153 {
154     CORD_pos xpos;
155     CORD_pos ypos;
156 
157     if (y == CORD_EMPTY) return(x != CORD_EMPTY);
158     if (x == CORD_EMPTY) return(-1);
159     if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
160     CORD_set_pos(xpos, x, 0);
161     CORD_set_pos(ypos, y, 0);
162     for(;;) {
163         size_t avail, yavail;
164 
165         if (!CORD_pos_valid(xpos)) {
166             if (CORD_pos_valid(ypos)) {
167                 return(-1);
168             } else {
169                 return(0);
170             }
171         }
172         if (!CORD_pos_valid(ypos)) {
173             return(1);
174         }
175         avail = CORD_pos_chars_left(xpos);
176         if (avail == 0
177             || (yavail = CORD_pos_chars_left(ypos)) == 0) {
178             char xcurrent = CORD_pos_fetch(xpos);
179             char ycurrent = CORD_pos_fetch(ypos);
180             if (xcurrent != ycurrent) return(xcurrent - ycurrent);
181             CORD_next(xpos);
182             CORD_next(ypos);
183         } else {
184             /* process as many characters as we can */
185             int result;
186 
187             if (avail > yavail) avail = yavail;
188             result = strncmp(CORD_pos_cur_char_addr(xpos),
189                          CORD_pos_cur_char_addr(ypos), avail);
190             if (result != 0) return(result);
191             CORD_pos_advance(xpos, avail);
192             CORD_pos_advance(ypos, avail);
193         }
194     }
195 }
196 
CORD_ncmp(CORD x,size_t x_start,CORD y,size_t y_start,size_t len)197 int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
198 {
199     CORD_pos xpos;
200     CORD_pos ypos;
201     size_t count;
202 
203     CORD_set_pos(xpos, x, x_start);
204     CORD_set_pos(ypos, y, y_start);
205     for(count = 0; count < len;) {
206         long avail, yavail;
207 
208         if (!CORD_pos_valid(xpos)) {
209             if (CORD_pos_valid(ypos)) {
210                 return(-1);
211             } else {
212                 return(0);
213             }
214         }
215         if (!CORD_pos_valid(ypos)) {
216             return(1);
217         }
218         if ((avail = CORD_pos_chars_left(xpos)) <= 0
219             || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
220             char xcurrent = CORD_pos_fetch(xpos);
221             char ycurrent = CORD_pos_fetch(ypos);
222 
223             if (xcurrent != ycurrent) return(xcurrent - ycurrent);
224             CORD_next(xpos);
225             CORD_next(ypos);
226             count++;
227         } else {
228             /* process as many characters as we can */
229             int result;
230 
231             if (avail > yavail) avail = yavail;
232             count += avail;
233             if (count > len)
234                 avail -= (long)(count - len);
235             result = strncmp(CORD_pos_cur_char_addr(xpos),
236                          CORD_pos_cur_char_addr(ypos), (size_t)avail);
237             if (result != 0) return(result);
238             CORD_pos_advance(xpos, (size_t)avail);
239             CORD_pos_advance(ypos, (size_t)avail);
240         }
241     }
242     return(0);
243 }
244 
CORD_to_char_star(CORD x)245 char * CORD_to_char_star(CORD x)
246 {
247     size_t len = CORD_len(x);
248     char * result = (char *)GC_MALLOC_ATOMIC(len + 1);
249 
250     if (result == 0) OUT_OF_MEMORY;
251     if (len > 0 && CORD_fill_buf(x, 0, len, result) != 1)
252       ABORT("CORD_fill_buf malfunction");
253     result[len] = '\0';
254     return(result);
255 }
256 
CORD_from_char_star(const char * s)257 CORD CORD_from_char_star(const char *s)
258 {
259     char * result;
260     size_t len = strlen(s);
261 
262     if (0 == len) return(CORD_EMPTY);
263     result = (char *)GC_MALLOC_ATOMIC(len + 1);
264     if (result == 0) OUT_OF_MEMORY;
265     memcpy(result, s, len+1);
266     return(result);
267 }
268 
CORD_to_const_char_star(CORD x)269 const char * CORD_to_const_char_star(CORD x)
270 {
271     if (x == 0) return("");
272     if (CORD_IS_STRING(x)) return((const char *)x);
273     return(CORD_to_char_star(x));
274 }
275 
CORD_fetch(CORD x,size_t i)276 char CORD_fetch(CORD x, size_t i)
277 {
278     CORD_pos xpos;
279 
280     CORD_set_pos(xpos, x, i);
281     if (!CORD_pos_valid(xpos)) ABORT("bad index?");
282     return(CORD_pos_fetch(xpos));
283 }
284 
285 
CORD_put_proc(char c,void * client_data)286 int CORD_put_proc(char c, void * client_data)
287 {
288     FILE * f = (FILE *)client_data;
289 
290     return(putc(c, f) == EOF);
291 }
292 
CORD_batched_put_proc(const char * s,void * client_data)293 int CORD_batched_put_proc(const char * s, void * client_data)
294 {
295     FILE * f = (FILE *)client_data;
296 
297     return(fputs(s, f) == EOF);
298 }
299 
300 
CORD_put(CORD x,FILE * f)301 int CORD_put(CORD x, FILE * f)
302 {
303     if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
304         return(EOF);
305     } else {
306         return(1);
307     }
308 }
309 
310 typedef struct {
311     size_t pos;     /* Current position in the cord */
312     char target;    /* Character we're looking for  */
313 } chr_data;
314 
CORD_chr_proc(char c,void * client_data)315 int CORD_chr_proc(char c, void * client_data)
316 {
317     chr_data * d = (chr_data *)client_data;
318 
319     if (c == d -> target) return(1);
320     (d -> pos) ++;
321     return(0);
322 }
323 
CORD_rchr_proc(char c,void * client_data)324 int CORD_rchr_proc(char c, void * client_data)
325 {
326     chr_data * d = (chr_data *)client_data;
327 
328     if (c == d -> target) return(1);
329     (d -> pos) --;
330     return(0);
331 }
332 
CORD_batched_chr_proc(const char * s,void * client_data)333 int CORD_batched_chr_proc(const char *s, void * client_data)
334 {
335     chr_data * d = (chr_data *)client_data;
336     const char * occ = strchr(s, d -> target);
337 
338     if (NULL == occ) {
339         d -> pos += strlen(s);
340         return(0);
341     } else {
342         d -> pos += occ - s;
343         return(1);
344     }
345 }
346 
CORD_chr(CORD x,size_t i,int c)347 size_t CORD_chr(CORD x, size_t i, int c)
348 {
349     chr_data d;
350 
351     d.pos = i;
352     d.target = (char)c;
353     if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
354         return(d.pos);
355     } else {
356         return(CORD_NOT_FOUND);
357     }
358 }
359 
CORD_rchr(CORD x,size_t i,int c)360 size_t CORD_rchr(CORD x, size_t i, int c)
361 {
362     chr_data d;
363 
364     d.pos = i;
365     d.target = (char)c;
366     if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
367         return(d.pos);
368     } else {
369         return(CORD_NOT_FOUND);
370     }
371 }
372 
373 /* Find the first occurrence of s in x at position start or later.      */
374 /* This uses an asymptotically poor algorithm, which should typically   */
375 /* perform acceptably.  We compare the first few characters directly,   */
376 /* and call CORD_ncmp whenever there is a partial match.                */
377 /* This has the advantage that we allocate very little, or not at all.  */
378 /* It's very fast if there are few close misses.                        */
CORD_str(CORD x,size_t start,CORD s)379 size_t CORD_str(CORD x, size_t start, CORD s)
380 {
381     CORD_pos xpos;
382     size_t xlen = CORD_len(x);
383     size_t slen;
384     size_t start_len;
385     const char * s_start;
386     unsigned long s_buf = 0;    /* The first few characters of s        */
387     unsigned long x_buf = 0;    /* Start of candidate substring.        */
388                     /* Initialized only to make compilers   */
389                     /* happy.                               */
390     unsigned long mask = 0;
391     size_t i;
392     size_t match_pos;
393 
394     if (s == CORD_EMPTY) return(start);
395     if (CORD_IS_STRING(s)) {
396         s_start = s;
397         slen = strlen(s);
398     } else {
399         s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
400         slen = CORD_len(s);
401     }
402     if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
403     start_len = slen;
404     if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
405     CORD_set_pos(xpos, x, start);
406     for (i = 0; i < start_len; i++) {
407         mask <<= 8;
408         mask |= 0xff;
409         s_buf <<= 8;
410         s_buf |= (unsigned char)s_start[i];
411         x_buf <<= 8;
412         x_buf |= (unsigned char)CORD_pos_fetch(xpos);
413         CORD_next(xpos);
414     }
415     for (match_pos = start; ; match_pos++) {
416         if ((x_buf & mask) == s_buf) {
417             if (slen == start_len ||
418                 CORD_ncmp(x, match_pos + start_len,
419                       s, start_len, slen - start_len) == 0) {
420                 return(match_pos);
421             }
422         }
423     if ( match_pos == xlen - slen ) {
424         return(CORD_NOT_FOUND);
425     }
426         x_buf <<= 8;
427         x_buf |= (unsigned char)CORD_pos_fetch(xpos);
428         CORD_next(xpos);
429     }
430 }
431 
CORD_ec_flush_buf(CORD_ec x)432 void CORD_ec_flush_buf(CORD_ec x)
433 {
434     size_t len = x[0].ec_bufptr - x[0].ec_buf;
435     char * s;
436 
437     if (len == 0) return;
438     s = (char *)GC_MALLOC_ATOMIC(len + 1);
439     if (NULL == s) OUT_OF_MEMORY;
440     memcpy(s, x[0].ec_buf, len);
441     s[len] = '\0';
442     x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
443     x[0].ec_bufptr = x[0].ec_buf;
444 }
445 
CORD_ec_append_cord(CORD_ec x,CORD s)446 void CORD_ec_append_cord(CORD_ec x, CORD s)
447 {
448     CORD_ec_flush_buf(x);
449     x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
450 }
451 
CORD_nul_func(size_t i CORD_ATTR_UNUSED,void * client_data)452 char CORD_nul_func(size_t i CORD_ATTR_UNUSED, void * client_data)
453 {
454     return (char)(GC_word)client_data;
455 }
456 
CORD_chars(char c,size_t i)457 CORD CORD_chars(char c, size_t i)
458 {
459     return CORD_from_fn(CORD_nul_func, (void *)(GC_word)(unsigned char)c, i);
460 }
461 
CORD_from_file_eager(FILE * f)462 CORD CORD_from_file_eager(FILE * f)
463 {
464     CORD_ec ecord;
465 
466     CORD_ec_init(ecord);
467     for(;;) {
468         int c = getc(f);
469 
470         if (c == 0) {
471           /* Append the right number of NULs                            */
472           /* Note that any string of NULs is represented in 4 words,    */
473           /* independent of its length.                                 */
474             size_t count = 1;
475 
476             CORD_ec_flush_buf(ecord);
477             while ((c = getc(f)) == 0) count++;
478             ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
479         }
480         if (c == EOF) break;
481         CORD_ec_append(ecord, (char)c);
482     }
483     (void) fclose(f);
484     return(CORD_balance(CORD_ec_to_cord(ecord)));
485 }
486 
487 /* The state maintained for a lazily read file consists primarily       */
488 /* of a large direct-mapped cache of previously read values.            */
489 /* We could rely more on stdio buffering.  That would have 2            */
490 /* disadvantages:                                                       */
491 /*  1) Empirically, not all fseek implementations preserve the          */
492 /*     buffer whenever they could.                                      */
493 /*  2) It would fail if 2 different sections of a long cord             */
494 /*     were being read alternately.                                     */
495 /* We do use the stdio buffer for read ahead.                           */
496 /* To guarantee thread safety in the presence of atomic pointer         */
497 /* writes, cache lines are always replaced, and never modified in       */
498 /* place.                                                               */
499 
500 # define LOG_CACHE_SZ 14
501 # define CACHE_SZ (1 << LOG_CACHE_SZ)
502 # define LOG_LINE_SZ 9
503 # define LINE_SZ (1 << LOG_LINE_SZ)
504 
505 typedef struct {
506     size_t tag;
507     char data[LINE_SZ];
508         /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ    */
509 } cache_line;
510 
511 typedef struct {
512     FILE * lf_file;
513     size_t lf_current;  /* Current file pointer value */
514     cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
515 } lf_state;
516 
517 # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
518 # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
519 # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
520 # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
521 # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
522 
523 typedef struct {
524     lf_state * state;
525     size_t file_pos;    /* Position of needed character. */
526     cache_line * new_cache;
527 } refill_data;
528 
529 /* Executed with allocation lock. */
refill_cache(void * client_data)530 static void * GC_CALLBACK refill_cache(void * client_data)
531 {
532     lf_state * state = ((refill_data *)client_data) -> state;
533     size_t file_pos = ((refill_data *)client_data) -> file_pos;
534     FILE *f = state -> lf_file;
535     size_t line_start = LINE_START(file_pos);
536     size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
537     cache_line * new_cache = ((refill_data *)client_data) -> new_cache;
538 
539     if (line_start != state -> lf_current
540         && fseek(f, (long)line_start, SEEK_SET) != 0) {
541             ABORT("fseek failed");
542     }
543     if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
544         <= file_pos - line_start) {
545         ABORT("fread failed");
546     }
547     new_cache -> tag = DIV_LINE_SZ(file_pos);
548     /* Store barrier goes here. */
549     ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
550     GC_END_STUBBORN_CHANGE((/* no volatile */ void *)(state -> lf_cache
551                                                       + line_no));
552     state -> lf_current = line_start + LINE_SZ;
553     return (void *)((GC_word)new_cache->data[MOD_LINE_SZ(file_pos)]);
554 }
555 
CORD_lf_func(size_t i,void * client_data)556 char CORD_lf_func(size_t i, void * client_data)
557 {
558     lf_state * state = (lf_state *)client_data;
559     cache_line * volatile * cl_addr =
560                         &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
561     cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
562 
563     if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
564         /* Cache miss */
565         refill_data rd;
566 
567         rd.state = state;
568         rd.file_pos =  i;
569         rd.new_cache = GC_NEW_ATOMIC(cache_line);
570         if (rd.new_cache == 0) OUT_OF_MEMORY;
571         return (char)((GC_word)GC_call_with_alloc_lock(refill_cache, &rd));
572     }
573     return(cl -> data[MOD_LINE_SZ(i)]);
574 }
575 
CORD_lf_close_proc(void * obj,void * client_data CORD_ATTR_UNUSED)576 void CORD_lf_close_proc(void * obj, void * client_data CORD_ATTR_UNUSED)
577 {
578     if (fclose(((lf_state *)obj) -> lf_file) != 0) {
579         ABORT("CORD_lf_close_proc: fclose failed");
580     }
581 }
582 
CORD_from_file_lazy_inner(FILE * f,size_t len)583 CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
584 {
585     lf_state * state = GC_NEW(lf_state);
586     int i;
587 
588     if (state == 0) OUT_OF_MEMORY;
589     if (len != 0) {
590         /* Dummy read to force buffer allocation.       */
591         /* This greatly increases the probability       */
592         /* of avoiding deadlock if buffer allocation    */
593         /* is redirected to GC_malloc and the           */
594         /* world is multi-threaded.                     */
595         char buf[1];
596 
597         if (fread(buf, 1, 1, f) > 1
598             || fseek(f, 0l, SEEK_SET) != 0) {
599             ABORT("Bad f argument or I/O failure");
600         }
601     }
602     state -> lf_file = f;
603     for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
604         state -> lf_cache[i] = 0;
605     }
606     state -> lf_current = 0;
607     GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
608     return(CORD_from_fn(CORD_lf_func, state, len));
609 }
610 
CORD_from_file_lazy(FILE * f)611 CORD CORD_from_file_lazy(FILE * f)
612 {
613     long len;
614 
615     if (fseek(f, 0l, SEEK_END) != 0
616         || (len = ftell(f)) < 0
617         || fseek(f, 0l, SEEK_SET) != 0) {
618         ABORT("Bad f argument or I/O failure");
619     }
620     return(CORD_from_file_lazy_inner(f, (size_t)len));
621 }
622 
623 # define LAZY_THRESHOLD (128*1024 + 1)
624 
CORD_from_file(FILE * f)625 CORD CORD_from_file(FILE * f)
626 {
627     long len;
628 
629     if (fseek(f, 0l, SEEK_END) != 0
630         || (len = ftell(f)) < 0
631         || fseek(f, 0l, SEEK_SET) != 0) {
632         ABORT("Bad f argument or I/O failure");
633     }
634     if (len < LAZY_THRESHOLD) {
635         return(CORD_from_file_eager(f));
636     } else {
637         return(CORD_from_file_lazy_inner(f, (size_t)len));
638     }
639 }
640