1 /*
2 * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
3 *
4 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5 * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
6 *
7 * Permission is hereby granted to use or copy this program
8 * for any purpose, provided the above notices are retained on all copies.
9 * Permission to modify the code and to distribute modified code is granted,
10 * provided the above notices are retained, and a notice that the code was
11 * modified is included with the above copyright notice.
12 */
13
14 /*
15 * These are functions on cords that do not need to understand their
16 * implementation. They serve also serve as example client code for
17 * cord_basics.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #ifndef CORD_BUILD
24 # define CORD_BUILD
25 #endif
26
27 # include <stdio.h>
28 # include <string.h>
29 # include <stdlib.h>
30 # include <stdarg.h>
31
32 # include "cord.h"
33 # include "ec.h"
34
35 # define I_HIDE_POINTERS /* So we get access to allocation lock. */
36 /* We use this for lazy file reading, */
37 /* so that we remain independent */
38 /* of the threads primitives. */
39 # include "gc.h"
40
41 /* For now we assume that pointer reads and writes are atomic, */
42 /* i.e. another thread always sees the state before or after */
43 /* a write. This might be false on a Motorola M68K with */
44 /* pointers that are not 32-bit aligned. But there probably */
45 /* aren't too many threads packages running on those. */
46 # define ATOMIC_WRITE(x,y) (x) = (y)
47 # define ATOMIC_READ(x) (*(x))
48
49 /* The standard says these are in stdio.h, but they aren't always: */
50 # ifndef SEEK_SET
51 # define SEEK_SET 0
52 # endif
53 # ifndef SEEK_END
54 # define SEEK_END 2
55 # endif
56
57 # define BUFSZ 2048 /* Size of stack allocated buffers when */
58 /* we want large buffers. */
59
60 typedef void (* oom_fn)(void);
61
62 # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
63 ABORT("Out of memory"); }
64 # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
65
66 #if GC_GNUC_PREREQ(3, 4)
67 # define CORD_ATTR_UNUSED __attribute__((__unused__))
68 #else
69 # define CORD_ATTR_UNUSED /* empty */
70 #endif
71
CORD_cat_char(CORD x,char c)72 CORD CORD_cat_char(CORD x, char c)
73 {
74 char * string;
75
76 if (c == '\0') return(CORD_cat(x, CORD_nul(1)));
77 string = (char *)GC_MALLOC_ATOMIC(2);
78 if (string == 0) OUT_OF_MEMORY;
79 string[0] = c;
80 string[1] = '\0';
81 return(CORD_cat_char_star(x, string, 1));
82 }
83
CORD_catn(int nargs,...)84 CORD CORD_catn(int nargs, ...)
85 {
86 CORD result = CORD_EMPTY;
87 va_list args;
88 int i;
89
90 va_start(args, nargs);
91 for (i = 0; i < nargs; i++) {
92 CORD next = va_arg(args, CORD);
93 result = CORD_cat(result, next);
94 }
95 va_end(args);
96 return(result);
97 }
98
99 typedef struct {
100 size_t len;
101 size_t count;
102 char * buf;
103 } CORD_fill_data;
104
CORD_fill_proc(char c,void * client_data)105 int CORD_fill_proc(char c, void * client_data)
106 {
107 CORD_fill_data * d = (CORD_fill_data *)client_data;
108 size_t count = d -> count;
109
110 (d -> buf)[count] = c;
111 d -> count = ++count;
112 if (count >= d -> len) {
113 return(1);
114 } else {
115 return(0);
116 }
117 }
118
CORD_batched_fill_proc(const char * s,void * client_data)119 int CORD_batched_fill_proc(const char * s, void * client_data)
120 {
121 CORD_fill_data * d = (CORD_fill_data *)client_data;
122 size_t count = d -> count;
123 size_t max = d -> len;
124 char * buf = d -> buf;
125 const char * t = s;
126
127 while((buf[count] = *t++) != '\0') {
128 count++;
129 if (count >= max) {
130 d -> count = count;
131 return(1);
132 }
133 }
134 d -> count = count;
135 return(0);
136 }
137
138 /* Fill buf with len characters starting at i. */
139 /* Assumes len characters are available in buf. */
140 /* Return 1 if buf is filled fully (and len is */
141 /* non-zero), 0 otherwise. */
CORD_fill_buf(CORD x,size_t i,size_t len,char * buf)142 int CORD_fill_buf(CORD x, size_t i, size_t len, char * buf)
143 {
144 CORD_fill_data fd;
145
146 fd.len = len;
147 fd.buf = buf;
148 fd.count = 0;
149 return CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd);
150 }
151
CORD_cmp(CORD x,CORD y)152 int CORD_cmp(CORD x, CORD y)
153 {
154 CORD_pos xpos;
155 CORD_pos ypos;
156
157 if (y == CORD_EMPTY) return(x != CORD_EMPTY);
158 if (x == CORD_EMPTY) return(-1);
159 if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y));
160 CORD_set_pos(xpos, x, 0);
161 CORD_set_pos(ypos, y, 0);
162 for(;;) {
163 size_t avail, yavail;
164
165 if (!CORD_pos_valid(xpos)) {
166 if (CORD_pos_valid(ypos)) {
167 return(-1);
168 } else {
169 return(0);
170 }
171 }
172 if (!CORD_pos_valid(ypos)) {
173 return(1);
174 }
175 avail = CORD_pos_chars_left(xpos);
176 if (avail == 0
177 || (yavail = CORD_pos_chars_left(ypos)) == 0) {
178 char xcurrent = CORD_pos_fetch(xpos);
179 char ycurrent = CORD_pos_fetch(ypos);
180 if (xcurrent != ycurrent) return(xcurrent - ycurrent);
181 CORD_next(xpos);
182 CORD_next(ypos);
183 } else {
184 /* process as many characters as we can */
185 int result;
186
187 if (avail > yavail) avail = yavail;
188 result = strncmp(CORD_pos_cur_char_addr(xpos),
189 CORD_pos_cur_char_addr(ypos), avail);
190 if (result != 0) return(result);
191 CORD_pos_advance(xpos, avail);
192 CORD_pos_advance(ypos, avail);
193 }
194 }
195 }
196
CORD_ncmp(CORD x,size_t x_start,CORD y,size_t y_start,size_t len)197 int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len)
198 {
199 CORD_pos xpos;
200 CORD_pos ypos;
201 size_t count;
202
203 CORD_set_pos(xpos, x, x_start);
204 CORD_set_pos(ypos, y, y_start);
205 for(count = 0; count < len;) {
206 long avail, yavail;
207
208 if (!CORD_pos_valid(xpos)) {
209 if (CORD_pos_valid(ypos)) {
210 return(-1);
211 } else {
212 return(0);
213 }
214 }
215 if (!CORD_pos_valid(ypos)) {
216 return(1);
217 }
218 if ((avail = CORD_pos_chars_left(xpos)) <= 0
219 || (yavail = CORD_pos_chars_left(ypos)) <= 0) {
220 char xcurrent = CORD_pos_fetch(xpos);
221 char ycurrent = CORD_pos_fetch(ypos);
222
223 if (xcurrent != ycurrent) return(xcurrent - ycurrent);
224 CORD_next(xpos);
225 CORD_next(ypos);
226 count++;
227 } else {
228 /* process as many characters as we can */
229 int result;
230
231 if (avail > yavail) avail = yavail;
232 count += avail;
233 if (count > len)
234 avail -= (long)(count - len);
235 result = strncmp(CORD_pos_cur_char_addr(xpos),
236 CORD_pos_cur_char_addr(ypos), (size_t)avail);
237 if (result != 0) return(result);
238 CORD_pos_advance(xpos, (size_t)avail);
239 CORD_pos_advance(ypos, (size_t)avail);
240 }
241 }
242 return(0);
243 }
244
CORD_to_char_star(CORD x)245 char * CORD_to_char_star(CORD x)
246 {
247 size_t len = CORD_len(x);
248 char * result = (char *)GC_MALLOC_ATOMIC(len + 1);
249
250 if (result == 0) OUT_OF_MEMORY;
251 if (len > 0 && CORD_fill_buf(x, 0, len, result) != 1)
252 ABORT("CORD_fill_buf malfunction");
253 result[len] = '\0';
254 return(result);
255 }
256
CORD_from_char_star(const char * s)257 CORD CORD_from_char_star(const char *s)
258 {
259 char * result;
260 size_t len = strlen(s);
261
262 if (0 == len) return(CORD_EMPTY);
263 result = (char *)GC_MALLOC_ATOMIC(len + 1);
264 if (result == 0) OUT_OF_MEMORY;
265 memcpy(result, s, len+1);
266 return(result);
267 }
268
CORD_to_const_char_star(CORD x)269 const char * CORD_to_const_char_star(CORD x)
270 {
271 if (x == 0) return("");
272 if (CORD_IS_STRING(x)) return((const char *)x);
273 return(CORD_to_char_star(x));
274 }
275
CORD_fetch(CORD x,size_t i)276 char CORD_fetch(CORD x, size_t i)
277 {
278 CORD_pos xpos;
279
280 CORD_set_pos(xpos, x, i);
281 if (!CORD_pos_valid(xpos)) ABORT("bad index?");
282 return(CORD_pos_fetch(xpos));
283 }
284
285
CORD_put_proc(char c,void * client_data)286 int CORD_put_proc(char c, void * client_data)
287 {
288 FILE * f = (FILE *)client_data;
289
290 return(putc(c, f) == EOF);
291 }
292
CORD_batched_put_proc(const char * s,void * client_data)293 int CORD_batched_put_proc(const char * s, void * client_data)
294 {
295 FILE * f = (FILE *)client_data;
296
297 return(fputs(s, f) == EOF);
298 }
299
300
CORD_put(CORD x,FILE * f)301 int CORD_put(CORD x, FILE * f)
302 {
303 if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) {
304 return(EOF);
305 } else {
306 return(1);
307 }
308 }
309
310 typedef struct {
311 size_t pos; /* Current position in the cord */
312 char target; /* Character we're looking for */
313 } chr_data;
314
CORD_chr_proc(char c,void * client_data)315 int CORD_chr_proc(char c, void * client_data)
316 {
317 chr_data * d = (chr_data *)client_data;
318
319 if (c == d -> target) return(1);
320 (d -> pos) ++;
321 return(0);
322 }
323
CORD_rchr_proc(char c,void * client_data)324 int CORD_rchr_proc(char c, void * client_data)
325 {
326 chr_data * d = (chr_data *)client_data;
327
328 if (c == d -> target) return(1);
329 (d -> pos) --;
330 return(0);
331 }
332
CORD_batched_chr_proc(const char * s,void * client_data)333 int CORD_batched_chr_proc(const char *s, void * client_data)
334 {
335 chr_data * d = (chr_data *)client_data;
336 const char * occ = strchr(s, d -> target);
337
338 if (NULL == occ) {
339 d -> pos += strlen(s);
340 return(0);
341 } else {
342 d -> pos += occ - s;
343 return(1);
344 }
345 }
346
CORD_chr(CORD x,size_t i,int c)347 size_t CORD_chr(CORD x, size_t i, int c)
348 {
349 chr_data d;
350
351 d.pos = i;
352 d.target = (char)c;
353 if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) {
354 return(d.pos);
355 } else {
356 return(CORD_NOT_FOUND);
357 }
358 }
359
CORD_rchr(CORD x,size_t i,int c)360 size_t CORD_rchr(CORD x, size_t i, int c)
361 {
362 chr_data d;
363
364 d.pos = i;
365 d.target = (char)c;
366 if (CORD_riter4(x, i, CORD_rchr_proc, &d)) {
367 return(d.pos);
368 } else {
369 return(CORD_NOT_FOUND);
370 }
371 }
372
373 /* Find the first occurrence of s in x at position start or later. */
374 /* This uses an asymptotically poor algorithm, which should typically */
375 /* perform acceptably. We compare the first few characters directly, */
376 /* and call CORD_ncmp whenever there is a partial match. */
377 /* This has the advantage that we allocate very little, or not at all. */
378 /* It's very fast if there are few close misses. */
CORD_str(CORD x,size_t start,CORD s)379 size_t CORD_str(CORD x, size_t start, CORD s)
380 {
381 CORD_pos xpos;
382 size_t xlen = CORD_len(x);
383 size_t slen;
384 size_t start_len;
385 const char * s_start;
386 unsigned long s_buf = 0; /* The first few characters of s */
387 unsigned long x_buf = 0; /* Start of candidate substring. */
388 /* Initialized only to make compilers */
389 /* happy. */
390 unsigned long mask = 0;
391 size_t i;
392 size_t match_pos;
393
394 if (s == CORD_EMPTY) return(start);
395 if (CORD_IS_STRING(s)) {
396 s_start = s;
397 slen = strlen(s);
398 } else {
399 s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long)));
400 slen = CORD_len(s);
401 }
402 if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND);
403 start_len = slen;
404 if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long);
405 CORD_set_pos(xpos, x, start);
406 for (i = 0; i < start_len; i++) {
407 mask <<= 8;
408 mask |= 0xff;
409 s_buf <<= 8;
410 s_buf |= (unsigned char)s_start[i];
411 x_buf <<= 8;
412 x_buf |= (unsigned char)CORD_pos_fetch(xpos);
413 CORD_next(xpos);
414 }
415 for (match_pos = start; ; match_pos++) {
416 if ((x_buf & mask) == s_buf) {
417 if (slen == start_len ||
418 CORD_ncmp(x, match_pos + start_len,
419 s, start_len, slen - start_len) == 0) {
420 return(match_pos);
421 }
422 }
423 if ( match_pos == xlen - slen ) {
424 return(CORD_NOT_FOUND);
425 }
426 x_buf <<= 8;
427 x_buf |= (unsigned char)CORD_pos_fetch(xpos);
428 CORD_next(xpos);
429 }
430 }
431
CORD_ec_flush_buf(CORD_ec x)432 void CORD_ec_flush_buf(CORD_ec x)
433 {
434 size_t len = x[0].ec_bufptr - x[0].ec_buf;
435 char * s;
436
437 if (len == 0) return;
438 s = (char *)GC_MALLOC_ATOMIC(len + 1);
439 if (NULL == s) OUT_OF_MEMORY;
440 memcpy(s, x[0].ec_buf, len);
441 s[len] = '\0';
442 x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len);
443 x[0].ec_bufptr = x[0].ec_buf;
444 }
445
CORD_ec_append_cord(CORD_ec x,CORD s)446 void CORD_ec_append_cord(CORD_ec x, CORD s)
447 {
448 CORD_ec_flush_buf(x);
449 x[0].ec_cord = CORD_cat(x[0].ec_cord, s);
450 }
451
CORD_nul_func(size_t i CORD_ATTR_UNUSED,void * client_data)452 char CORD_nul_func(size_t i CORD_ATTR_UNUSED, void * client_data)
453 {
454 return (char)(GC_word)client_data;
455 }
456
CORD_chars(char c,size_t i)457 CORD CORD_chars(char c, size_t i)
458 {
459 return CORD_from_fn(CORD_nul_func, (void *)(GC_word)(unsigned char)c, i);
460 }
461
CORD_from_file_eager(FILE * f)462 CORD CORD_from_file_eager(FILE * f)
463 {
464 CORD_ec ecord;
465
466 CORD_ec_init(ecord);
467 for(;;) {
468 int c = getc(f);
469
470 if (c == 0) {
471 /* Append the right number of NULs */
472 /* Note that any string of NULs is represented in 4 words, */
473 /* independent of its length. */
474 size_t count = 1;
475
476 CORD_ec_flush_buf(ecord);
477 while ((c = getc(f)) == 0) count++;
478 ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count));
479 }
480 if (c == EOF) break;
481 CORD_ec_append(ecord, (char)c);
482 }
483 (void) fclose(f);
484 return(CORD_balance(CORD_ec_to_cord(ecord)));
485 }
486
487 /* The state maintained for a lazily read file consists primarily */
488 /* of a large direct-mapped cache of previously read values. */
489 /* We could rely more on stdio buffering. That would have 2 */
490 /* disadvantages: */
491 /* 1) Empirically, not all fseek implementations preserve the */
492 /* buffer whenever they could. */
493 /* 2) It would fail if 2 different sections of a long cord */
494 /* were being read alternately. */
495 /* We do use the stdio buffer for read ahead. */
496 /* To guarantee thread safety in the presence of atomic pointer */
497 /* writes, cache lines are always replaced, and never modified in */
498 /* place. */
499
500 # define LOG_CACHE_SZ 14
501 # define CACHE_SZ (1 << LOG_CACHE_SZ)
502 # define LOG_LINE_SZ 9
503 # define LINE_SZ (1 << LOG_LINE_SZ)
504
505 typedef struct {
506 size_t tag;
507 char data[LINE_SZ];
508 /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
509 } cache_line;
510
511 typedef struct {
512 FILE * lf_file;
513 size_t lf_current; /* Current file pointer value */
514 cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ];
515 } lf_state;
516
517 # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
518 # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
519 # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
520 # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
521 # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
522
523 typedef struct {
524 lf_state * state;
525 size_t file_pos; /* Position of needed character. */
526 cache_line * new_cache;
527 } refill_data;
528
529 /* Executed with allocation lock. */
refill_cache(void * client_data)530 static void * GC_CALLBACK refill_cache(void * client_data)
531 {
532 lf_state * state = ((refill_data *)client_data) -> state;
533 size_t file_pos = ((refill_data *)client_data) -> file_pos;
534 FILE *f = state -> lf_file;
535 size_t line_start = LINE_START(file_pos);
536 size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos));
537 cache_line * new_cache = ((refill_data *)client_data) -> new_cache;
538
539 if (line_start != state -> lf_current
540 && fseek(f, (long)line_start, SEEK_SET) != 0) {
541 ABORT("fseek failed");
542 }
543 if (fread(new_cache -> data, sizeof(char), LINE_SZ, f)
544 <= file_pos - line_start) {
545 ABORT("fread failed");
546 }
547 new_cache -> tag = DIV_LINE_SZ(file_pos);
548 /* Store barrier goes here. */
549 ATOMIC_WRITE(state -> lf_cache[line_no], new_cache);
550 GC_END_STUBBORN_CHANGE((/* no volatile */ void *)(state -> lf_cache
551 + line_no));
552 state -> lf_current = line_start + LINE_SZ;
553 return (void *)((GC_word)new_cache->data[MOD_LINE_SZ(file_pos)]);
554 }
555
CORD_lf_func(size_t i,void * client_data)556 char CORD_lf_func(size_t i, void * client_data)
557 {
558 lf_state * state = (lf_state *)client_data;
559 cache_line * volatile * cl_addr =
560 &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]);
561 cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr);
562
563 if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) {
564 /* Cache miss */
565 refill_data rd;
566
567 rd.state = state;
568 rd.file_pos = i;
569 rd.new_cache = GC_NEW_ATOMIC(cache_line);
570 if (rd.new_cache == 0) OUT_OF_MEMORY;
571 return (char)((GC_word)GC_call_with_alloc_lock(refill_cache, &rd));
572 }
573 return(cl -> data[MOD_LINE_SZ(i)]);
574 }
575
CORD_lf_close_proc(void * obj,void * client_data CORD_ATTR_UNUSED)576 void CORD_lf_close_proc(void * obj, void * client_data CORD_ATTR_UNUSED)
577 {
578 if (fclose(((lf_state *)obj) -> lf_file) != 0) {
579 ABORT("CORD_lf_close_proc: fclose failed");
580 }
581 }
582
CORD_from_file_lazy_inner(FILE * f,size_t len)583 CORD CORD_from_file_lazy_inner(FILE * f, size_t len)
584 {
585 lf_state * state = GC_NEW(lf_state);
586 int i;
587
588 if (state == 0) OUT_OF_MEMORY;
589 if (len != 0) {
590 /* Dummy read to force buffer allocation. */
591 /* This greatly increases the probability */
592 /* of avoiding deadlock if buffer allocation */
593 /* is redirected to GC_malloc and the */
594 /* world is multi-threaded. */
595 char buf[1];
596
597 if (fread(buf, 1, 1, f) > 1
598 || fseek(f, 0l, SEEK_SET) != 0) {
599 ABORT("Bad f argument or I/O failure");
600 }
601 }
602 state -> lf_file = f;
603 for (i = 0; i < CACHE_SZ/LINE_SZ; i++) {
604 state -> lf_cache[i] = 0;
605 }
606 state -> lf_current = 0;
607 GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0);
608 return(CORD_from_fn(CORD_lf_func, state, len));
609 }
610
CORD_from_file_lazy(FILE * f)611 CORD CORD_from_file_lazy(FILE * f)
612 {
613 long len;
614
615 if (fseek(f, 0l, SEEK_END) != 0
616 || (len = ftell(f)) < 0
617 || fseek(f, 0l, SEEK_SET) != 0) {
618 ABORT("Bad f argument or I/O failure");
619 }
620 return(CORD_from_file_lazy_inner(f, (size_t)len));
621 }
622
623 # define LAZY_THRESHOLD (128*1024 + 1)
624
CORD_from_file(FILE * f)625 CORD CORD_from_file(FILE * f)
626 {
627 long len;
628
629 if (fseek(f, 0l, SEEK_END) != 0
630 || (len = ftell(f)) < 0
631 || fseek(f, 0l, SEEK_SET) != 0) {
632 ABORT("Bad f argument or I/O failure");
633 }
634 if (len < LAZY_THRESHOLD) {
635 return(CORD_from_file_eager(f));
636 } else {
637 return(CORD_from_file_lazy_inner(f, (size_t)len));
638 }
639 }
640