1264c2f6fSedward /*
2*e85b5252Sbostic * Copyright (c) 1989, 1993
3*e85b5252Sbostic * The Regents of the University of California. All rights reserved.
4264c2f6fSedward *
58e751acfSbostic * This code is derived from software contributed to Berkeley by
68e751acfSbostic * Edward Wang at The University of California, Berkeley.
78e751acfSbostic *
8122a1d9eSbostic * %sccs.include.redist.c%
9264c2f6fSedward */
10264c2f6fSedward
11264c2f6fSedward #ifndef lint
12*e85b5252Sbostic static char sccsid[] = "@(#)compress.c 8.1 (Berkeley) 06/06/93";
13264c2f6fSedward #endif /* not lint */
14264c2f6fSedward
15264c2f6fSedward #include "ww.h"
16264c2f6fSedward #include "tt.h"
17264c2f6fSedward
187175ad38Sedward /* special */
197175ad38Sedward #include <stdio.h>
20ded62c24Sedward #include <fcntl.h>
217175ad38Sedward int cc_trace = 0;
227175ad38Sedward FILE *cc_trace_fp;
237175ad38Sedward
2453b1e3a6Sedward /* tunable parameters */
25264c2f6fSedward
267175ad38Sedward int cc_reverse = 1;
277175ad38Sedward int cc_sort = 0;
287175ad38Sedward int cc_chop = 0;
29264c2f6fSedward
307175ad38Sedward int cc_token_max = 8; /* <= TOKEN_MAX */
317175ad38Sedward int cc_token_min = 2; /* > tt.tt_put_token_cost */
327175ad38Sedward int cc_npass0 = 1;
337175ad38Sedward int cc_npass1 = 1;
3453b1e3a6Sedward
357175ad38Sedward int cc_bufsize = 1024 * 3; /* XXX, or 80 * 24 * 2 */
3653b1e3a6Sedward
377175ad38Sedward int cc_ntoken = 8192;
3853b1e3a6Sedward
397175ad38Sedward #define cc_weight XXX
407175ad38Sedward #ifndef cc_weight
417175ad38Sedward int cc_weight = 0;
4253b1e3a6Sedward #endif
4353b1e3a6Sedward
4453b1e3a6Sedward #define TOKEN_MAX 16
4553b1e3a6Sedward
467175ad38Sedward struct cc {
47264c2f6fSedward char string[TOKEN_MAX];
4853b1e3a6Sedward char length;
4953b1e3a6Sedward char flag;
507175ad38Sedward #ifndef cc_weight
5153b1e3a6Sedward short weight;
5253b1e3a6Sedward #endif
5353b1e3a6Sedward long time; /* time last seen */
5453b1e3a6Sedward short bcount; /* count in this buffer */
5553b1e3a6Sedward short ccount; /* count in compression */
5653b1e3a6Sedward short places; /* places in the buffer */
5753b1e3a6Sedward short code; /* token code */
587175ad38Sedward struct cc *qforw, *qback;
597175ad38Sedward struct cc *hforw, **hback;
60264c2f6fSedward };
61264c2f6fSedward
627175ad38Sedward short cc_thresholds[TOKEN_MAX + 1];
637175ad38Sedward #define thresh(length) (cc_thresholds[length])
6453b1e3a6Sedward #define threshp(code, count, length) \
657175ad38Sedward ((code) >= 0 || (short) (count) >= cc_thresholds[length])
66264c2f6fSedward
677175ad38Sedward #ifndef cc_weight
687175ad38Sedward short cc_wthresholds[TOKEN_MAX + 1];
697175ad38Sedward #define wthresh(length) (cc_wthresholds[length])
707175ad38Sedward #define wthreshp(weight, length) ((short) (weight) >= cc_wthresholds[length])
7153b1e3a6Sedward #else
7253b1e3a6Sedward #define wthreshp(weight, length) (0)
7353b1e3a6Sedward #endif
7453b1e3a6Sedward
757175ad38Sedward #ifndef cc_weight
767175ad38Sedward short cc_wlimits[TOKEN_MAX + 1];
777175ad38Sedward #define wlimit(length) (cc_wlimits[length])
7853b1e3a6Sedward #endif
7953b1e3a6Sedward
8053b1e3a6Sedward #define put_token_score(length) ((length) - tt.tt_put_token_cost)
8153b1e3a6Sedward
827175ad38Sedward int cc_score_adjustments[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */
8353b1e3a6Sedward #define score_adjust(score, p) \
8453b1e3a6Sedward do { \
8553b1e3a6Sedward int length = (p)->length; \
8653b1e3a6Sedward int ccount = (p)->ccount; \
8753b1e3a6Sedward if (threshp((p)->code, ccount, length) || \
8853b1e3a6Sedward wthreshp((p)->weight, length)) /* XXX */ \
8953b1e3a6Sedward (score) -= length - tt.tt_put_token_cost; \
9053b1e3a6Sedward else \
917175ad38Sedward (score) += cc_score_adjustments[length][ccount]; \
9253b1e3a6Sedward } while (0)
9353b1e3a6Sedward
947175ad38Sedward int cc_initial_scores[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */
9553b1e3a6Sedward
967175ad38Sedward struct cc cc_q0a, cc_q0b, cc_q1a, cc_q1b;
9753b1e3a6Sedward
987175ad38Sedward #define qinsert(p1, p2) \
9953b1e3a6Sedward do { \
1007175ad38Sedward register struct cc *forw = (p1)->qforw; \
1017175ad38Sedward register struct cc *back = (p1)->qback; \
10253b1e3a6Sedward back->qforw = forw; \
10353b1e3a6Sedward forw->qback = back; \
1047175ad38Sedward forw = (p2)->qforw; \
1057175ad38Sedward (p1)->qforw = forw; \
1067175ad38Sedward forw->qback = (p1); \
1077175ad38Sedward (p2)->qforw = (p1); \
1087175ad38Sedward (p1)->qback = (p2); \
10953b1e3a6Sedward } while (0)
11053b1e3a6Sedward
1117175ad38Sedward #define qinsertq(q, p) \
11253b1e3a6Sedward ((q)->qforw == (q) ? 0 : \
1137175ad38Sedward ((q)->qback->qforw = (p)->qforw, \
1147175ad38Sedward (p)->qforw->qback = (q)->qback, \
1157175ad38Sedward (q)->qforw->qback = (p), \
1167175ad38Sedward (p)->qforw = (q)->qforw, \
11753b1e3a6Sedward (q)->qforw = (q), \
11853b1e3a6Sedward (q)->qback = (q)))
11953b1e3a6Sedward
12053b1e3a6Sedward #define H (14)
12153b1e3a6Sedward #define HSIZE (1 << H)
1227175ad38Sedward #define hash(h, c) ((((h) >> H - 8 | (h) << 8) ^ (c)) & HSIZE - 1)
12353b1e3a6Sedward
1247175ad38Sedward char *cc_buffer;
1257175ad38Sedward struct cc **cc_output; /* the output array */
1267175ad38Sedward short *cc_places[TOKEN_MAX + 1];
1277175ad38Sedward short *cc_hashcodes; /* for computing hashcodes */
1287175ad38Sedward struct cc **cc_htab; /* the hash table */
1297175ad38Sedward struct cc **cc_tokens; /* holds all the active tokens */
1307175ad38Sedward struct cc_undo {
1317175ad38Sedward struct cc **pos;
1327175ad38Sedward struct cc *val;
1337175ad38Sedward } *cc_undo;
13453b1e3a6Sedward
1357175ad38Sedward long cc_time, cc_time0;
136264c2f6fSedward
1377175ad38Sedward char *cc_tt_ob, *cc_tt_obe;
1387175ad38Sedward
ccinit()1397175ad38Sedward ccinit()
140264c2f6fSedward {
14153b1e3a6Sedward register i, j;
1427175ad38Sedward register struct cc *p;
143264c2f6fSedward
1447175ad38Sedward if (tt.tt_token_max > cc_token_max)
1457175ad38Sedward tt.tt_token_max = cc_token_max;
1467175ad38Sedward if (tt.tt_token_min < cc_token_min)
1477175ad38Sedward tt.tt_token_min = cc_token_min;
14853b1e3a6Sedward if (tt.tt_token_min > tt.tt_token_max) {
149264c2f6fSedward tt.tt_ntoken = 0;
15053b1e3a6Sedward return 0;
151264c2f6fSedward }
1527175ad38Sedward if (tt.tt_ntoken > cc_ntoken / 2) /* not likely */
1537175ad38Sedward tt.tt_ntoken = cc_ntoken / 2;
15453b1e3a6Sedward #define C(x) (sizeof (x) / sizeof *(x))
1557175ad38Sedward for (i = 0; i < C(cc_thresholds); i++) {
15653b1e3a6Sedward int h = i - tt.tt_put_token_cost;
15753b1e3a6Sedward if (h > 0)
1587175ad38Sedward cc_thresholds[i] =
15953b1e3a6Sedward (tt.tt_set_token_cost + 1 + h - 1) / h + 1;
16053b1e3a6Sedward else
1617175ad38Sedward cc_thresholds[i] = 0;
162264c2f6fSedward }
1637175ad38Sedward for (i = 0; i < C(cc_score_adjustments); i++) {
1647175ad38Sedward int t = cc_thresholds[i];
1657175ad38Sedward for (j = 0; j < C(*cc_score_adjustments); j++) {
16653b1e3a6Sedward if (j >= t)
1677175ad38Sedward cc_score_adjustments[i][j] =
16853b1e3a6Sedward - (i - tt.tt_put_token_cost);
16953b1e3a6Sedward else if (j < t - 1)
1707175ad38Sedward cc_score_adjustments[i][j] = 0;
17153b1e3a6Sedward else
17253b1e3a6Sedward /*
17353b1e3a6Sedward * cost now is
17453b1e3a6Sedward * length * (ccount + 1) a
17553b1e3a6Sedward * cost before was
17653b1e3a6Sedward * set-token-cost + length +
17753b1e3a6Sedward * ccount * put-token-cost b
17853b1e3a6Sedward * the score adjustment is (b - a)
17953b1e3a6Sedward */
1807175ad38Sedward cc_score_adjustments[i][j] =
18153b1e3a6Sedward tt.tt_set_token_cost + i +
18253b1e3a6Sedward j * tt.tt_put_token_cost -
18353b1e3a6Sedward i * (j + 1);
18453b1e3a6Sedward if (j >= t)
1857175ad38Sedward cc_initial_scores[i][j] = 0;
18653b1e3a6Sedward else
18753b1e3a6Sedward /*
18853b1e3a6Sedward * - (set-token-cost +
18953b1e3a6Sedward * (length - put-token-cost) -
19053b1e3a6Sedward * (length - put-token-cost) * ccount)
19153b1e3a6Sedward */
1927175ad38Sedward cc_initial_scores[i][j] =
19353b1e3a6Sedward - (tt.tt_set_token_cost +
19453b1e3a6Sedward (i - tt.tt_put_token_cost) -
19553b1e3a6Sedward (i - tt.tt_put_token_cost) * j);
19653b1e3a6Sedward }
19753b1e3a6Sedward }
1987175ad38Sedward #ifndef cc_weight
1997175ad38Sedward for (i = 1; i < C(cc_wthresholds); i++) {
2007175ad38Sedward cc_wthresholds[i] =
20153b1e3a6Sedward ((tt.tt_set_token_cost + tt.tt_put_token_cost) / i +
20253b1e3a6Sedward i / 5 + 1) *
2037175ad38Sedward cc_weight + 1;
2047175ad38Sedward cc_wlimits[i] = cc_wthresholds[i] + cc_weight;
20553b1e3a6Sedward }
20653b1e3a6Sedward #endif
20753b1e3a6Sedward #undef C
2087175ad38Sedward if ((cc_output = (struct cc **)
2097175ad38Sedward malloc((unsigned) cc_bufsize * sizeof *cc_output)) == 0)
21053b1e3a6Sedward goto nomem;
2117175ad38Sedward if ((cc_hashcodes = (short *)
2127175ad38Sedward malloc((unsigned) cc_bufsize * sizeof *cc_hashcodes)) == 0)
21353b1e3a6Sedward goto nomem;
2147175ad38Sedward if ((cc_htab = (struct cc **) malloc(HSIZE * sizeof *cc_htab)) == 0)
21553b1e3a6Sedward goto nomem;
2167175ad38Sedward if ((cc_tokens = (struct cc **)
21753b1e3a6Sedward malloc((unsigned)
2187175ad38Sedward (cc_ntoken + tt.tt_token_max - tt.tt_token_min + 1) *
2197175ad38Sedward sizeof *cc_tokens)) == 0)
22053b1e3a6Sedward goto nomem;
2217175ad38Sedward if ((cc_undo = (struct cc_undo *)
2227175ad38Sedward malloc((unsigned) cc_bufsize * sizeof *cc_undo)) == 0)
22353b1e3a6Sedward goto nomem;
22453b1e3a6Sedward for (i = tt.tt_token_min; i <= tt.tt_token_max; i++)
2257175ad38Sedward if ((cc_places[i] = (short *)
2267175ad38Sedward malloc((unsigned) cc_bufsize * sizeof **cc_places)) == 0)
22753b1e3a6Sedward goto nomem;
2287175ad38Sedward cc_q0a.qforw = cc_q0a.qback = &cc_q0a;
2297175ad38Sedward cc_q0b.qforw = cc_q0b.qback = &cc_q0b;
2307175ad38Sedward cc_q1a.qforw = cc_q1a.qback = &cc_q1a;
2317175ad38Sedward cc_q1b.qforw = cc_q1b.qback = &cc_q1b;
2327175ad38Sedward if ((p = (struct cc *) malloc((unsigned) cc_ntoken * sizeof *p)) == 0)
23353b1e3a6Sedward goto nomem;
23453b1e3a6Sedward for (i = 0; i < tt.tt_ntoken; i++) {
23553b1e3a6Sedward p->code = i;
23653b1e3a6Sedward p->time = -1;
2377175ad38Sedward p->qback = cc_q0a.qback;
2387175ad38Sedward p->qforw = &cc_q0a;
23953b1e3a6Sedward p->qback->qforw = p;
2407175ad38Sedward cc_q0a.qback = p;
24153b1e3a6Sedward p++;
24253b1e3a6Sedward }
2437175ad38Sedward for (; i < cc_ntoken; i++) {
24453b1e3a6Sedward p->code = -1;
24553b1e3a6Sedward p->time = -1;
2467175ad38Sedward p->qback = cc_q1a.qback;
2477175ad38Sedward p->qforw = &cc_q1a;
24853b1e3a6Sedward p->qback->qforw = p;
2497175ad38Sedward cc_q1a.qback = p;
25053b1e3a6Sedward p++;
251264c2f6fSedward }
2527175ad38Sedward cc_tt_ob = tt_ob;
2537175ad38Sedward cc_tt_obe = tt_obe;
2547175ad38Sedward if ((cc_buffer = malloc((unsigned) cc_bufsize)) == 0)
2557175ad38Sedward goto nomem;
256264c2f6fSedward return 0;
257264c2f6fSedward nomem:
258264c2f6fSedward wwerrno = WWE_NOMEM;
259264c2f6fSedward return -1;
260264c2f6fSedward }
261264c2f6fSedward
ccstart()2627175ad38Sedward ccstart()
263264c2f6fSedward {
2647175ad38Sedward int ccflush();
265264c2f6fSedward
2663b2f06aaSedward ttflush();
2677175ad38Sedward tt_obp = tt_ob = cc_buffer;
2687175ad38Sedward tt_obe = tt_ob + cc_bufsize;
2697175ad38Sedward tt.tt_flush = ccflush;
2703b2f06aaSedward if (cc_trace) {
2713b2f06aaSedward cc_trace_fp = fopen("window-trace", "a");
2723b2f06aaSedward (void) fcntl(fileno(cc_trace_fp), F_SETFD, 1);
2733b2f06aaSedward }
2743b2f06aaSedward ccreset();
2753b2f06aaSedward }
2763b2f06aaSedward
ccreset()2773b2f06aaSedward ccreset()
2783b2f06aaSedward {
2793b2f06aaSedward register struct cc *p;
2803b2f06aaSedward
2817175ad38Sedward bzero((char *) cc_htab, HSIZE * sizeof *cc_htab);
2827175ad38Sedward for (p = cc_q0a.qforw; p != &cc_q0a; p = p->qforw)
28353b1e3a6Sedward p->hback = 0;
2847175ad38Sedward for (p = cc_q1a.qforw; p != &cc_q1a; p = p->qforw)
28553b1e3a6Sedward p->hback = 0;
286264c2f6fSedward }
287264c2f6fSedward
ccend()2887175ad38Sedward ccend()
289264c2f6fSedward {
2907175ad38Sedward
2913b2f06aaSedward ttflush();
2927175ad38Sedward tt_obp = tt_ob = cc_tt_ob;
2937175ad38Sedward tt_obe = cc_tt_obe;
2943b2f06aaSedward tt.tt_flush = 0;
2957175ad38Sedward if (cc_trace_fp != NULL) {
2967175ad38Sedward (void) fclose(cc_trace_fp);
2977175ad38Sedward cc_trace_fp = NULL;
2987175ad38Sedward }
2997175ad38Sedward }
3007175ad38Sedward
ccflush()3017175ad38Sedward ccflush()
3027175ad38Sedward {
3037175ad38Sedward int bufsize = tt_obp - tt_ob;
30453b1e3a6Sedward int n;
305264c2f6fSedward
3067175ad38Sedward if (tt_ob != cc_buffer)
3077175ad38Sedward abort();
3087175ad38Sedward if (cc_trace_fp != NULL) {
3097175ad38Sedward (void) fwrite(tt_ob, 1, bufsize, cc_trace_fp);
3103b2f06aaSedward (void) putc(-1, cc_trace_fp);
3117175ad38Sedward }
3123b2f06aaSedward tt.tt_flush = 0;
3133b2f06aaSedward (*tt.tt_compress)(1);
3147175ad38Sedward if (bufsize < tt.tt_token_min) {
3157175ad38Sedward ttflush();
3163b2f06aaSedward goto out;
317264c2f6fSedward }
3187175ad38Sedward tt_obp = tt_ob = cc_tt_ob;
3197175ad38Sedward tt_obe = cc_tt_obe;
3207175ad38Sedward cc_time0 = cc_time;
3217175ad38Sedward cc_time += bufsize;
3227175ad38Sedward n = cc_sweep_phase(cc_buffer, bufsize, cc_tokens);
3237175ad38Sedward cc_compress_phase(cc_output, bufsize, cc_tokens, n);
3247175ad38Sedward cc_output_phase(cc_buffer, cc_output, bufsize);
3257175ad38Sedward ttflush();
3267175ad38Sedward tt_obp = tt_ob = cc_buffer;
3277175ad38Sedward tt_obe = cc_buffer + cc_bufsize;
3283b2f06aaSedward out:
3293b2f06aaSedward (*tt.tt_compress)(0);
3307175ad38Sedward tt.tt_flush = ccflush;
331264c2f6fSedward }
33253b1e3a6Sedward
cc_sweep_phase(buffer,bufsize,tokens)3337175ad38Sedward cc_sweep_phase(buffer, bufsize, tokens)
33453b1e3a6Sedward char *buffer;
3357175ad38Sedward struct cc **tokens;
33653b1e3a6Sedward {
3377175ad38Sedward register struct cc **pp = tokens;
33853b1e3a6Sedward register i, n;
33953b1e3a6Sedward #ifdef STATS
34053b1e3a6Sedward int nn, ii;
34153b1e3a6Sedward #endif
34253b1e3a6Sedward
34353b1e3a6Sedward #ifdef STATS
3447175ad38Sedward if (verbose >= 0)
3457175ad38Sedward time_begin();
34653b1e3a6Sedward if (verbose > 0)
34753b1e3a6Sedward printf("Sweep:");
34853b1e3a6Sedward #endif
3497175ad38Sedward cc_sweep0(buffer, bufsize, tt.tt_token_min - 1);
35053b1e3a6Sedward #ifdef STATS
3517175ad38Sedward ntoken_stat = 0;
35253b1e3a6Sedward nn = 0;
35353b1e3a6Sedward ii = 0;
35453b1e3a6Sedward #endif
35553b1e3a6Sedward for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) {
35653b1e3a6Sedward #ifdef STATS
35753b1e3a6Sedward if (verbose > 0) {
35853b1e3a6Sedward if (ii > 7) {
35953b1e3a6Sedward printf("\n ");
36053b1e3a6Sedward ii = 0;
36153b1e3a6Sedward }
36253b1e3a6Sedward ii++;
36353b1e3a6Sedward printf(" (%d", i);
36453b1e3a6Sedward (void) fflush(stdout);
36553b1e3a6Sedward }
36653b1e3a6Sedward #endif
3677175ad38Sedward n = cc_sweep(buffer, bufsize, pp, i);
36853b1e3a6Sedward pp += n;
36953b1e3a6Sedward #ifdef STATS
37053b1e3a6Sedward if (verbose > 0) {
37153b1e3a6Sedward if (--n > 0) {
37253b1e3a6Sedward printf(" %d", n);
37353b1e3a6Sedward nn += n;
37453b1e3a6Sedward }
37553b1e3a6Sedward putchar(')');
37653b1e3a6Sedward }
37753b1e3a6Sedward #endif
37853b1e3a6Sedward }
3797175ad38Sedward qinsertq(&cc_q1b, &cc_q1a);
38053b1e3a6Sedward #ifdef STATS
38153b1e3a6Sedward if (verbose > 0)
38253b1e3a6Sedward printf("\n %d tokens, %d candidates\n",
3837175ad38Sedward ntoken_stat, nn);
3847175ad38Sedward if (verbose >= 0)
3857175ad38Sedward time_end();
38653b1e3a6Sedward #endif
38753b1e3a6Sedward return pp - tokens;
38853b1e3a6Sedward }
38953b1e3a6Sedward
cc_sweep0(buffer,n,length)3907175ad38Sedward cc_sweep0(buffer, n, length)
39153b1e3a6Sedward char *buffer;
39253b1e3a6Sedward {
3937175ad38Sedward register char *p;
3947175ad38Sedward register short *hc;
39553b1e3a6Sedward register i;
3967175ad38Sedward register short c;
3977175ad38Sedward register short pc = tt.tt_padc;
39853b1e3a6Sedward
3997175ad38Sedward /* n and length are at least 1 */
4007175ad38Sedward p = buffer++;
4017175ad38Sedward hc = cc_hashcodes;
4027175ad38Sedward i = n;
40353b1e3a6Sedward do {
4047175ad38Sedward if ((*hc++ = *p++) == pc)
40553b1e3a6Sedward hc[-1] = -1;
4067175ad38Sedward } while (--i);
4077175ad38Sedward while (--length) {
4087175ad38Sedward p = buffer++;
4097175ad38Sedward hc = cc_hashcodes;
4107175ad38Sedward for (i = n--; --i;) {
4117175ad38Sedward if ((c = *p++) == pc || *hc < 0)
4127175ad38Sedward c = -1;
413264c2f6fSedward else
4147175ad38Sedward c = hash(*hc, c);
4157175ad38Sedward *hc++ = c;
416264c2f6fSedward }
41753b1e3a6Sedward }
41853b1e3a6Sedward }
41953b1e3a6Sedward
cc_sweep(buffer,bufsize,tokens,length)4207175ad38Sedward cc_sweep(buffer, bufsize, tokens, length)
42153b1e3a6Sedward char *buffer;
4227175ad38Sedward struct cc **tokens;
42353b1e3a6Sedward register length;
42453b1e3a6Sedward {
4257175ad38Sedward register struct cc *p;
42653b1e3a6Sedward register char *cp;
42753b1e3a6Sedward register i;
42853b1e3a6Sedward short *hc;
4297175ad38Sedward short *places = cc_places[length];
4307175ad38Sedward struct cc **pp = tokens;
43153b1e3a6Sedward short threshold = thresh(length);
4327175ad38Sedward #ifndef cc_weight
43353b1e3a6Sedward short wthreshold = wthresh(length);
43453b1e3a6Sedward short limit = wlimit(length);
43553b1e3a6Sedward #endif
43653b1e3a6Sedward int time;
4377175ad38Sedward short pc = tt.tt_padc;
43853b1e3a6Sedward
43953b1e3a6Sedward i = length - 1;
44053b1e3a6Sedward bufsize -= i;
44153b1e3a6Sedward cp = buffer + i;
4427175ad38Sedward hc = cc_hashcodes;
4437175ad38Sedward time = cc_time0;
44453b1e3a6Sedward for (i = 0; i < bufsize; i++, time++) {
4457175ad38Sedward struct cc **h;
44653b1e3a6Sedward
44753b1e3a6Sedward {
44853b1e3a6Sedward register short *hc1 = hc;
4497175ad38Sedward register short c = *cp++;
4507175ad38Sedward register short hh;
4517175ad38Sedward if ((hh = *hc1) < 0 || c == pc) {
45253b1e3a6Sedward *hc1++ = -1;
45353b1e3a6Sedward hc = hc1;
45453b1e3a6Sedward continue;
45553b1e3a6Sedward }
4567175ad38Sedward h = cc_htab + (*hc1++ = hash(hh, c));
4577175ad38Sedward hc = hc1;
45853b1e3a6Sedward }
45953b1e3a6Sedward for (p = *h; p != 0; p = p->hforw)
46053b1e3a6Sedward if (p->length == (char) length) {
46153b1e3a6Sedward register char *p1 = p->string;
46253b1e3a6Sedward register char *p2 = cp - length;
46353b1e3a6Sedward register n = length;
46453b1e3a6Sedward do
46553b1e3a6Sedward if (*p1++ != *p2++)
46653b1e3a6Sedward goto fail;
46753b1e3a6Sedward while (--n);
46853b1e3a6Sedward break;
46953b1e3a6Sedward fail:;
47053b1e3a6Sedward }
47153b1e3a6Sedward if (p == 0) {
4727175ad38Sedward p = cc_q1a.qback;
4737175ad38Sedward if (p == &cc_q1a ||
4747175ad38Sedward p->time >= cc_time0 && p->length == (char) length)
47553b1e3a6Sedward continue;
47653b1e3a6Sedward if (p->hback != 0)
47753b1e3a6Sedward if ((*p->hback = p->hforw) != 0)
47853b1e3a6Sedward p->hforw->hback = p->hback;
47953b1e3a6Sedward {
48053b1e3a6Sedward register char *p1 = p->string;
48153b1e3a6Sedward register char *p2 = cp - length;
48253b1e3a6Sedward register n = length;
48353b1e3a6Sedward do
48453b1e3a6Sedward *p1++ = *p2++;
48553b1e3a6Sedward while (--n);
48653b1e3a6Sedward }
48753b1e3a6Sedward p->length = length;
4887175ad38Sedward #ifndef cc_weight
4897175ad38Sedward p->weight = cc_weight;
49053b1e3a6Sedward #endif
49153b1e3a6Sedward p->time = time;
49253b1e3a6Sedward p->bcount = 1;
49353b1e3a6Sedward p->ccount = 0;
49453b1e3a6Sedward p->flag = 0;
49553b1e3a6Sedward if ((p->hforw = *h) != 0)
49653b1e3a6Sedward p->hforw->hback = &p->hforw;
49753b1e3a6Sedward *h = p;
49853b1e3a6Sedward p->hback = h;
4997175ad38Sedward qinsert(p, &cc_q1a);
50053b1e3a6Sedward places[i] = -1;
50153b1e3a6Sedward p->places = i;
50253b1e3a6Sedward #ifdef STATS
5037175ad38Sedward ntoken_stat++;
50453b1e3a6Sedward #endif
5057175ad38Sedward } else if (p->time < cc_time0) {
5067175ad38Sedward #ifndef cc_weight
50753b1e3a6Sedward if ((p->weight += p->time - time) < 0)
5087175ad38Sedward p->weight = cc_weight;
5097175ad38Sedward else if ((p->weight += cc_weight) > limit)
51053b1e3a6Sedward p->weight = limit;
51153b1e3a6Sedward #endif
51253b1e3a6Sedward p->time = time;
51353b1e3a6Sedward p->bcount = 1;
51453b1e3a6Sedward p->ccount = 0;
51553b1e3a6Sedward if (p->code >= 0) {
51653b1e3a6Sedward p->flag = 1;
51753b1e3a6Sedward *pp++ = p;
51853b1e3a6Sedward } else
5197175ad38Sedward #ifndef cc_weight
52053b1e3a6Sedward if (p->weight >= wthreshold) {
52153b1e3a6Sedward p->flag = 1;
52253b1e3a6Sedward *pp++ = p;
5237175ad38Sedward qinsert(p, &cc_q1b);
52453b1e3a6Sedward } else
52553b1e3a6Sedward #endif
52653b1e3a6Sedward {
52753b1e3a6Sedward p->flag = 0;
5287175ad38Sedward qinsert(p, &cc_q1a);
52953b1e3a6Sedward }
53053b1e3a6Sedward places[i] = -1;
53153b1e3a6Sedward p->places = i;
53253b1e3a6Sedward #ifdef STATS
5337175ad38Sedward ntoken_stat++;
53453b1e3a6Sedward #endif
53553b1e3a6Sedward } else if (p->time + length > time) {
53653b1e3a6Sedward /*
53753b1e3a6Sedward * overlapping token, don't count as two and
53853b1e3a6Sedward * don't update time, but do adjust weight to offset
53953b1e3a6Sedward * the difference
54053b1e3a6Sedward */
5417175ad38Sedward #ifndef cc_weight
5427175ad38Sedward if (cc_weight != 0) { /* XXX */
54353b1e3a6Sedward p->weight += time - p->time;
54453b1e3a6Sedward if (!p->flag && p->weight >= wthreshold) {
54553b1e3a6Sedward p->flag = 1;
54653b1e3a6Sedward *pp++ = p;
5477175ad38Sedward qinsert(p, &cc_q1b);
54853b1e3a6Sedward }
54953b1e3a6Sedward }
55053b1e3a6Sedward #endif
55153b1e3a6Sedward places[i] = p->places;
55253b1e3a6Sedward p->places = i;
55353b1e3a6Sedward } else {
5547175ad38Sedward #ifndef cc_weight
55553b1e3a6Sedward if ((p->weight += p->time - time) < 0)
5567175ad38Sedward p->weight = cc_weight;
5577175ad38Sedward else if ((p->weight += cc_weight) > limit)
55853b1e3a6Sedward p->weight = limit;
55953b1e3a6Sedward #endif
56053b1e3a6Sedward p->time = time;
56153b1e3a6Sedward p->bcount++;
56253b1e3a6Sedward if (!p->flag &&
56353b1e3a6Sedward /* code must be < 0 if flag false here */
56453b1e3a6Sedward (p->bcount >= threshold
5657175ad38Sedward #ifndef cc_weight
56653b1e3a6Sedward || p->weight >= wthreshold
56753b1e3a6Sedward #endif
56853b1e3a6Sedward )) {
56953b1e3a6Sedward p->flag = 1;
57053b1e3a6Sedward *pp++ = p;
5717175ad38Sedward qinsert(p, &cc_q1b);
57253b1e3a6Sedward }
57353b1e3a6Sedward places[i] = p->places;
57453b1e3a6Sedward p->places = i;
57553b1e3a6Sedward }
57653b1e3a6Sedward }
57753b1e3a6Sedward if ((i = pp - tokens) > 0) {
57853b1e3a6Sedward *pp = 0;
5797175ad38Sedward if (cc_reverse)
5807175ad38Sedward cc_sweep_reverse(tokens, places);
5817175ad38Sedward if (cc_sort && i > 1) {
5827175ad38Sedward int cc_token_compare();
58353b1e3a6Sedward qsort((char *) tokens, i, sizeof *tokens,
5847175ad38Sedward cc_token_compare);
58553b1e3a6Sedward }
5867175ad38Sedward if (cc_chop) {
5877175ad38Sedward if ((i = i * cc_chop / 100) == 0)
58853b1e3a6Sedward i = 1;
58953b1e3a6Sedward tokens[i] = 0;
59053b1e3a6Sedward }
59153b1e3a6Sedward i++;
59253b1e3a6Sedward }
59353b1e3a6Sedward return i;
59453b1e3a6Sedward }
59553b1e3a6Sedward
cc_sweep_reverse(pp,places)5967175ad38Sedward cc_sweep_reverse(pp, places)
5977175ad38Sedward register struct cc **pp;
59853b1e3a6Sedward register short *places;
59953b1e3a6Sedward {
6007175ad38Sedward register struct cc *p;
60153b1e3a6Sedward register short front, back, t;
60253b1e3a6Sedward
60353b1e3a6Sedward while ((p = *pp++) != 0) {
60453b1e3a6Sedward back = -1;
60553b1e3a6Sedward t = p->places;
60653b1e3a6Sedward /* the list is never empty */
60753b1e3a6Sedward do {
60853b1e3a6Sedward front = places[t];
60953b1e3a6Sedward places[t] = back;
61053b1e3a6Sedward back = t;
61153b1e3a6Sedward } while ((t = front) >= 0);
61253b1e3a6Sedward p->places = back;
61353b1e3a6Sedward }
61453b1e3a6Sedward }
61553b1e3a6Sedward
6167175ad38Sedward cc_compress_phase(output, bufsize, tokens, ntoken)
6177175ad38Sedward struct cc **output;
6187175ad38Sedward struct cc **tokens;
61953b1e3a6Sedward {
62053b1e3a6Sedward register i;
62153b1e3a6Sedward
62253b1e3a6Sedward bzero((char *) output, bufsize * sizeof *output);
6237175ad38Sedward for (i = 0; i < cc_npass0; i++)
6247175ad38Sedward cc_compress_phase1(output, tokens, ntoken, 0);
6257175ad38Sedward for (i = 0; i < cc_npass1; i++)
6267175ad38Sedward cc_compress_phase1(output, tokens, ntoken, 1);
6277175ad38Sedward cc_compress_cleanup(output, bufsize);
62853b1e3a6Sedward }
62953b1e3a6Sedward
cc_compress_phase1(output,tokens,ntoken,flag)6307175ad38Sedward cc_compress_phase1(output, tokens, ntoken, flag)
6317175ad38Sedward register struct cc **output;
6327175ad38Sedward struct cc **tokens;
63353b1e3a6Sedward {
6347175ad38Sedward register struct cc **pp;
6357175ad38Sedward #ifdef STATS
636d6e05283Sedward register int i = 0;
6377175ad38Sedward int nt = 0, cc = 0, nc = 0;
6387175ad38Sedward #endif
63953b1e3a6Sedward
64053b1e3a6Sedward #ifdef STATS
6417175ad38Sedward if (verbose >= 0)
6427175ad38Sedward time_begin();
64353b1e3a6Sedward if (verbose > 0)
64453b1e3a6Sedward printf("Compress:");
64553b1e3a6Sedward #endif
64653b1e3a6Sedward pp = tokens;
64753b1e3a6Sedward while (pp < tokens + ntoken) {
64853b1e3a6Sedward #ifdef STATS
64953b1e3a6Sedward if (verbose > 0) {
6507175ad38Sedward ntoken_stat = 0;
6517175ad38Sedward ccount_stat = 0;
6527175ad38Sedward ncover_stat = 0;
65353b1e3a6Sedward if (i > 2) {
65453b1e3a6Sedward printf("\n ");
65553b1e3a6Sedward i = 0;
65653b1e3a6Sedward }
65753b1e3a6Sedward i++;
65853b1e3a6Sedward printf(" (%d", (*pp)->length);
65953b1e3a6Sedward (void) fflush(stdout);
66053b1e3a6Sedward }
66153b1e3a6Sedward #endif
6627175ad38Sedward pp += cc_compress(output, pp, flag);
66353b1e3a6Sedward #ifdef STATS
6647175ad38Sedward if (verbose > 0) {
6657175ad38Sedward printf(" %dt %du %dc)", ntoken_stat, ccount_stat,
6667175ad38Sedward ncover_stat);
6677175ad38Sedward nt += ntoken_stat;
6687175ad38Sedward cc += ccount_stat;
6697175ad38Sedward nc += ncover_stat;
6707175ad38Sedward }
67153b1e3a6Sedward #endif
67253b1e3a6Sedward }
67353b1e3a6Sedward #ifdef STATS
67453b1e3a6Sedward if (verbose > 0)
6757175ad38Sedward printf("\n total: (%dt %du %dc)\n", nt, cc, nc);
6767175ad38Sedward if (verbose >= 0)
6777175ad38Sedward time_end();
67853b1e3a6Sedward #endif
67953b1e3a6Sedward }
68053b1e3a6Sedward
cc_compress_cleanup(output,bufsize)6817175ad38Sedward cc_compress_cleanup(output, bufsize)
6827175ad38Sedward register struct cc **output;
68353b1e3a6Sedward {
6847175ad38Sedward register struct cc **end;
68553b1e3a6Sedward
68653b1e3a6Sedward /* the previous output phase may have been interrupted */
6877175ad38Sedward qinsertq(&cc_q0b, &cc_q0a);
68853b1e3a6Sedward for (end = output + bufsize; output < end;) {
6897175ad38Sedward register struct cc *p;
69053b1e3a6Sedward register length;
69153b1e3a6Sedward if ((p = *output) == 0) {
69253b1e3a6Sedward output++;
69353b1e3a6Sedward continue;
69453b1e3a6Sedward }
69553b1e3a6Sedward length = p->length;
69653b1e3a6Sedward if (!p->flag) {
69753b1e3a6Sedward } else if (p->code >= 0) {
6987175ad38Sedward qinsert(p, &cc_q0b);
69953b1e3a6Sedward p->flag = 0;
70053b1e3a6Sedward } else if (p->ccount == 0) {
70153b1e3a6Sedward *output = 0;
70253b1e3a6Sedward } else if (p->ccount >= thresh(length)
7037175ad38Sedward #ifndef cc_weight
70453b1e3a6Sedward || wthreshp(p->weight, length)
70553b1e3a6Sedward #endif
70653b1e3a6Sedward ) {
70753b1e3a6Sedward p->flag = 0;
70853b1e3a6Sedward } else {
70953b1e3a6Sedward p->ccount = 0;
71053b1e3a6Sedward *output = 0;
71153b1e3a6Sedward }
71253b1e3a6Sedward output += length;
71353b1e3a6Sedward }
71453b1e3a6Sedward }
71553b1e3a6Sedward
7167175ad38Sedward cc_compress(output, tokens, flag)
7177175ad38Sedward struct cc **output;
7187175ad38Sedward struct cc **tokens;
71953b1e3a6Sedward char flag;
72053b1e3a6Sedward {
7217175ad38Sedward struct cc **pp = tokens;
7227175ad38Sedward register struct cc *p = *pp++;
72353b1e3a6Sedward int length = p->length;
72453b1e3a6Sedward int threshold = thresh(length);
7257175ad38Sedward #ifndef cc_weight
72653b1e3a6Sedward short wthreshold = wthresh(length);
72753b1e3a6Sedward #endif
7287175ad38Sedward short *places = cc_places[length];
7297175ad38Sedward int *initial_scores = cc_initial_scores[length];
73053b1e3a6Sedward int initial_score0 = put_token_score(length);
73153b1e3a6Sedward
73253b1e3a6Sedward do {
73353b1e3a6Sedward int score;
7347175ad38Sedward register struct cc_undo *undop;
73553b1e3a6Sedward int ccount;
73653b1e3a6Sedward #ifdef STATS
73753b1e3a6Sedward int ncover;
73853b1e3a6Sedward #endif
73953b1e3a6Sedward int i;
74053b1e3a6Sedward
74153b1e3a6Sedward ccount = p->ccount;
74253b1e3a6Sedward if ((short) ccount >= p->bcount)
74353b1e3a6Sedward continue;
74453b1e3a6Sedward if (p->code >= 0 || ccount >= threshold)
74553b1e3a6Sedward score = 0;
7467175ad38Sedward #ifndef cc_weight
74753b1e3a6Sedward else if (p->weight >= wthreshold)
74853b1e3a6Sedward /* allow one fewer match than normal */
74953b1e3a6Sedward /* XXX, should adjust for ccount */
75053b1e3a6Sedward score = - tt.tt_set_token_cost;
75153b1e3a6Sedward #endif
75253b1e3a6Sedward else
75353b1e3a6Sedward score = initial_scores[ccount];
7547175ad38Sedward undop = cc_undo;
75553b1e3a6Sedward #ifdef STATS
75653b1e3a6Sedward ncover = 0;
75753b1e3a6Sedward #endif
75853b1e3a6Sedward for (i = p->places; i >= 0; i = places[i]) {
7597175ad38Sedward register struct cc **jp;
7607175ad38Sedward register struct cc *x;
7617175ad38Sedward register struct cc **ip = output + i;
76253b1e3a6Sedward register score0 = initial_score0;
7637175ad38Sedward struct cc **iip = ip + length;
7647175ad38Sedward struct cc_undo *undop1 = undop;
76553b1e3a6Sedward
76653b1e3a6Sedward if ((x = *(jp = ip)) != 0)
76753b1e3a6Sedward goto z;
76853b1e3a6Sedward while (--jp >= output)
76953b1e3a6Sedward if ((x = *jp) != 0) {
77053b1e3a6Sedward if (jp + x->length > ip)
77153b1e3a6Sedward goto z;
77253b1e3a6Sedward break;
77353b1e3a6Sedward }
77453b1e3a6Sedward jp = ip + 1;
77553b1e3a6Sedward while (jp < iip) {
77653b1e3a6Sedward if ((x = *jp) == 0) {
77753b1e3a6Sedward jp++;
77853b1e3a6Sedward continue;
77953b1e3a6Sedward }
78053b1e3a6Sedward z:
78153b1e3a6Sedward if (x == p)
78253b1e3a6Sedward goto undo;
78353b1e3a6Sedward #ifdef STATS
78453b1e3a6Sedward ncover++;
78553b1e3a6Sedward #endif
78653b1e3a6Sedward undop->pos = jp;
78753b1e3a6Sedward undop->val = x;
78853b1e3a6Sedward undop++;
78953b1e3a6Sedward *jp = 0;
79053b1e3a6Sedward x->ccount--;
79153b1e3a6Sedward score_adjust(score0, x);
79253b1e3a6Sedward if (score0 < 0 && flag)
79353b1e3a6Sedward goto undo;
79453b1e3a6Sedward jp += x->length;
79553b1e3a6Sedward }
79653b1e3a6Sedward undop->pos = ip;
79753b1e3a6Sedward undop->val = 0;
79853b1e3a6Sedward undop++;
79953b1e3a6Sedward *ip = p;
80053b1e3a6Sedward ccount++;
80153b1e3a6Sedward score += score0;
80253b1e3a6Sedward continue;
80353b1e3a6Sedward undo:
80453b1e3a6Sedward while (--undop >= undop1)
80553b1e3a6Sedward if (*undop->pos = x = undop->val)
80653b1e3a6Sedward x->ccount++;
80753b1e3a6Sedward undop++;
80853b1e3a6Sedward }
80953b1e3a6Sedward if (score > 0) {
81053b1e3a6Sedward #ifdef STATS
8117175ad38Sedward ccount_stat += ccount - p->ccount;
8127175ad38Sedward ntoken_stat++;
8137175ad38Sedward ncover_stat += ncover;
81453b1e3a6Sedward #endif
81553b1e3a6Sedward p->ccount = ccount;
81653b1e3a6Sedward } else {
8177175ad38Sedward register struct cc_undo *u = cc_undo;
81853b1e3a6Sedward while (--undop >= u) {
8197175ad38Sedward register struct cc *x;
82053b1e3a6Sedward if (*undop->pos = x = undop->val)
82153b1e3a6Sedward x->ccount++;
82253b1e3a6Sedward }
82353b1e3a6Sedward }
82453b1e3a6Sedward } while ((p = *pp++) != 0);
82553b1e3a6Sedward return pp - tokens;
826264c2f6fSedward }
827264c2f6fSedward
cc_output_phase(buffer,output,bufsize)8287175ad38Sedward cc_output_phase(buffer, output, bufsize)
8297175ad38Sedward register char *buffer;
8307175ad38Sedward register struct cc **output;
8317175ad38Sedward register bufsize;
832264c2f6fSedward {
833264c2f6fSedward register i;
8347175ad38Sedward register struct cc *p, *p1;
835264c2f6fSedward
8367175ad38Sedward for (i = 0; i < bufsize;) {
83753b1e3a6Sedward if ((p = output[i]) == 0) {
8387175ad38Sedward ttputc(buffer[i]);
839264c2f6fSedward i++;
84053b1e3a6Sedward } else if (p->code >= 0) {
84153b1e3a6Sedward if (--p->ccount == 0)
8427175ad38Sedward qinsert(p, &cc_q0a);
84353b1e3a6Sedward (*tt.tt_put_token)(p->code, p->string, p->length);
844264c2f6fSedward wwntokuse++;
84553b1e3a6Sedward wwntoksave += put_token_score(p->length);
84653b1e3a6Sedward i += p->length;
8477175ad38Sedward } else if ((p1 = cc_q0a.qback) != &cc_q0a) {
84853b1e3a6Sedward p->code = p1->code;
84953b1e3a6Sedward p1->code = -1;
8507175ad38Sedward qinsert(p1, &cc_q1a);
85153b1e3a6Sedward if (--p->ccount == 0)
8527175ad38Sedward qinsert(p, &cc_q0a);
85353b1e3a6Sedward else
8547175ad38Sedward qinsert(p, &cc_q0b);
85553b1e3a6Sedward (*tt.tt_set_token)(p->code, p->string, p->length);
856264c2f6fSedward wwntokdef++;
857264c2f6fSedward wwntoksave -= tt.tt_set_token_cost;
85853b1e3a6Sedward i += p->length;
859264c2f6fSedward } else {
86053b1e3a6Sedward p->ccount--;
8617175ad38Sedward ttwrite(p->string, p->length);
86253b1e3a6Sedward wwntokbad++;
86353b1e3a6Sedward i += p->length;
864264c2f6fSedward }
865264c2f6fSedward }
8667175ad38Sedward wwntokc += bufsize;
867264c2f6fSedward }
86853b1e3a6Sedward
8697175ad38Sedward cc_token_compare(p1, p2)
8707175ad38Sedward struct cc **p1, **p2;
87153b1e3a6Sedward {
87253b1e3a6Sedward return (*p2)->bcount - (*p1)->bcount;
87353b1e3a6Sedward }
874