1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22.
2 Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 /* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>. */
19
20 #include "sed.h"
21
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <sys/types.h>
25
26 #if HAVE_LIMITS_H
27 # include <limits.h>
28 #endif
29
30 #ifndef UINT_MAX
31 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
32 #endif
33
34 #ifndef INT_MAX
35 # define INT_MAX ((int) (UINT_MAX >> 1))
36 #endif
37
38 /* The following parameters represent the program's idea of what is
39 "best". Adjust to taste, subject to the caveats given. */
40
41 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
42 room for optimization. */
43 #define LEEWAY 7
44
45 /* Costs and bonuses are expressed as the equivalent departure from the
46 optimal line length, multiplied by 10. e.g. assigning something a
47 cost of 50 means that it is as bad as a line 5 characters too short
48 or too long. The definition of SHORT_COST(n) should not be changed.
49 However, EQUIV(n) may need tuning. */
50
51 typedef long COST;
52
53 #define MAXCOST (~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
54
55 #define SQR(n) ((n) * (n))
56 #define EQUIV(n) SQR ((COST) (n))
57
58 /* Cost of a filled line n chars longer or shorter than best_width. */
59 #define SHORT_COST(n) EQUIV ((n) * 10)
60
61 /* Cost of the difference between adjacent filled lines. */
62 #define RAGGED_COST(n) (SHORT_COST (n) / 2)
63
64 /* Basic cost per line. */
65 #define LINE_COST EQUIV (70)
66
67 /* Cost of breaking a line after the first word of a sentence, where
68 the length of the word is N. */
69 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
70
71 /* Cost of breaking a line before the last word of a sentence, where
72 the length of the word is N. */
73 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
74
75 /* Bonus for breaking a line at the end of a sentence. */
76 #define SENTENCE_BONUS EQUIV (50)
77
78 /* Cost of breaking a line after a period not marking end of a sentence.
79 With the definition of sentence we are using (borrowed from emacs, see
80 get_line()) such a break would then look like a sentence break. Hence
81 we assign a very high cost -- it should be avoided unless things are
82 really bad. */
83 #define NOBREAK_COST EQUIV (600)
84
85 /* Bonus for breaking a line before open parenthesis. */
86 #define PAREN_BONUS EQUIV (40)
87
88 /* Bonus for breaking a line after other punctuation. */
89 #define PUNCT_BONUS EQUIV(40)
90
91 /* Credit for breaking a long paragraph one line later. */
92 #define LINE_CREDIT EQUIV(3)
93
94 /* Size of paragraph buffer in words. Longer paragraphs are handled
95 neatly (cf. flush_paragraph()), so there's little to gain by making
96 these larger. */
97 #define MAXWORDS 1000
98
99 #define GETC() (parabuf == end_of_parabuf ? EOF : *parabuf++)
100
101 /* Extra ctype(3)-style macros. */
102
103 #define isopen(c) (strchr ("([`'\"", (c)) != NULL)
104 #define isclose(c) (strchr (")]'\"", (c)) != NULL)
105 #define isperiod(c) (strchr (".?!", (c)) != NULL)
106
107 /* Size of a tab stop, for expansion on input and re-introduction on
108 output. */
109 #define TABWIDTH 8
110
111 /* Word descriptor structure. */
112
113 typedef struct Word WORD;
114
115 struct Word
116 {
117
118 /* Static attributes determined during input. */
119
120 const char *text; /* the text of the word */
121 short length; /* length of this word */
122 short space; /* the size of the following space */
123 unsigned paren:1; /* starts with open paren */
124 unsigned period:1; /* ends in [.?!])* */
125 unsigned punct:1; /* ends in punctuation */
126 unsigned final:1; /* end of sentence */
127
128 /* The remaining fields are computed during the optimization. */
129
130 short line_length; /* length of the best line starting here */
131 COST best_cost; /* cost of best paragraph starting here */
132 WORD *next_break; /* break which achieves best_cost */
133 };
134
135 /* Forward declarations. */
136
137 static bool get_paragraph P_ ((void));
138 static int get_line P_ ((int c));
139 static int get_space P_ ((int c));
140 static int copy_rest P_ ((int c));
141 static bool same_para P_ ((int c));
142 static void flush_paragraph P_ ((void));
143 static void fmt_paragraph P_ ((void));
144 static void check_punctuation P_ ((WORD *w));
145 static COST base_cost P_ ((WORD *this));
146 static COST line_cost P_ ((WORD *next, int len));
147 static void put_paragraph P_ ((WORD *finish));
148 static void put_line P_ ((WORD *w, int indent));
149 static void put_word P_ ((WORD *w));
150 static void put_space P_ ((int space));
151
152 /* Option values. */
153
154 /* User-supplied maximum line width (default WIDTH). The only output
155 lines
156 longer than this will each comprise a single word. */
157 static int max_width;
158
159 /* Space for the paragraph text. */
160 static const char *parabuf;
161
162 /* End of space for the paragraph text. */
163 static const char *end_of_parabuf;
164
165 /* The file on which we output */
166 static FILE *outfile;
167
168 /* Values derived from the option values. */
169
170 /* The preferred width of text lines, set to LEEWAY % less than max_width. */
171 static int best_width;
172
173 /* Dynamic variables. */
174
175 /* Start column of the character most recently read from the input file. */
176 static int in_column;
177
178 /* Start column of the next character to be written to stdout. */
179 static int out_column;
180
181 /* The words of a paragraph -- longer paragraphs are handled neatly
182 (cf. flush_paragraph()). */
183 static WORD words[MAXWORDS];
184
185 /* A pointer into the above word array, indicating the first position
186 after the last complete word. Sometimes it will point at an incomplete
187 word. */
188 static WORD *word_limit;
189
190 /* Indentation of the first line of the current paragraph. */
191 static int first_indent;
192
193 /* Indentation of other lines of the current paragraph */
194 static int other_indent;
195
196 /* The last character read from the input file. */
197 static int next_char;
198
199 /* If nonzero, the length of the last line output in the current
200 paragraph, used to charge for raggedness at the split point for long
201 paragraphs chosen by fmt_paragraph(). */
202 static int last_line_length;
203
204 /* read file F and send formatted output to stdout. */
205
206 void
fmt(const char * line,const char * line_end,int max_length,FILE * output_file)207 fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
208 {
209 parabuf = line;
210 end_of_parabuf = line_end;
211 outfile = output_file;
212
213 max_width = max_length;
214 best_width = max_width * (201 - 2 * LEEWAY) / 200;
215
216 in_column = 0;
217 other_indent = 0;
218 next_char = GETC();
219 while (get_paragraph ())
220 {
221 fmt_paragraph ();
222 put_paragraph (word_limit);
223 }
224 }
225
226 /* Read a paragraph from input file F. A paragraph consists of a
227 maximal number of non-blank (excluding any prefix) lines
228 with the same indent.
229
230 Return false if end-of-file was encountered before the start of a
231 paragraph, else true. */
232
233 static bool
get_paragraph()234 get_paragraph ()
235 {
236 register int c;
237
238 last_line_length = 0;
239 c = next_char;
240
241 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
242
243 while (c == '\n' || c == EOF)
244 {
245 c = copy_rest (c);
246 if (c == EOF)
247 {
248 next_char = EOF;
249 return false;
250 }
251 putc ('\n', outfile);
252 c = GETC();
253 }
254
255 /* Got a suitable first line for a paragraph. */
256
257 first_indent = in_column;
258 word_limit = words;
259 c = get_line (c);
260
261 /* Read rest of paragraph. */
262
263 other_indent = in_column;
264 while (same_para (c) && in_column == other_indent)
265 c = get_line (c);
266
267 (word_limit - 1)->period = (word_limit - 1)->final = true;
268 next_char = c;
269 return true;
270 }
271
272 /* Copy to the output a blank line. In the latter, C is \n or EOF.
273 Return the character (\n or EOF) ending the line. */
274
275 static int
copy_rest(register int c)276 copy_rest (register int c)
277 {
278 out_column = 0;
279 while (c != '\n' && c != EOF)
280 {
281 putc (c, outfile);
282 c = GETC();
283 }
284 return c;
285 }
286
287 /* Return true if a line whose first non-blank character after the
288 prefix (if any) is C could belong to the current paragraph,
289 otherwise false. */
290
291 static bool
same_para(register int c)292 same_para (register int c)
293 {
294 return (c != '\n' && c != EOF);
295 }
296
297 /* Read a line from the input data given first non-blank character C
298 after the prefix, and the following indent, and break it into words.
299 A word is a maximal non-empty string of non-white characters. A word
300 ending in [.?!]["')\]]* and followed by end-of-line or at least two
301 spaces ends a sentence, as in emacs.
302
303 Return the first non-blank character of the next line. */
304
305 static int
get_line(register int c)306 get_line (register int c)
307 {
308 int start;
309 register WORD *end_of_word;
310
311 end_of_word = &words[MAXWORDS - 2];
312
313 do
314 { /* for each word in a line */
315
316 /* Scan word. */
317
318 word_limit->text = parabuf - 1;
319 do
320 c = GETC();
321 while (c != EOF && !ISSPACE (c));
322 word_limit->length = parabuf - word_limit->text - (c != EOF);
323 in_column += word_limit->length;
324
325 check_punctuation (word_limit);
326
327 /* Scan inter-word space. */
328
329 start = in_column;
330 c = get_space (c);
331 word_limit->space = in_column - start;
332 word_limit->final = (c == EOF
333 || (word_limit->period
334 && (c == '\n' || word_limit->space > 1)));
335 if (c == '\n' || c == EOF)
336 word_limit->space = word_limit->final ? 2 : 1;
337 if (word_limit == end_of_word)
338 flush_paragraph ();
339 word_limit++;
340 if (c == EOF)
341 {
342 in_column = first_indent;
343 return EOF;
344 }
345 }
346 while (c != '\n');
347
348 in_column = 0;
349 c = GETC();
350 return get_space (c);
351 }
352
353 /* Read blank characters from the input data, starting with C, and keeping
354 in_column up-to-date. Return first non-blank character. */
355
356 static int
get_space(register int c)357 get_space (register int c)
358 {
359 for (;;)
360 {
361 if (c == ' ')
362 in_column++;
363 else if (c == '\t')
364 in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
365 else
366 return c;
367 c = GETC();
368 }
369 }
370
371 /* Set extra fields in word W describing any attached punctuation. */
372
373 static void
check_punctuation(register WORD * w)374 check_punctuation (register WORD *w)
375 {
376 register const char *start, *finish;
377
378 start = w->text;
379 finish = start + (w->length - 1);
380 w->paren = isopen (*start);
381 w->punct = ISPUNCT (*finish);
382 while (isclose (*finish) && finish > start)
383 finish--;
384 w->period = isperiod (*finish);
385 }
386
387 /* Flush part of the paragraph to make room. This function is called on
388 hitting the limit on the number of words or characters. */
389
390 static void
flush_paragraph(void)391 flush_paragraph (void)
392 {
393 WORD *split_point;
394 register WORD *w;
395 COST best_break;
396
397 /* - format what you have so far as a paragraph,
398 - find a low-cost line break near the end,
399 - output to there,
400 - make that the start of the paragraph. */
401
402 fmt_paragraph ();
403
404 /* Choose a good split point. */
405
406 split_point = word_limit;
407 best_break = MAXCOST;
408 for (w = words->next_break; w != word_limit; w = w->next_break)
409 {
410 if (w->best_cost - w->next_break->best_cost < best_break)
411 {
412 split_point = w;
413 best_break = w->best_cost - w->next_break->best_cost;
414 }
415 if (best_break <= MAXCOST - LINE_CREDIT)
416 best_break += LINE_CREDIT;
417 }
418 put_paragraph (split_point);
419
420 /* Copy words from split_point down to word -- we use memmove because
421 the source and target may overlap. */
422
423 memmove ((char *) words, (char *) split_point,
424 (word_limit - split_point + 1) * sizeof (WORD));
425 word_limit -= split_point - words;
426 }
427
428 /* Compute the optimal formatting for the whole paragraph by computing
429 and remembering the optimal formatting for each suffix from the empty
430 one to the whole paragraph. */
431
432 static void
fmt_paragraph(void)433 fmt_paragraph (void)
434 {
435 register WORD *start, *w;
436 register int len;
437 register COST wcost, best;
438 int saved_length;
439
440 word_limit->best_cost = 0;
441 saved_length = word_limit->length;
442 word_limit->length = max_width; /* sentinel */
443
444 for (start = word_limit - 1; start >= words; start--)
445 {
446 best = MAXCOST;
447 len = start == words ? first_indent : other_indent;
448
449 /* At least one word, however long, in the line. */
450
451 w = start;
452 len += w->length;
453 do
454 {
455 w++;
456
457 /* Consider breaking before w. */
458
459 wcost = line_cost (w, len) + w->best_cost;
460 if (start == words && last_line_length > 0)
461 wcost += RAGGED_COST (len - last_line_length);
462 if (wcost < best)
463 {
464 best = wcost;
465 start->next_break = w;
466 start->line_length = len;
467 }
468 len += (w - 1)->space + w->length; /* w > start >= words */
469 }
470 while (len < max_width);
471 start->best_cost = best + base_cost (start);
472 }
473
474 word_limit->length = saved_length;
475 }
476
477 /* Return the constant component of the cost of breaking before the
478 word THIS. */
479
480 static COST
base_cost(register WORD * this)481 base_cost (register WORD *this)
482 {
483 register COST cost;
484
485 cost = LINE_COST;
486
487 if (this > words)
488 {
489 if ((this - 1)->period)
490 {
491 if ((this - 1)->final)
492 cost -= SENTENCE_BONUS;
493 else
494 cost += NOBREAK_COST;
495 }
496 else if ((this - 1)->punct)
497 cost -= PUNCT_BONUS;
498 else if (this > words + 1 && (this - 2)->final)
499 cost += WIDOW_COST ((this - 1)->length);
500 }
501
502 if (this->paren)
503 cost -= PAREN_BONUS;
504 else if (this->final)
505 cost += ORPHAN_COST (this->length);
506
507 return cost;
508 }
509
510 /* Return the component of the cost of breaking before word NEXT that
511 depends on LEN, the length of the line beginning there. */
512
513 static COST
line_cost(register WORD * next,register int len)514 line_cost (register WORD *next, register int len)
515 {
516 register int n;
517 register COST cost;
518
519 if (next == word_limit)
520 return 0;
521 n = best_width - len;
522 cost = SHORT_COST (n);
523 if (next->next_break != word_limit)
524 {
525 n = len - next->line_length;
526 cost += RAGGED_COST (n);
527 }
528 return cost;
529 }
530
531 /* Output to stdout a paragraph from word up to (but not including)
532 FINISH, which must be in the next_break chain from word. */
533
534 static void
put_paragraph(register WORD * finish)535 put_paragraph (register WORD *finish)
536 {
537 register WORD *w;
538
539 put_line (words, first_indent);
540 for (w = words->next_break; w != finish; w = w->next_break)
541 put_line (w, other_indent);
542 }
543
544 /* Output to stdout the line beginning with word W, beginning in column
545 INDENT, including the prefix (if any). */
546
547 static void
put_line(register WORD * w,int indent)548 put_line (register WORD *w, int indent)
549 {
550 register WORD *endline;
551 out_column = 0;
552 put_space (indent);
553
554 endline = w->next_break - 1;
555 for (; w != endline; w++)
556 {
557 put_word (w);
558 put_space (w->space);
559 }
560 put_word (w);
561 last_line_length = out_column;
562 putc ('\n', outfile);
563 }
564
565 /* Output to stdout the word W. */
566
567 static void
put_word(register WORD * w)568 put_word (register WORD *w)
569 {
570 register const char *s;
571 register int n;
572
573 s = w->text;
574 for (n = w->length; n != 0; n--)
575 putc (*s++, outfile);
576 out_column += w->length;
577 }
578
579 /* Output to stdout SPACE spaces, or equivalent tabs. */
580
581 static void
put_space(int space)582 put_space (int space)
583 {
584 out_column += space;
585 while (space--)
586 putc (' ', outfile);
587 }
588