1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22.
2    Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17 
18 /* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>.  */
19 
20 #include "sed.h"
21 
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <sys/types.h>
25 
26 #if HAVE_LIMITS_H
27 # include <limits.h>
28 #endif
29 
30 #ifndef UINT_MAX
31 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
32 #endif
33 
34 #ifndef INT_MAX
35 # define INT_MAX ((int) (UINT_MAX >> 1))
36 #endif
37 
38 /* The following parameters represent the program's idea of what is
39    "best".  Adjust to taste, subject to the caveats given.  */
40 
41 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
42    room for optimization.  */
43 #define	LEEWAY	7
44 
45 /* Costs and bonuses are expressed as the equivalent departure from the
46    optimal line length, multiplied by 10.  e.g. assigning something a
47    cost of 50 means that it is as bad as a line 5 characters too short
48    or too long.  The definition of SHORT_COST(n) should not be changed.
49    However, EQUIV(n) may need tuning.  */
50 
51 typedef long COST;
52 
53 #define	MAXCOST	(~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
54 
55 #define	SQR(n)		((n) * (n))
56 #define	EQUIV(n)	SQR ((COST) (n))
57 
58 /* Cost of a filled line n chars longer or shorter than best_width.  */
59 #define	SHORT_COST(n)	EQUIV ((n) * 10)
60 
61 /* Cost of the difference between adjacent filled lines.  */
62 #define	RAGGED_COST(n)	(SHORT_COST (n) / 2)
63 
64 /* Basic cost per line.  */
65 #define	LINE_COST	EQUIV (70)
66 
67 /* Cost of breaking a line after the first word of a sentence, where
68    the length of the word is N.  */
69 #define	WIDOW_COST(n)	(EQUIV (200) / ((n) + 2))
70 
71 /* Cost of breaking a line before the last word of a sentence, where
72    the length of the word is N.  */
73 #define	ORPHAN_COST(n)	(EQUIV (150) / ((n) + 2))
74 
75 /* Bonus for breaking a line at the end of a sentence.  */
76 #define	SENTENCE_BONUS	EQUIV (50)
77 
78 /* Cost of breaking a line after a period not marking end of a sentence.
79    With the definition of sentence we are using (borrowed from emacs, see
80    get_line()) such a break would then look like a sentence break.  Hence
81    we assign a very high cost -- it should be avoided unless things are
82    really bad.  */
83 #define	NOBREAK_COST	EQUIV (600)
84 
85 /* Bonus for breaking a line before open parenthesis.  */
86 #define	PAREN_BONUS	EQUIV (40)
87 
88 /* Bonus for breaking a line after other punctuation.  */
89 #define	PUNCT_BONUS	EQUIV(40)
90 
91 /* Credit for breaking a long paragraph one line later.  */
92 #define	LINE_CREDIT	EQUIV(3)
93 
94 /* Size of paragraph buffer in words.  Longer paragraphs are handled
95    neatly (cf. flush_paragraph()), so there's little to gain by making
96    these larger.  */
97 #define	MAXWORDS	1000
98 
99 #define GETC()          (parabuf == end_of_parabuf ? EOF : *parabuf++)
100 
101 /* Extra ctype(3)-style macros.  */
102 
103 #define	isopen(c)	(strchr ("([`'\"", (c)) != NULL)
104 #define	isclose(c)	(strchr (")]'\"", (c)) != NULL)
105 #define	isperiod(c)	(strchr (".?!", (c)) != NULL)
106 
107 /* Size of a tab stop, for expansion on input and re-introduction on
108    output.  */
109 #define	TABWIDTH	8
110 
111 /* Word descriptor structure.  */
112 
113 typedef struct Word WORD;
114 
115 struct Word
116   {
117 
118     /* Static attributes determined during input.  */
119 
120     const char *text;		/* the text of the word */
121     short length;		/* length of this word */
122     short space;		/* the size of the following space */
123     unsigned paren:1;		/* starts with open paren */
124     unsigned period:1;		/* ends in [.?!])* */
125     unsigned punct:1;		/* ends in punctuation */
126     unsigned final:1;		/* end of sentence */
127 
128     /* The remaining fields are computed during the optimization.  */
129 
130     short line_length;		/* length of the best line starting here */
131     COST best_cost;		/* cost of best paragraph starting here */
132     WORD *next_break;		/* break which achieves best_cost */
133   };
134 
135 /* Forward declarations.  */
136 
137 static bool get_paragraph P_ ((void));
138 static int get_line P_ ((int c));
139 static int get_space P_ ((int c));
140 static int copy_rest P_ ((int c));
141 static bool same_para P_ ((int c));
142 static void flush_paragraph P_ ((void));
143 static void fmt_paragraph P_ ((void));
144 static void check_punctuation P_ ((WORD *w));
145 static COST base_cost P_ ((WORD *this));
146 static COST line_cost P_ ((WORD *next, int len));
147 static void put_paragraph P_ ((WORD *finish));
148 static void put_line P_ ((WORD *w, int indent));
149 static void put_word P_ ((WORD *w));
150 static void put_space P_ ((int space));
151 
152 /* Option values.  */
153 
154 /* User-supplied maximum line width (default WIDTH).  The only output
155    lines
156    longer than this will each comprise a single word.  */
157 static int max_width;
158 
159 /* Space for the paragraph text.  */
160 static const char *parabuf;
161 
162 /* End of space for the paragraph text.  */
163 static const char *end_of_parabuf;
164 
165 /* The file on which we output */
166 static FILE *outfile;
167 
168 /* Values derived from the option values.  */
169 
170 /* The preferred width of text lines, set to LEEWAY % less than max_width.  */
171 static int best_width;
172 
173 /* Dynamic variables.  */
174 
175 /* Start column of the character most recently read from the input file.  */
176 static int in_column;
177 
178 /* Start column of the next character to be written to stdout.  */
179 static int out_column;
180 
181 /* The words of a paragraph -- longer paragraphs are handled neatly
182    (cf. flush_paragraph()).  */
183 static WORD words[MAXWORDS];
184 
185 /* A pointer into the above word array, indicating the first position
186    after the last complete word.  Sometimes it will point at an incomplete
187    word.  */
188 static WORD *word_limit;
189 
190 /* Indentation of the first line of the current paragraph.  */
191 static int first_indent;
192 
193 /* Indentation of other lines of the current paragraph */
194 static int other_indent;
195 
196 /* The last character read from the input file.  */
197 static int next_char;
198 
199 /* If nonzero, the length of the last line output in the current
200    paragraph, used to charge for raggedness at the split point for long
201    paragraphs chosen by fmt_paragraph().  */
202 static int last_line_length;
203 
204 /* read file F and send formatted output to stdout.  */
205 
206 void
fmt(const char * line,const char * line_end,int max_length,FILE * output_file)207 fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
208 {
209   parabuf = line;
210   end_of_parabuf = line_end;
211   outfile = output_file;
212 
213   max_width = max_length;
214   best_width = max_width * (201 - 2 * LEEWAY) / 200;
215 
216   in_column = 0;
217   other_indent = 0;
218   next_char = GETC();
219   while (get_paragraph ())
220     {
221       fmt_paragraph ();
222       put_paragraph (word_limit);
223     }
224 }
225 
226 /* Read a paragraph from input file F.  A paragraph consists of a
227    maximal number of non-blank (excluding any prefix) lines
228    with the same indent.
229 
230    Return false if end-of-file was encountered before the start of a
231    paragraph, else true.  */
232 
233 static bool
get_paragraph()234 get_paragraph ()
235 {
236   register int c;
237 
238   last_line_length = 0;
239   c = next_char;
240 
241   /* Scan (and copy) blank lines, and lines not introduced by the prefix.  */
242 
243   while (c == '\n' || c == EOF)
244     {
245       c = copy_rest (c);
246       if (c == EOF)
247 	{
248 	  next_char = EOF;
249 	  return false;
250 	}
251       putc ('\n', outfile);
252       c = GETC();
253     }
254 
255   /* Got a suitable first line for a paragraph.  */
256 
257   first_indent = in_column;
258   word_limit = words;
259   c = get_line (c);
260 
261   /* Read rest of paragraph.  */
262 
263   other_indent = in_column;
264   while (same_para (c) && in_column == other_indent)
265     c = get_line (c);
266 
267   (word_limit - 1)->period = (word_limit - 1)->final = true;
268   next_char = c;
269   return true;
270 }
271 
272 /* Copy to the output a blank line.  In the latter, C is \n or EOF.
273    Return the character (\n or EOF) ending the line.  */
274 
275 static int
copy_rest(register int c)276 copy_rest (register int c)
277 {
278   out_column = 0;
279   while (c != '\n' && c != EOF)
280     {
281       putc (c, outfile);
282       c = GETC();
283     }
284   return c;
285 }
286 
287 /* Return true if a line whose first non-blank character after the
288    prefix (if any) is C could belong to the current paragraph,
289    otherwise false.  */
290 
291 static bool
same_para(register int c)292 same_para (register int c)
293 {
294   return (c != '\n' && c != EOF);
295 }
296 
297 /* Read a line from the input data given first non-blank character C
298    after the prefix, and the following indent, and break it into words.
299    A word is a maximal non-empty string of non-white characters.  A word
300    ending in [.?!]["')\]]* and followed by end-of-line or at least two
301    spaces ends a sentence, as in emacs.
302 
303    Return the first non-blank character of the next line.  */
304 
305 static int
get_line(register int c)306 get_line (register int c)
307 {
308   int start;
309   register WORD *end_of_word;
310 
311   end_of_word = &words[MAXWORDS - 2];
312 
313   do
314     {				/* for each word in a line */
315 
316       /* Scan word.  */
317 
318       word_limit->text = parabuf - 1;
319       do
320 	c = GETC();
321       while (c != EOF && !ISSPACE (c));
322       word_limit->length = parabuf - word_limit->text - (c != EOF);
323       in_column += word_limit->length;
324 
325       check_punctuation (word_limit);
326 
327       /* Scan inter-word space.  */
328 
329       start = in_column;
330       c = get_space (c);
331       word_limit->space = in_column - start;
332       word_limit->final = (c == EOF
333 			   || (word_limit->period
334 			       && (c == '\n' || word_limit->space > 1)));
335       if (c == '\n' || c == EOF)
336 	word_limit->space = word_limit->final ? 2 : 1;
337       if (word_limit == end_of_word)
338 	flush_paragraph ();
339       word_limit++;
340       if (c == EOF)
341 	{
342 	  in_column = first_indent;
343 	  return EOF;
344 	}
345     }
346   while (c != '\n');
347 
348   in_column = 0;
349   c = GETC();
350   return get_space (c);
351 }
352 
353 /* Read blank characters from the input data, starting with C, and keeping
354    in_column up-to-date.  Return first non-blank character.  */
355 
356 static int
get_space(register int c)357 get_space (register int c)
358 {
359   for (;;)
360     {
361       if (c == ' ')
362 	in_column++;
363       else if (c == '\t')
364 	in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
365       else
366 	return c;
367       c = GETC();
368     }
369 }
370 
371 /* Set extra fields in word W describing any attached punctuation.  */
372 
373 static void
check_punctuation(register WORD * w)374 check_punctuation (register WORD *w)
375 {
376   register const char *start, *finish;
377 
378   start = w->text;
379   finish = start + (w->length - 1);
380   w->paren = isopen (*start);
381   w->punct = ISPUNCT (*finish);
382   while (isclose (*finish) && finish > start)
383     finish--;
384   w->period = isperiod (*finish);
385 }
386 
387 /* Flush part of the paragraph to make room.  This function is called on
388    hitting the limit on the number of words or characters.  */
389 
390 static void
flush_paragraph(void)391 flush_paragraph (void)
392 {
393   WORD *split_point;
394   register WORD *w;
395   COST best_break;
396 
397   /* - format what you have so far as a paragraph,
398      - find a low-cost line break near the end,
399      - output to there,
400      - make that the start of the paragraph.  */
401 
402   fmt_paragraph ();
403 
404   /* Choose a good split point.  */
405 
406   split_point = word_limit;
407   best_break = MAXCOST;
408   for (w = words->next_break; w != word_limit; w = w->next_break)
409     {
410       if (w->best_cost - w->next_break->best_cost < best_break)
411 	{
412 	  split_point = w;
413 	  best_break = w->best_cost - w->next_break->best_cost;
414 	}
415       if (best_break <= MAXCOST - LINE_CREDIT)
416 	best_break += LINE_CREDIT;
417     }
418   put_paragraph (split_point);
419 
420   /* Copy words from split_point down to word -- we use memmove because
421      the source and target may overlap.  */
422 
423   memmove ((char *) words, (char *) split_point,
424 	 (word_limit - split_point + 1) * sizeof (WORD));
425   word_limit -= split_point - words;
426 }
427 
428 /* Compute the optimal formatting for the whole paragraph by computing
429    and remembering the optimal formatting for each suffix from the empty
430    one to the whole paragraph.  */
431 
432 static void
fmt_paragraph(void)433 fmt_paragraph (void)
434 {
435   register WORD *start, *w;
436   register int len;
437   register COST wcost, best;
438   int saved_length;
439 
440   word_limit->best_cost = 0;
441   saved_length = word_limit->length;
442   word_limit->length = max_width;	/* sentinel */
443 
444   for (start = word_limit - 1; start >= words; start--)
445     {
446       best = MAXCOST;
447       len = start == words ? first_indent : other_indent;
448 
449       /* At least one word, however long, in the line.  */
450 
451       w = start;
452       len += w->length;
453       do
454 	{
455 	  w++;
456 
457 	  /* Consider breaking before w.  */
458 
459 	  wcost = line_cost (w, len) + w->best_cost;
460 	  if (start == words && last_line_length > 0)
461 	    wcost += RAGGED_COST (len - last_line_length);
462 	  if (wcost < best)
463 	    {
464 	      best = wcost;
465 	      start->next_break = w;
466 	      start->line_length = len;
467 	    }
468 	  len += (w - 1)->space + w->length;	/* w > start >= words */
469 	}
470       while (len < max_width);
471       start->best_cost = best + base_cost (start);
472     }
473 
474   word_limit->length = saved_length;
475 }
476 
477 /* Return the constant component of the cost of breaking before the
478    word THIS.  */
479 
480 static COST
base_cost(register WORD * this)481 base_cost (register WORD *this)
482 {
483   register COST cost;
484 
485   cost = LINE_COST;
486 
487   if (this > words)
488     {
489       if ((this - 1)->period)
490 	{
491 	  if ((this - 1)->final)
492 	    cost -= SENTENCE_BONUS;
493 	  else
494 	    cost += NOBREAK_COST;
495 	}
496       else if ((this - 1)->punct)
497 	cost -= PUNCT_BONUS;
498       else if (this > words + 1 && (this - 2)->final)
499 	cost += WIDOW_COST ((this - 1)->length);
500     }
501 
502   if (this->paren)
503     cost -= PAREN_BONUS;
504   else if (this->final)
505     cost += ORPHAN_COST (this->length);
506 
507   return cost;
508 }
509 
510 /* Return the component of the cost of breaking before word NEXT that
511    depends on LEN, the length of the line beginning there.  */
512 
513 static COST
line_cost(register WORD * next,register int len)514 line_cost (register WORD *next, register int len)
515 {
516   register int n;
517   register COST cost;
518 
519   if (next == word_limit)
520     return 0;
521   n = best_width - len;
522   cost = SHORT_COST (n);
523   if (next->next_break != word_limit)
524     {
525       n = len - next->line_length;
526       cost += RAGGED_COST (n);
527     }
528   return cost;
529 }
530 
531 /* Output to stdout a paragraph from word up to (but not including)
532    FINISH, which must be in the next_break chain from word.  */
533 
534 static void
put_paragraph(register WORD * finish)535 put_paragraph (register WORD *finish)
536 {
537   register WORD *w;
538 
539   put_line (words, first_indent);
540   for (w = words->next_break; w != finish; w = w->next_break)
541     put_line (w, other_indent);
542 }
543 
544 /* Output to stdout the line beginning with word W, beginning in column
545    INDENT, including the prefix (if any).  */
546 
547 static void
put_line(register WORD * w,int indent)548 put_line (register WORD *w, int indent)
549 {
550   register WORD *endline;
551   out_column = 0;
552   put_space (indent);
553 
554   endline = w->next_break - 1;
555   for (; w != endline; w++)
556     {
557       put_word (w);
558       put_space (w->space);
559     }
560   put_word (w);
561   last_line_length = out_column;
562   putc ('\n', outfile);
563 }
564 
565 /* Output to stdout the word W.  */
566 
567 static void
put_word(register WORD * w)568 put_word (register WORD *w)
569 {
570   register const char *s;
571   register int n;
572 
573   s = w->text;
574   for (n = w->length; n != 0; n--)
575     putc (*s++, outfile);
576   out_column += w->length;
577 }
578 
579 /* Output to stdout SPACE spaces, or equivalent tabs.  */
580 
581 static void
put_space(int space)582 put_space (int space)
583 {
584   out_column += space;
585   while (space--)
586     putc (' ', outfile);
587 }
588