1 /* $OpenBSD: fmt.c,v 1.39 2018/10/18 05:04:52 otto Exp $ */ 2 /* 3 * This file is a derived work. 4 * The changes are covered by the following Copyright and license: 5 * 6 * Copyright (c) 2015, 2016 Ingo Schwarze <schwarze@openbsd.org> 7 * Copyright (c) 2000 Paul Janzen <pjanzen@foatdi.net> 8 * 9 * Permission to use, copy, modify, and distribute this software for any 10 * purpose with or without fee is hereby granted, provided that the above 11 * copyright notice and this permission notice appear in all copies. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 14 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 15 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 16 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 17 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 18 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 19 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 20 * 21 * 22 * The unchanged parts are covered by the following Copyright and license: 23 * 24 * Copyright (c) 1997 Gareth McCaughan. All rights reserved. 25 * 26 * Redistribution and use of this code, in source or binary forms, 27 * with or without modification, are permitted subject to the following 28 * conditions: 29 * 30 * - Redistribution of source code must retain the above copyright 31 * notice, this list of conditions and the following disclaimer. 32 * 33 * - If you distribute modified source code it must also include 34 * a notice saying that it has been modified, and giving a brief 35 * description of what changes have been made. 36 * 37 * Disclaimer: I am not responsible for the results of using this code. 38 * If it formats your hard disc, sends obscene messages to 39 * your boss and kills your children then that's your problem 40 * not mine. I give absolutely no warranty of any sort as to 41 * what the program will do, and absolutely refuse to be held 42 * liable for any consequences of your using it. 43 * Thank you. Have a nice day. 44 * 45 * 46 * Brief overview of the changes made by OpenBSD: 47 * Added UTF-8 support (2016). 48 * Added pledge(2) support (2015). 49 * ANSI function syntax and KNF (2004). 50 * Added -w option (2000). 51 * Some minor changes can be seen in the public OpenBSD CVS repository. 52 */ 53 54 /* Sensible version of fmt 55 * 56 * Syntax: fmt [ options ] [ goal [ max ] ] [ filename ... ] 57 * 58 * Since the documentation for the original fmt is so poor, here 59 * is an accurate description of what this one does. It's usually 60 * the same. The *mechanism* used may differ from that suggested 61 * here. Note that we are *not* entirely compatible with fmt, 62 * because fmt gets so many things wrong. 63 * 64 * 1. Tabs are expanded, assuming 8-space tab stops. 65 * If the `-t <n>' option is given, we assume <n>-space 66 * tab stops instead. 67 * Trailing blanks are removed from all lines. 68 * x\b == nothing, for any x other than \b. 69 * Other control characters are simply stripped. This 70 * includes \r. 71 * 2. Each line is split into leading whitespace and 72 * everything else. Maximal consecutive sequences of 73 * lines with the same leading whitespace are considered 74 * to form paragraphs, except that a blank line is always 75 * a paragraph to itself. 76 * If the `-p' option is given then the first line of a 77 * paragraph is permitted to have indentation different 78 * from that of the other lines. 79 * If the `-m' option is given then a line that looks 80 * like a mail message header, if it is not immediately 81 * preceded by a non-blank non-message-header line, is 82 * taken to start a new paragraph, which also contains 83 * any subsequent lines with non-empty leading whitespace. 84 * Unless the `-n' option is given, lines beginning with 85 * a . (dot) are not formatted. 86 * 3. The "everything else" is split into words; a word 87 * includes its trailing whitespace, and a word at the 88 * end of a line is deemed to be followed by a single 89 * space, or two spaces if it ends with a sentence-end 90 * character. (See the `-d' option for how to change that.) 91 * If the `-s' option has been given, then a word's trailing 92 * whitespace is replaced by what it would have had if it 93 * had occurred at end of line. 94 * 4. Each paragraph is sent to standard output as follows. 95 * We output the leading whitespace, and then enough words 96 * to make the line length as near as possible to the goal 97 * without exceeding the maximum. (If a single word would 98 * exceed the maximum, we output that anyway.) Of course 99 * the trailing whitespace of the last word is ignored. 100 * We then emit a newline and start again if there are any 101 * words left. 102 * Note that for a blank line this translates as "We emit 103 * a newline". 104 * If the `-l <n>' option is given, then leading whitespace 105 * is modified slightly: <n> spaces are replaced by a tab. 106 * Indented paragraphs (see above under `-p') make matters 107 * more complicated than this suggests. Actually every paragraph 108 * has two `leading whitespace' values; the value for the first 109 * line, and the value for the most recent line. (While processing 110 * the first line, the two are equal. When `-p' has not been 111 * given, they are always equal.) The leading whitespace 112 * actually output is that of the first line (for the first 113 * line of *output*) or that of the most recent line (for 114 * all other lines of output). 115 * When `-m' has been given, message header paragraphs are 116 * taken as having first-leading-whitespace empty and 117 * subsequent-leading-whitespace two spaces. 118 * 119 * Multiple input files are formatted one at a time, so that a file 120 * never ends in the middle of a line. 121 * 122 * There's an alternative mode of operation, invoked by giving 123 * the `-c' option. In that case we just center every line, 124 * and most of the other options are ignored. This should 125 * really be in a separate program, but we must stay compatible 126 * with old `fmt'. 127 * 128 * QUERY: Should `-m' also try to do the right thing with quoted text? 129 * QUERY: `-b' to treat backslashed whitespace as old `fmt' does? 130 * QUERY: Option meaning `never join lines'? 131 * QUERY: Option meaning `split in mid-word to avoid overlong lines'? 132 * (Those last two might not be useful, since we have `fold'.) 133 * 134 * Differences from old `fmt': 135 * 136 * - We have many more options. Options that aren't understood 137 * generate a lengthy usage message, rather than being 138 * treated as filenames. 139 * - Even with `-m', our handling of message headers is 140 * significantly different. (And much better.) 141 * - We don't treat `\ ' as non-word-breaking. 142 * - Downward changes of indentation start new paragraphs 143 * for us, as well as upward. (I think old `fmt' behaves 144 * in the way it does in order to allow indented paragraphs, 145 * but this is a broken way of making indented paragraphs 146 * behave right.) 147 * - Given the choice of going over or under |goal_length| 148 * by the same amount, we go over; old `fmt' goes under. 149 * - We treat `?' as ending a sentence, and not `:'. Old `fmt' 150 * does the reverse. 151 * - We return approved return codes. Old `fmt' returns 152 * 1 for some errors, and *the number of unopenable files* 153 * when that was all that went wrong. 154 * - We have fewer crashes and more helpful error messages. 155 * - We don't turn spaces into tabs at starts of lines unless 156 * specifically requested. 157 * - New `fmt' is somewhat smaller and slightly faster than 158 * old `fmt'. 159 * 160 * Bugs: 161 * 162 * None known. There probably are some, though. 163 * 164 * Portability: 165 * 166 * I believe this code to be pretty portable. It does require 167 * that you have `getopt'. If you need to include "getopt.h" 168 * for this (e.g., if your system didn't come with `getopt' 169 * and you installed it yourself) then you should arrange for 170 * NEED_getopt_h to be #defined. 171 * 172 * Everything here should work OK even on nasty 16-bit 173 * machines and nice 64-bit ones. However, it's only really 174 * been tested on my FreeBSD machine. Your mileage may vary. 175 */ 176 177 #include <ctype.h> 178 #include <err.h> 179 #include <locale.h> 180 #include <stdio.h> 181 #include <stdlib.h> 182 #include <string.h> 183 #include <unistd.h> 184 #include <wchar.h> 185 #include <wctype.h> 186 187 /* Something that, we hope, will never be a genuine line length, 188 * indentation etc. 189 */ 190 #define SILLY ((size_t)-1) 191 192 /* I used to use |strtoul| for this, but (1) not all systems have it 193 * and (2) it's probably better to use |strtol| to detect negative 194 * numbers better. 195 * If |fussyp==0| then we don't complain about non-numbers 196 * (returning 0 instead), but we do complain about bad numbers. 197 */ 198 static size_t 199 get_positive(const char *s, const char *err_mess, int fussyP) 200 { 201 char *t; 202 long result = strtol(s, &t, 0); 203 204 if (*t) { 205 if (fussyP) 206 goto Lose; 207 else 208 return 0; 209 } 210 if (result <= 0) { 211 Lose: 212 errx(1, "%s", err_mess); 213 } 214 215 return (size_t) result; 216 } 217 218 /* Global variables */ 219 220 static int centerP = 0; /* Try to center lines? */ 221 static size_t goal_length = 0; /* Target length for output lines */ 222 static size_t max_length = 0; /* Maximum length for output lines */ 223 static int coalesce_spaces_P = 0; /* Coalesce multiple whitespace -> ' ' ? */ 224 static int allow_indented_paragraphs = 0; /* Can first line have diff. ind.? */ 225 static int tab_width = 8; /* Number of spaces per tab stop */ 226 static size_t output_tab_width = 0; /* Ditto, when squashing leading spaces */ 227 static const char *sentence_enders = ".?!"; /* Double-space after these */ 228 static int grok_mail_headers = 0; /* treat embedded mail headers magically? */ 229 static int format_troff = 0; /* Format troff? */ 230 231 static int n_errors = 0; /* Number of failed files. */ 232 static size_t x; /* Horizontal position in output line */ 233 static size_t x0; /* Ditto, ignoring leading whitespace */ 234 static size_t pending_spaces; /* Spaces to add before next word */ 235 static int output_in_paragraph = 0; /* Any of current para written out yet? */ 236 237 /* Prototypes */ 238 239 static void process_named_file(const char *); 240 static void process_stream(FILE *, const char *); 241 static size_t indent_length(const char *); 242 static int might_be_header(const char *); 243 static void new_paragraph(size_t); 244 static void output_word(size_t, size_t, const char *, int, int, int); 245 static void output_indent(size_t); 246 static void center_stream(FILE *, const char *); 247 static char *get_line(FILE *); 248 static void *xreallocarray(void *, size_t, size_t); 249 void usage(void); 250 251 #define ERRS(x) (x >= 127 ? 127 : ++x) 252 253 /* Here is perhaps the right place to mention that this code is 254 * all in top-down order. Hence, |main| comes first. 255 */ 256 int 257 main(int argc, char *argv[]) 258 { 259 int ch; /* used for |getopt| processing */ 260 261 (void)setlocale(LC_CTYPE, ""); 262 263 if (pledge("stdio rpath", NULL) == -1) 264 err(1, "pledge"); 265 266 /* 1. Grok parameters. */ 267 while ((ch = getopt(argc, argv, "0123456789cd:hl:mnpst:w:")) != -1) { 268 switch (ch) { 269 case 'c': 270 centerP = 1; 271 break; 272 case 'd': 273 sentence_enders = optarg; 274 break; 275 case 'l': 276 output_tab_width 277 = get_positive(optarg, "output tab width must be positive", 1); 278 break; 279 case 'm': 280 grok_mail_headers = 1; 281 break; 282 case 'n': 283 format_troff = 1; 284 break; 285 case 'p': 286 allow_indented_paragraphs = 1; 287 break; 288 case 's': 289 coalesce_spaces_P = 1; 290 break; 291 case 't': 292 tab_width = get_positive(optarg, "tab width must be positive", 1); 293 break; 294 case 'w': 295 goal_length = get_positive(optarg, "width must be positive", 1); 296 max_length = goal_length; 297 break; 298 case '0': case '1': case '2': case '3': case '4': case '5': 299 case '6': case '7': case '8': case '9': 300 /* XXX this is not a stylistically approved use of getopt() */ 301 if (goal_length == 0) { 302 char *p; 303 304 p = argv[optind - 1]; 305 if (p[0] == '-' && p[1] == ch && !p[2]) 306 goal_length = get_positive(++p, "width must be nonzero", 1); 307 else 308 goal_length = get_positive(argv[optind]+1, 309 "width must be nonzero", 1); 310 max_length = goal_length; 311 } 312 break; 313 case 'h': 314 default: 315 usage(); 316 /* NOT REACHED */ 317 } 318 } 319 320 argc -= optind; 321 argv += optind; 322 323 /* [ goal [ maximum ] ] */ 324 if (argc > 0 && goal_length == 0 && 325 (goal_length = get_positive(*argv,"goal length must be positive", 0)) != 0) { 326 --argc; 327 ++argv; 328 if (argc > 0 && (max_length = get_positive(*argv,"max length must be positive", 0)) != 0) { 329 --argc; 330 ++argv; 331 if (max_length < goal_length) 332 errx(1, "max length must be >= goal length"); 333 } 334 } 335 336 if (goal_length == 0) 337 goal_length = 65; 338 if (max_length == 0) 339 max_length = goal_length+10; 340 341 /* 2. Process files. */ 342 343 if (argc > 0) { 344 while (argc-- > 0) 345 process_named_file(*argv++); 346 } else { 347 if (pledge("stdio", NULL) == -1) 348 err(1, "pledge"); 349 process_stream(stdin, "standard input"); 350 } 351 352 /* We're done. */ 353 return n_errors; 354 355 } 356 357 /* Process a single file, given its name. 358 */ 359 static void 360 process_named_file(const char *name) 361 { 362 FILE *f; 363 364 if ((f = fopen(name, "r")) == NULL) { 365 warn("%s", name); 366 ERRS(n_errors); 367 } else { 368 process_stream(f, name); 369 fclose(f); 370 } 371 } 372 373 /* Types of mail header continuation lines: 374 */ 375 typedef enum { 376 hdr_ParagraphStart = -1, 377 hdr_NonHeader = 0, 378 hdr_Header = 1, 379 hdr_Continuation = 2 380 } HdrType; 381 382 /* Process a stream. This is where the real work happens, 383 * except that centering is handled separately. 384 */ 385 static void 386 process_stream(FILE *stream, const char *name) 387 { 388 const char *wordp, *cp; 389 wchar_t wc; 390 size_t np; 391 size_t last_indent = SILLY; /* how many spaces in last indent? */ 392 size_t para_line_number = 0; /* how many lines already read in this para? */ 393 size_t first_indent = SILLY; /* indentation of line 0 of paragraph */ 394 int wcl; /* number of bytes in wide character */ 395 int wcw; /* display width of wide character */ 396 int word_length; /* number of bytes in word */ 397 int word_width; /* display width of word */ 398 int space_width; /* display width of space after word */ 399 int line_width; /* display width of line */ 400 HdrType prev_header_type = hdr_ParagraphStart; 401 HdrType header_type; 402 403 /* ^-- header_type of previous line; -1 at para start */ 404 const char *line; 405 406 if (centerP) { 407 center_stream(stream, name); 408 return; 409 } 410 411 while ((line = get_line(stream)) != NULL) { 412 np = indent_length(line); 413 header_type = hdr_NonHeader; 414 if (grok_mail_headers && prev_header_type != hdr_NonHeader) { 415 if (np == 0 && might_be_header(line)) 416 header_type = hdr_Header; 417 else if (np > 0 && prev_header_type>hdr_NonHeader) 418 header_type = hdr_Continuation; 419 } 420 421 /* We need a new paragraph if and only if: 422 * this line is blank, 423 * OR it's a troff request, 424 * OR it's a mail header, 425 * OR it's not a mail header AND the last line was one, 426 * OR the indentation has changed 427 * AND the line isn't a mail header continuation line 428 * AND this isn't the second line of an indented paragraph. 429 */ 430 if (*line == '\0' || (*line == '.' && !format_troff) || 431 header_type == hdr_Header || 432 (header_type == hdr_NonHeader && prev_header_type > hdr_NonHeader) || 433 (np != last_indent && header_type != hdr_Continuation && 434 (!allow_indented_paragraphs || para_line_number != 1)) ) { 435 new_paragraph(np); 436 para_line_number = 0; 437 first_indent = np; 438 last_indent = np; 439 440 /* nroff compatibility */ 441 if (*line == '.' && !format_troff) { 442 puts(line); 443 continue; 444 } 445 if (header_type == hdr_Header) 446 last_indent = 2; /* for cont. lines */ 447 if (*line == '\0') { 448 putchar('\n'); 449 prev_header_type = hdr_ParagraphStart; 450 continue; 451 } else { 452 /* If this is an indented paragraph other than a mail header 453 * continuation, set |last_indent|. 454 */ 455 if (np != last_indent && header_type != hdr_Continuation) 456 last_indent = np; 457 } 458 prev_header_type = header_type; 459 } 460 461 line_width = np; 462 for (wordp = line; *wordp != '\0'; wordp = cp) { 463 word_length = 0; 464 word_width = space_width = 0; 465 for (cp = wordp; *cp != '\0'; cp += wcl) { 466 wcl = mbtowc(&wc, cp, MB_CUR_MAX); 467 if (wcl == -1) { 468 (void)mbtowc(NULL, NULL, MB_CUR_MAX); 469 wc = L'?'; 470 wcl = 1; 471 wcw = 1; 472 } else if (wc == L'\t') 473 wcw = (line_width / tab_width + 1) * 474 tab_width - line_width; 475 else if ((wcw = wcwidth(wc)) == -1) 476 wcw = 1; 477 if (iswblank(wc) && wc != 0xa0) { 478 /* Skip whitespace at start of line. */ 479 if (word_length == 0) { 480 wordp += wcl; 481 continue; 482 } 483 /* Count whitespace after word. */ 484 space_width += wcw; 485 } else { 486 /* Detect end of word. */ 487 if (space_width > 0) 488 break; 489 /* Measure word. */ 490 word_length += wcl; 491 word_width += wcw; 492 } 493 line_width += wcw; 494 } 495 496 /* Send the word to the output machinery. */ 497 output_word(first_indent, last_indent, wordp, 498 word_length, word_width, space_width); 499 } 500 ++para_line_number; 501 } 502 503 new_paragraph(0); 504 if (ferror(stream)) { 505 warn("%s", name); 506 ERRS(n_errors); 507 } 508 } 509 510 /* How long is the indent on this line? 511 */ 512 static size_t 513 indent_length(const char *line) 514 { 515 size_t n = 0; 516 517 for (;;) { 518 switch(*line++) { 519 case ' ': 520 ++n; 521 continue; 522 case '\t': 523 n = (n / tab_width + 1) * tab_width; 524 continue; 525 default: 526 break; 527 } 528 break; 529 } 530 return n; 531 } 532 533 /* Might this line be a mail header? 534 * We deem a line to be a possible header if it matches the 535 * Perl regexp /^[A-Z][-A-Za-z0-9]*:\s/. This is *not* the same 536 * as in RFC whatever-number-it-is; we want to be gratuitously 537 * conservative to avoid mangling ordinary civilised text. 538 */ 539 static int 540 might_be_header(const char *line) 541 { 542 543 if (!isupper((unsigned char)*line++)) 544 return 0; 545 while (isalnum((unsigned char)*line) || *line == '-') 546 ++line; 547 return (*line == ':' && isspace((unsigned char)line[1])); 548 } 549 550 /* Begin a new paragraph with an indent of |indent| spaces. 551 */ 552 static void 553 new_paragraph(size_t indent) 554 { 555 556 if (x0 > 0) 557 putchar('\n'); 558 x = indent; 559 x0 = 0; 560 pending_spaces = 0; 561 output_in_paragraph = 0; 562 } 563 564 /* Output spaces or tabs for leading indentation. 565 */ 566 static void 567 output_indent(size_t n_spaces) 568 { 569 570 if (n_spaces == 0) 571 return; 572 if (output_tab_width) { 573 while (n_spaces >= output_tab_width) { 574 putchar('\t'); 575 n_spaces -= output_tab_width; 576 } 577 } 578 while (n_spaces-- > 0) 579 putchar(' '); 580 } 581 582 /* Output a single word. 583 * indent0 and indent1 are the indents to use on the first and subsequent 584 * lines of a paragraph. They'll often be the same, of course. 585 */ 586 static void 587 output_word(size_t indent0, size_t indent1, const char *word, 588 int length, int width, int spaces) 589 { 590 size_t new_x = x + pending_spaces + width; 591 592 /* If either |spaces==0| (at end of line) or |coalesce_spaces_P| 593 * (squashing internal whitespace), then add just one space; 594 * except that if the last character was a sentence-ender we 595 * actually add two spaces. 596 */ 597 if (coalesce_spaces_P || spaces == 0) 598 spaces = strchr(sentence_enders, word[length-1]) ? 2 : 1; 599 600 if (x0 == 0) 601 output_indent(output_in_paragraph ? indent1 : indent0); 602 else if (new_x > max_length || x >= goal_length || 603 (new_x > goal_length && new_x-goal_length > goal_length-x)) { 604 putchar('\n'); 605 output_indent(indent1); 606 x0 = 0; 607 x = indent1; 608 } else { 609 x0 += pending_spaces; 610 x += pending_spaces; 611 while (pending_spaces--) 612 putchar(' '); 613 } 614 x0 += width; 615 x += width; 616 while(length--) 617 putchar(*word++); 618 pending_spaces = spaces; 619 output_in_paragraph = 1; 620 } 621 622 /* Process a stream, but just center its lines rather than trying to 623 * format them neatly. 624 */ 625 static void 626 center_stream(FILE *stream, const char *name) 627 { 628 char *line, *cp; 629 wchar_t wc; 630 size_t l; /* Display width of the line. */ 631 int wcw; /* Display width of one character. */ 632 int wcl; /* Length in bytes of one character. */ 633 634 while ((line = get_line(stream)) != NULL) { 635 l = 0; 636 for (cp = line; *cp != '\0'; cp += wcl) { 637 if (*cp == '\t') 638 *cp = ' '; 639 if ((wcl = mbtowc(&wc, cp, MB_CUR_MAX)) == -1) { 640 (void)mbtowc(NULL, NULL, MB_CUR_MAX); 641 *cp = '?'; 642 wcl = 1; 643 wcw = 1; 644 } else if ((wcw = wcwidth(wc)) == -1) 645 wcw = 1; 646 if (l == 0 && iswspace(wc)) 647 line += wcl; 648 else 649 l += wcw; 650 } 651 while (l < goal_length) { 652 putchar(' '); 653 l += 2; 654 } 655 puts(line); 656 } 657 658 if (ferror(stream)) { 659 warn("%s", name); 660 ERRS(n_errors); 661 } 662 } 663 664 /* Get a single line from a stream. Strip control 665 * characters and trailing whitespace, and handle backspaces. 666 * Return the address of the buffer containing the line. 667 * This can cope with arbitrarily long lines, and with lines 668 * without terminating \n. 669 * If there are no characters left or an error happens, we 670 * return NULL. 671 */ 672 static char * 673 get_line(FILE *stream) 674 { 675 int ch; 676 int troff = 0; 677 static char *buf = NULL; 678 static size_t length = 0; 679 size_t len = 0; 680 681 if (buf == NULL) { 682 length = 100; 683 buf = xreallocarray(NULL, length, 1); 684 } 685 686 while ((ch = getc(stream)) != '\n' && ch != EOF) { 687 if ((len == 0) && (ch == '.' && !format_troff)) 688 troff = 1; 689 if (troff || ch == '\t' || !iscntrl(ch)) { 690 if (len >= length - 1) { 691 buf = xreallocarray(buf, length, 2); 692 length *= 2; 693 } 694 buf[len++] = ch; 695 } else if (ch == '\b') { 696 if (len) 697 --len; 698 } 699 } 700 while (len > 0 && isspace((unsigned char)buf[len-1])) 701 --len; 702 buf[len] = '\0'; 703 return (len > 0 || ch != EOF) ? buf : NULL; 704 } 705 706 /* (Re)allocate some memory, exiting with an error if we can't. 707 */ 708 static void * 709 xreallocarray(void *ptr, size_t nmemb, size_t size) 710 { 711 void *p; 712 713 p = reallocarray(ptr, nmemb, size); 714 if (p == NULL) 715 errx(1, "out of memory"); 716 return p; 717 } 718 719 void 720 usage(void) 721 { 722 extern char *__progname; 723 724 fprintf(stderr, 725 "usage: %s [-cmnps] [-d chars] [-l number] [-t number]\n" 726 "\t[goal [maximum] | -width | -w width] [file ...]\n", 727 __progname); 728 exit (1); 729 } 730