1.\" Id: mandoc.3,v 1.22 2013/10/06 17:01:52 schwarze Exp 2.\" 3.\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4.\" Copyright (c) 2010 Ingo Schwarze <schwarze@openbsd.org> 5.\" 6.\" Permission to use, copy, modify, and distribute this software for any 7.\" purpose with or without fee is hereby granted, provided that the above 8.\" copyright notice and this permission notice appear in all copies. 9.\" 10.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17.\" 18.Dd October 6, 2013 19.Dt MANDOC 3 20.Os 21.Sh NAME 22.Nm mandoc , 23.Nm mandoc_escape , 24.Nm man_meta , 25.Nm man_mparse , 26.Nm man_node , 27.Nm mchars_alloc , 28.Nm mchars_free , 29.Nm mchars_num2char , 30.Nm mchars_num2uc , 31.Nm mchars_spec2cp , 32.Nm mchars_spec2str , 33.Nm mdoc_meta , 34.Nm mdoc_node , 35.Nm mparse_alloc , 36.Nm mparse_free , 37.Nm mparse_getkeep , 38.Nm mparse_keep , 39.Nm mparse_readfd , 40.Nm mparse_reset , 41.Nm mparse_result , 42.Nm mparse_strerror , 43.Nm mparse_strlevel 44.Nd mandoc macro compiler library 45.Sh LIBRARY 46.Lb libmandoc 47.Sh SYNOPSIS 48.In man.h 49.In mdoc.h 50.In mandoc.h 51.Ft "enum mandoc_esc" 52.Fo mandoc_escape 53.Fa "const char **end" 54.Fa "const char **start" 55.Fa "int *sz" 56.Fc 57.Ft "const struct man_meta *" 58.Fo man_meta 59.Fa "const struct man *man" 60.Fc 61.Ft "const struct mparse *" 62.Fo man_mparse 63.Fa "const struct man *man" 64.Fc 65.Ft "const struct man_node *" 66.Fo man_node 67.Fa "const struct man *man" 68.Fc 69.Ft "struct mchars *" 
70.Fn mchars_alloc "void" 71.Ft void 72.Fn mchars_free "struct mchars *p" 73.Ft char 74.Fn mchars_num2char "const char *cp" "size_t sz" 75.Ft int 76.Fn mchars_num2uc "const char *cp" "size_t sz" 77.Ft "const char *" 78.Fo mchars_spec2str 79.Fa "const struct mchars *p" 80.Fa "const char *cp" 81.Fa "size_t sz" 82.Fa "size_t *rsz" 83.Fc 84.Ft int 85.Fo mchars_spec2cp 86.Fa "const struct mchars *p" 87.Fa "const char *cp" 88.Fa "size_t sz" 89.Fc 90.Ft "const struct mdoc_meta *" 91.Fo mdoc_meta 92.Fa "const struct mdoc *mdoc" 93.Fc 94.Ft "const struct mdoc_node *" 95.Fo mdoc_node 96.Fa "const struct mdoc *mdoc" 97.Fc 98.Ft "struct mparse *" 99.Fo mparse_alloc 100.Fa "enum mparset type" 101.Fa "enum mandoclevel wlevel" 102.Fa "mandocmsg msg" 103.Fa "void *msgarg" 104.Fc 105.Ft void 106.Fo mparse_free 107.Fa "struct mparse *parse" 108.Fc 109.Ft "const char *" 110.Fo mparse_getkeep 111.Fa "const struct mparse *parse" 112.Fc 113.Ft void 114.Fo mparse_keep 115.Fa "struct mparse *parse" 116.Fc 117.Ft "enum mandoclevel" 118.Fo mparse_readfd 119.Fa "struct mparse *parse" 120.Fa "int fd" 121.Fa "const char *fname" 122.Fc 123.Ft void 124.Fo mparse_reset 125.Fa "struct mparse *parse" 126.Fc 127.Ft void 128.Fo mparse_result 129.Fa "struct mparse *parse" 130.Fa "struct mdoc **mdoc" 131.Fa "struct man **man" 132.Fc 133.Ft "const char *" 134.Fo mparse_strerror 135.Fa "enum mandocerr" 136.Fc 137.Ft "const char *" 138.Fo mparse_strlevel 139.Fa "enum mandoclevel" 140.Fc 141.Vt extern const char * const * man_macronames; 142.Vt extern const char * const * mdoc_argnames; 143.Vt extern const char * const * mdoc_macronames; 144.Fd "#define ASCII_NBRSP" 145.Fd "#define ASCII_HYPH" 146.Sh DESCRIPTION 147The 148.Nm mandoc 149library parses a 150.Ux 151manual into an abstract syntax tree (AST). 152.Ux 153manuals are composed of 154.Xr mdoc 7 155or 156.Xr man 7 , 157and may be mixed with 158.Xr roff 7 , 159.Xr tbl 7 , 160and 161.Xr eqn 7 162invocations. 
163.Pp 164The following describes a general parse sequence: 165.Bl -enum 166.It 167initiate a parsing sequence with 168.Fn mparse_alloc ; 169.It 170parse files or file descriptors with 171.Fn mparse_readfd ; 172.It 173retrieve a parsed syntax tree, if the parse was successful, with 174.Fn mparse_result ; 175.It 176iterate over parse nodes with 177.Fn mdoc_node 178or 179.Fn man_node ; 180.It 181free all allocated memory with 182.Fn mparse_free , 183or invoke 184.Fn mparse_reset 185and parse new files. 186.El 187.Pp 188The 189.Nm 190library also contains routines for translating character strings into glyphs 191.Pq see Fn mchars_alloc 192and parsing escape sequences from strings 193.Pq see Fn mandoc_escape . 194.Sh REFERENCE 195This section documents the functions, types, and variables available 196via 197.In mandoc.h . 198.Ss Types 199.Bl -ohang 200.It Vt "enum mandoc_esc" 201An escape sequence classification. 202.It Vt "enum mandocerr" 203A fatal error, error, or warning message during parsing. 204.It Vt "enum mandoclevel" 205A classification of an 206.Vt "enum mandocerr" 207as regards system operation. 208.It Vt "struct mchars" 209An opaque pointer to an object allowing for translation between 210character strings and glyphs. 211See 212.Fn mchars_alloc . 213.It Vt "enum mparset" 214The type of parser when reading input. 215This should usually be 216.Dv MPARSE_AUTO 217for auto-detection. 218.It Vt "struct mparse" 219An opaque pointer to a running parse sequence. 220Created with 221.Fn mparse_alloc 222and freed with 223.Fn mparse_free . 224This may be used across parsed input if 225.Fn mparse_reset 226is called between parses. 227.It Vt "mandocmsg" 228A prototype for a function to handle fatal error, error, and warning 229messages emitted by the parser. 230.El 231.Ss Functions 232.Bl -ohang 233.It Fn mandoc_escape 234Scan an escape sequence, i.e., a character string beginning with 235.Sq \e . 
236Pass a pointer to the character after the 237.Sq \e 238as 239.Va end ; 240it will be set to the supremum of the parsed escape sequence unless 241returning 242.Dv ESCAPE_ERROR , 243in which case the string is bogus and should be 244thrown away. 245If not 246.Dv ESCAPE_ERROR 247or 248.Dv ESCAPE_IGNORE , 249.Va start 250is set to the first relevant character of the substring (font, glyph, 251whatever) of length 252.Va sz . 253Both 254.Va start 255and 256.Va sz 257may be 258.Dv NULL . 259Declared in 260.In mandoc.h , 261implemented in 262.Pa mandoc.c . 263.It Fn man_meta 264Obtain the meta-data of a successful parse. 265This may only be used on a pointer returned by 266.Fn mparse_result . 267Declared in 268.In man.h , 269implemented in 270.Pa man.c . 271.It Fn man_mparse 272Get the parser used for the current output. 273Declared in 274.In man.h , 275implemented in 276.Pa man.c . 277.It Fn man_node 278Obtain the root node of a successful parse. 279This may only be used on a pointer returned by 280.Fn mparse_result . 281Declared in 282.In man.h , 283implemented in 284.Pa man.c . 285.It Fn mchars_alloc 286Allocate a 287.Vt "struct mchars *" 288object for translating special characters into glyphs. 289See 290.Xr mandoc_char 7 291for an overview of special characters. 292The object must be freed with 293.Fn mchars_free . 294Declared in 295.In mandoc.h , 296implemented in 297.Pa chars.c . 298.It Fn mchars_free 299Free an object created with 300.Fn mchars_alloc . 301Declared in 302.In mandoc.h , 303implemented in 304.Pa chars.c . 305.It Fn mchars_num2char 306Convert a character index (e.g., the \eN\(aq\(aq escape) into a 307printable ASCII character. 308Returns \e0 (the nil character) if the input sequence is malformed. 309Declared in 310.In mandoc.h , 311implemented in 312.Pa chars.c . 313.It Fn mchars_num2uc 314Convert a hexadecimal character index (e.g., the \e[uNNNN] escape) into 315a Unicode codepoint. 
316Returns \e0 (the nil character) if the input sequence is malformed. 317Declared in 318.In mandoc.h , 319implemented in 320.Pa chars.c . 321.It Fn mchars_spec2cp 322Convert a special character into a valid Unicode codepoint. 323Returns \-1 on failure or a non-zero Unicode codepoint on success. 324Declared in 325.In mandoc.h , 326implemented in 327.Pa chars.c . 328.It Fn mchars_spec2str 329Convert a special character into an ASCII string. 330Returns 331.Dv NULL 332on failure. 333Declared in 334.In mandoc.h , 335implemented in 336.Pa chars.c . 337.It Fn mdoc_meta 338Obtain the meta-data of a successful parse. 339This may only be used on a pointer returned by 340.Fn mparse_result . 341Declared in 342.In mdoc.h , 343implemented in 344.Pa mdoc.c . 345.It Fn mdoc_node 346Obtain the root node of a successful parse. 347This may only be used on a pointer returned by 348.Fn mparse_result . 349Declared in 350.In mdoc.h , 351implemented in 352.Pa mdoc.c . 353.It Fn mparse_alloc 354Allocate a parser. 355The same parser may be used for multiple files so long as 356.Fn mparse_reset 357is called between parses. 358.Fn mparse_free 359must be called to free the memory allocated by this function. 360Declared in 361.In mandoc.h , 362implemented in 363.Pa read.c . 364.It Fn mparse_free 365Free all memory allocated by 366.Fn mparse_alloc . 367Declared in 368.In mandoc.h , 369implemented in 370.Pa read.c . 371.It Fn mparse_getkeep 372Acquire the keep buffer. 373Must follow a call of 374.Fn mparse_keep . 375Declared in 376.In mandoc.h , 377implemented in 378.Pa read.c . 379.It Fn mparse_keep 380Instruct the parser to retain a copy of its parsed input. 381This can be acquired with subsequent 382.Fn mparse_getkeep 383calls. 384Declared in 385.In mandoc.h , 386implemented in 387.Pa read.c . 388.It Fn mparse_readfd 389Parse a file or file descriptor. 390If 391.Va fd 392is -1, 393.Va fname 394is opened for reading. 
395Otherwise, 396.Va fname 397is assumed to be the name associated with 398.Va fd . 399This may be called multiple times with different parameters; however, 400.Fn mparse_reset 401should be invoked between parses. 402Declared in 403.In mandoc.h , 404implemented in 405.Pa read.c . 406.It Fn mparse_reset 407Reset a parser so that 408.Fn mparse_readfd 409may be used again. 410Declared in 411.In mandoc.h , 412implemented in 413.Pa read.c . 414.It Fn mparse_result 415Obtain the result of a parse. 416Only successful parses 417.Po 418i.e., those where 419.Fn mparse_readfd 420returned less than MANDOCLEVEL_FATAL 421.Pc 422should invoke this function, in which case one of the two pointers will 423be filled in. 424Declared in 425.In mandoc.h , 426implemented in 427.Pa read.c . 428.It Fn mparse_strerror 429Return a statically-allocated string representation of an error code. 430Declared in 431.In mandoc.h , 432implemented in 433.Pa read.c . 434.It Fn mparse_strlevel 435Return a statically-allocated string representation of a level code. 436Declared in 437.In mandoc.h , 438implemented in 439.Pa read.c . 440.El 441.Ss Variables 442.Bl -ohang 443.It Va man_macronames 444The string representation of a man macro as indexed by 445.Vt "enum mant" . 446.It Va mdoc_argnames 447The string representation of a mdoc macro argument as indexed by 448.Vt "enum mdocargt" . 449.It Va mdoc_macronames 450The string representation of a mdoc macro as indexed by 451.Vt "enum mdoct" . 452.El 453.Sh IMPLEMENTATION NOTES 454This section consists of structural documentation for 455.Xr mdoc 7 456and 457.Xr man 7 458syntax trees and strings. 459.Ss Man and Mdoc Strings 460Strings may be extracted from mdoc and man meta-data, or from text 461nodes (MDOC_TEXT and MAN_TEXT, respectively). 462These strings have special non-printing formatting cues embedded in the 463text itself, as well as 464.Xr roff 7 465escapes preserved from input. 
466Implementing systems will need to handle both situations to produce 467human-readable text. 468In general, strings may be assumed to consist of 7-bit ASCII characters. 469.Pp 470The following non-printing characters may be embedded in text strings: 471.Bl -tag -width Ds 472.It Dv ASCII_NBRSP 473A non-breaking space character. 474.It Dv ASCII_HYPH 475A soft hyphen. 476.El 477.Pp 478Escape characters are also passed verbatim into text strings. 479An escape character is a sequence of characters beginning with the 480backslash 481.Pq Sq \e . 482To construct human-readable text, these should be intercepted with 483.Fn mandoc_escape 484and converted with one of 485.Fn mchars_num2char , 486.Fn mchars_spec2str , 487and so on. 488.Ss Man Abstract Syntax Tree 489This AST is governed by the ontological rules dictated in 490.Xr man 7 491and derives its terminology accordingly. 492.Pp 493The AST is composed of 494.Vt struct man_node 495nodes with element, root and text types as declared by the 496.Va type 497field. 498Each node also provides its parse point (the 499.Va line , 500.Va sec , 501and 502.Va pos 503fields), its position in the tree (the 504.Va parent , 505.Va child , 506.Va next 507and 508.Va prev 509fields) and some type-specific data. 510.Pp 511The tree itself is arranged according to the following normal form, 512where capitalised non-terminals represent nodes. 513.Pp 514.Bl -tag -width "ELEMENTXX" -compact 515.It ROOT 516\(<- mnode+ 517.It mnode 518\(<- ELEMENT | TEXT | BLOCK 519.It BLOCK 520\(<- HEAD BODY 521.It HEAD 522\(<- mnode* 523.It BODY 524\(<- mnode* 525.It ELEMENT 526\(<- ELEMENT | TEXT* 527.It TEXT 528\(<- [[:ascii:]]* 529.El 530.Pp 531The only elements capable of nesting other elements are those with 532next-line scope as documented in 533.Xr man 7 . 534.Ss Mdoc Abstract Syntax Tree 535This AST is governed by the ontological 536rules dictated in 537.Xr mdoc 7 538and derives its terminology accordingly. 
539.Qq In-line 540elements described in 541.Xr mdoc 7 542are described simply as 543.Qq elements . 544.Pp 545The AST is composed of 546.Vt struct mdoc_node 547nodes with block, head, body, element, root and text types as declared 548by the 549.Va type 550field. 551Each node also provides its parse point (the 552.Va line , 553.Va sec , 554and 555.Va pos 556fields), its position in the tree (the 557.Va parent , 558.Va child , 559.Va nchild , 560.Va next 561and 562.Va prev 563fields) and some type-specific data, in particular, for nodes generated 564from macros, the generating macro in the 565.Va tok 566field. 567.Pp 568The tree itself is arranged according to the following normal form, 569where capitalised non-terminals represent nodes. 570.Pp 571.Bl -tag -width "ELEMENTXX" -compact 572.It ROOT 573\(<- mnode+ 574.It mnode 575\(<- BLOCK | ELEMENT | TEXT 576.It BLOCK 577\(<- HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]] 578.It ELEMENT 579\(<- TEXT* 580.It HEAD 581\(<- mnode* 582.It BODY 583\(<- mnode* [ENDBODY mnode*] 584.It TAIL 585\(<- mnode* 586.It TEXT 587\(<- [[:ascii:]]* 588.El 589.Pp 590Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of 591the BLOCK production: these refer to punctuation marks. 592Furthermore, although a TEXT node will generally have a non-zero-length 593string, in the specific case of 594.Sq \&.Bd \-literal , 595an empty line will produce a zero-length string. 596Multiple body parts are only found in invocations of 597.Sq \&Bl \-column , 598where a new body introduces a new phrase. 599.Pp 600The 601.Xr mdoc 7 602syntax tree accommodates broken block structures as well. 603The ENDBODY node is available to end the formatting associated 604with a given block before the physical end of that block. 605It has a non-null 606.Va end 607field, is of the BODY 608.Va type , 609has the same 610.Va tok 611as the BLOCK it is ending, and has a 612.Va pending 613field pointing to that BLOCK's BODY node. 
614It is an indirect child of that BODY node 615and has no children of its own. 616.Pp 617An ENDBODY node is generated when a block ends while one of its child 618blocks is still open, like in the following example: 619.Bd -literal -offset indent 620\&.Ao ao 621\&.Bo bo ac 622\&.Ac bc 623\&.Bc end 624.Ed 625.Pp 626This example results in the following block structure: 627.Bd -literal -offset indent 628BLOCK Ao 629 HEAD Ao 630 BODY Ao 631 TEXT ao 632 BLOCK Bo, pending -> Ao 633 HEAD Bo 634 BODY Bo 635 TEXT bo 636 TEXT ac 637 ENDBODY Ao, pending -> Ao 638 TEXT bc 639TEXT end 640.Ed 641.Pp 642Here, the formatting of the 643.Sq \&Ao 644block extends from TEXT ao to TEXT ac, 645while the formatting of the 646.Sq \&Bo 647block extends from TEXT bo to TEXT bc. 648It renders as follows in 649.Fl T Ns Cm ascii 650mode: 651.Pp 652.Dl <ao [bo ac> bc] end 653.Pp 654Support for badly-nested blocks is only provided for backward 655compatibility with some older 656.Xr mdoc 7 657implementations. 658Using badly-nested blocks is 659.Em strongly discouraged ; 660for example, the 661.Fl T Ns Cm html 662and 663.Fl T Ns Cm xhtml 664front-ends to 665.Xr mandoc 1 666are unable to render them in any meaningful way. 667Furthermore, behaviour when encountering badly-nested blocks is not 668consistent across troff implementations, especially when using multiple 669levels of badly-nested blocks. 670.Sh SEE ALSO 671.Xr mandoc 1 , 672.Xr eqn 7 , 673.Xr man 7 , 674.Xr mandoc_char 7 , 675.Xr mdoc 7 , 676.Xr roff 7 , 677.Xr tbl 7 678.Sh AUTHORS 679The 680.Nm 681library was written by 682.An Kristaps Dzonsons Aq Mt kristaps@bsd.lv . 683