1 #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H 2 #define MUPDF_FITZ_STRUCTURED_TEXT_H 3 4 #include "mupdf/fitz/system.h" 5 #include "mupdf/fitz/context.h" 6 #include "mupdf/fitz/geometry.h" 7 #include "mupdf/fitz/font.h" 8 #include "mupdf/fitz/image.h" 9 #include "mupdf/fitz/output.h" 10 #include "mupdf/fitz/device.h" 11 12 /** 13 Simple text layout (for use with annotation editing primarily). 14 */ 15 typedef struct fz_layout_char 16 { 17 float x, w; 18 const char *p; /* location in source text of character */ 19 struct fz_layout_char *next; 20 } fz_layout_char; 21 22 typedef struct fz_layout_line 23 { 24 float x, y, h; 25 const char *p; /* location in source text of start of line */ 26 fz_layout_char *text; 27 struct fz_layout_line *next; 28 } fz_layout_line; 29 30 typedef struct 31 { 32 fz_pool *pool; 33 fz_matrix matrix; 34 fz_matrix inv_matrix; 35 fz_layout_line *head, **tailp; 36 fz_layout_char **text_tailp; 37 } fz_layout_block; 38 39 /** 40 Create a new layout block, with new allocation pool, zero 41 matrices, and initialise linked pointers. 42 */ 43 fz_layout_block *fz_new_layout(fz_context *ctx); 44 45 /** 46 Drop layout block. Free the pool, and linked blocks. 47 48 Never throws exceptions. 49 */ 50 void fz_drop_layout(fz_context *ctx, fz_layout_block *block); 51 52 /** 53 Add a new line to the end of the layout block. 54 */ 55 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p); 56 57 /** 58 Add a new char to the line at the end of the layout block. 59 */ 60 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p); 61 62 /** 63 Text extraction device: Used for searching, format conversion etc. 64 65 (In development - Subject to change in future versions) 66 */ 67 68 typedef struct fz_stext_char fz_stext_char; 69 typedef struct fz_stext_line fz_stext_line; 70 typedef struct fz_stext_block fz_stext_block; 71 72 /** 73 FZ_STEXT_PRESERVE_LIGATURES: If this option is activated 74 ligatures are passed through to the application in their 75 original form. If this option is deactivated ligatures are 76 expanded into their constituent parts, e.g. the ligature ffi is 77 expanded into three separate characters f, f and i. 78 79 FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated 80 whitespace is passed through to the application in its original 81 form. If this option is deactivated any type of horizontal 82 whitespace (including horizontal tabs) will be replaced with 83 space characters of variable width. 84 85 FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images 86 will be stored in the structured text structure. The default is 87 to ignore all images. 88 89 FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try 90 to add missing space characters where there are large gaps 91 between characters. 92 93 FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the 94 end of a line will be removed and the lines will be merged. 95 96 FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line 97 will not be merged. Each line will thus be a span of text with the same 98 font, colour, and size. 99 */ 100 enum 101 { 102 FZ_STEXT_PRESERVE_LIGATURES = 1, 103 FZ_STEXT_PRESERVE_WHITESPACE = 2, 104 FZ_STEXT_PRESERVE_IMAGES = 4, 105 FZ_STEXT_INHIBIT_SPACES = 8, 106 FZ_STEXT_DEHYPHENATE = 16, 107 FZ_STEXT_PRESERVE_SPANS = 32, 108 }; 109 110 /** 111 A text page is a list of blocks, together with an overall 112 bounding box. 113 */ 114 typedef struct 115 { 116 fz_pool *pool; 117 fz_rect mediabox; 118 fz_stext_block *first_block, *last_block; 119 } fz_stext_page; 120 121 enum 122 { 123 FZ_STEXT_BLOCK_TEXT = 0, 124 FZ_STEXT_BLOCK_IMAGE = 1 125 }; 126 127 /** 128 A text block is a list of lines of text (typically a paragraph), 129 or an image. 130 */ 131 struct fz_stext_block 132 { 133 int type; 134 fz_rect bbox; 135 union { 136 struct { fz_stext_line *first_line, *last_line; } t; 137 struct { fz_matrix transform; fz_image *image; } i; 138 } u; 139 fz_stext_block *prev, *next; 140 }; 141 142 /** 143 A text line is a list of characters that share a common baseline. 144 */ 145 struct fz_stext_line 146 { 147 int wmode; /* 0 for horizontal, 1 for vertical */ 148 fz_point dir; /* normalized direction of baseline */ 149 fz_rect bbox; 150 fz_stext_char *first_char, *last_char; 151 fz_stext_line *prev, *next; 152 }; 153 154 /** 155 A text char is a unicode character, the style in which is 156 appears, and the point at which it is positioned. 157 */ 158 struct fz_stext_char 159 { 160 int c; 161 int color; /* sRGB hex color */ 162 fz_point origin; 163 fz_quad quad; 164 float size; 165 fz_font *font; 166 fz_stext_char *next; 167 }; 168 169 extern const char *fz_stext_options_usage; 170 171 /** 172 Create an empty text page. 173 174 The text page is filled out by the text device to contain the 175 blocks and lines of text on the page. 176 177 mediabox: optional mediabox information. 178 */ 179 fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox); 180 void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); 181 182 /** 183 Output structured text to a file in HTML (visual) format. 184 */ 185 void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); 186 void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out); 187 void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out); 188 189 /** 190 Output structured text to a file in XHTML (semantic) format. 191 */ 192 void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); 193 void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out); 194 void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out); 195 196 /** 197 Output structured text to a file in XML format. 198 */ 199 void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id); 200 201 /** 202 Output structured text to a file in JSON format. 203 */ 204 void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale); 205 206 /** 207 Output structured text to a file in plain-text UTF-8 format. 208 */ 209 void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page); 210 211 /** 212 Search for occurrence of 'needle' in text page. 213 214 Return the number of hits and store hit quads in the passed in 215 array. 216 217 NOTE: This is an experimental interface and subject to change 218 without notice. 219 */ 220 int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_quad *quads, int max_quads); 221 222 /** 223 Return a list of quads to highlight lines inside the selection 224 points. 225 */ 226 int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads); 227 228 enum 229 { 230 FZ_SELECT_CHARS, 231 FZ_SELECT_WORDS, 232 FZ_SELECT_LINES, 233 }; 234 235 fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode); 236 237 /** 238 Return a newly allocated UTF-8 string with the text for a given 239 selection. 240 241 crlf: If true, write "\r\n" style line endings (otherwise "\n" 242 only). 243 */ 244 char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf); 245 246 /** 247 Return a newly allocated UTF-8 string with the text for a given 248 selection rectangle. 249 250 crlf: If true, write "\r\n" style line endings (otherwise "\n" 251 only). 252 */ 253 char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf); 254 255 /** 256 Options for creating a pixmap and draw device. 257 */ 258 typedef struct 259 { 260 int flags; 261 } fz_stext_options; 262 263 /** 264 Parse stext device options from a comma separated key-value 265 string. 266 */ 267 fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string); 268 269 /** 270 Create a device to extract the text on a page. 271 272 Gather the text on a page into blocks and lines. 273 274 The reading order is taken from the order the text is drawn in 275 the source file, so may not be accurate. 276 277 page: The text page to which content should be added. This will 278 usually be a newly created (empty) text page, but it can be one 279 containing data already (for example when merging multiple 280 pages, or watermarking). 281 282 options: Options to configure the stext device. 283 */ 284 fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options); 285 286 /** 287 Create a device to OCR the text on the page. 288 289 Renders the page internally to a bitmap that is then OCRd. Text 290 is then forwarded onto the target device. 291 292 target: The target device to receive the OCRd text. 293 294 ctm: The transform to apply to the mediabox to get the size for 295 the rendered page image. Also used to calculate the resolution 296 for the page image. In general, this will be the same as the CTM 297 that you pass to fz_run_page (or fz_run_display_list) to feed 298 this device. 299 300 mediabox: The mediabox (in points). Combined with the CTM to get 301 the bounds of the pixmap used internally for the rendered page 302 image. 303 304 with_list: If with_list is false, then all non-text operations 305 are forwarded instantly to the target device. This results in 306 the target device seeing all NON-text operations, followed by 307 all the text operations (derived from OCR). 308 309 If with_list is true, then all the marking operations are 310 collated into a display list which is then replayed to the 311 target device at the end. 312 313 language: NULL (for "eng"), or a pointer to a string to describe 314 the languages/scripts that should be used for OCR (e.g. 315 "eng,ara"). 316 */ 317 fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language); 318 319 #endif 320