1 #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
2 #define MUPDF_FITZ_STRUCTURED_TEXT_H
3 
4 #include "mupdf/fitz/system.h"
5 #include "mupdf/fitz/context.h"
6 #include "mupdf/fitz/geometry.h"
7 #include "mupdf/fitz/font.h"
8 #include "mupdf/fitz/image.h"
9 #include "mupdf/fitz/output.h"
10 #include "mupdf/fitz/device.h"
11 
/**
	Simple text layout (for use with annotation editing primarily).

	fz_layout_char describes one laid out character. Characters are
	chained together through 'next'.
*/
typedef struct fz_layout_char fz_layout_char;

struct fz_layout_char
{
	/* NOTE(review): x and w look like the horizontal position and
	the width of the character; units are not stated here -
	confirm. */
	float x;
	float w;

	/* Location in source text of character. */
	const char *p;

	/* Following character in the chain. */
	fz_layout_char *next;
};
21 
22 typedef struct fz_layout_line
23 {
24 	float x, y, h;
25 	const char *p; /* location in source text of start of line */
26 	fz_layout_char *text;
27 	struct fz_layout_line *next;
28 } fz_layout_line;
29 
30 typedef struct
31 {
32 	fz_pool *pool;
33 	fz_matrix matrix;
34 	fz_matrix inv_matrix;
35 	fz_layout_line *head, **tailp;
36 	fz_layout_char **text_tailp;
37 } fz_layout_block;
38 
39 /**
40 	Create a new layout block, with new allocation pool, zero
41 	matrices, and initialise linked pointers.
42 */
43 fz_layout_block *fz_new_layout(fz_context *ctx);
44 
45 /**
46 	Drop layout block. Free the pool, and linked blocks.
47 
48 	Never throws exceptions.
49 */
50 void fz_drop_layout(fz_context *ctx, fz_layout_block *block);
51 
52 /**
53 	Add a new line to the end of the layout block.
54 */
55 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p);
56 
57 /**
58 	Add a new char to the line at the end of the layout block.
59 */
60 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p);
61 
/**
	Text extraction device: Used for searching, format conversion
	etc.

	(In development - Subject to change in future versions)
*/

/* Forward declarations of the structured text node types. */
typedef struct fz_stext_block fz_stext_block;
typedef struct fz_stext_line fz_stext_line;
typedef struct fz_stext_char fz_stext_char;
71 
/**
	Structured text option flags.

	Every flag occupies its own bit.
*/
enum
{
	/* If set, ligatures are passed through to the application in
	their original form. If not set, ligatures are expanded into
	their constituent parts, e.g. the ligature ffi is expanded
	into three separate characters f, f and i. */
	FZ_STEXT_PRESERVE_LIGATURES = 1 << 0,

	/* If set, whitespace is passed through to the application in
	its original form. If not set, any type of horizontal
	whitespace (including horizontal tabs) is replaced with space
	characters of variable width. */
	FZ_STEXT_PRESERVE_WHITESPACE = 1 << 1,

	/* If set, images are stored in the structured text structure.
	The default is to ignore all images. */
	FZ_STEXT_PRESERVE_IMAGES = 1 << 2,

	/* If set, no attempt is made to add missing space characters
	where there are large gaps between characters. */
	FZ_STEXT_INHIBIT_SPACES = 1 << 3,

	/* If set, hyphens at the end of a line are removed and the
	lines are merged. */
	FZ_STEXT_DEHYPHENATE = 1 << 4,

	/* If set, spans on the same line are not merged. Each line
	is thus a span of text with the same font, colour, and
	size. */
	FZ_STEXT_PRESERVE_SPANS = 1 << 5
};
109 
110 /**
111 	A text page is a list of blocks, together with an overall
112 	bounding box.
113 */
114 typedef struct
115 {
116 	fz_pool *pool;
117 	fz_rect mediabox;
118 	fz_stext_block *first_block, *last_block;
119 } fz_stext_page;
120 
/**
	The kinds of block found on a text page: text or image.
*/
enum
{
	/* A block of text. */
	FZ_STEXT_BLOCK_TEXT = 0,

	/* An image. */
	FZ_STEXT_BLOCK_IMAGE = 1
};
126 
127 /**
128 	A text block is a list of lines of text (typically a paragraph),
129 	or an image.
130 */
131 struct fz_stext_block
132 {
133 	int type;
134 	fz_rect bbox;
135 	union {
136 		struct { fz_stext_line *first_line, *last_line; } t;
137 		struct { fz_matrix transform; fz_image *image; } i;
138 	} u;
139 	fz_stext_block *prev, *next;
140 };
141 
142 /**
143 	A text line is a list of characters that share a common baseline.
144 */
145 struct fz_stext_line
146 {
147 	int wmode; /* 0 for horizontal, 1 for vertical */
148 	fz_point dir; /* normalized direction of baseline */
149 	fz_rect bbox;
150 	fz_stext_char *first_char, *last_char;
151 	fz_stext_line *prev, *next;
152 };
153 
154 /**
155 	A text char is a unicode character, the style in which is
156 	appears, and the point at which it is positioned.
157 */
158 struct fz_stext_char
159 {
160 	int c;
161 	int color; /* sRGB hex color */
162 	fz_point origin;
163 	fz_quad quad;
164 	float size;
165 	fz_font *font;
166 	fz_stext_char *next;
167 };
168 
/*
	NOTE(review): declared here, defined elsewhere. Going by the
	name, presumably usage/help text for the stext options - confirm.
*/
extern const char *fz_stext_options_usage;
170 
171 /**
172 	Create an empty text page.
173 
174 	The text page is filled out by the text device to contain the
175 	blocks and lines of text on the page.
176 
177 	mediabox: optional mediabox information.
178 */
179 fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox);
180 void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
181 
182 /**
183 	Output structured text to a file in HTML (visual) format.
184 */
185 void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
186 void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
187 void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
188 
189 /**
190 	Output structured text to a file in XHTML (semantic) format.
191 */
192 void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
193 void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
194 void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
195 
196 /**
197 	Output structured text to a file in XML format.
198 */
199 void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
200 
201 /**
202 	Output structured text to a file in JSON format.
203 */
204 void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale);
205 
206 /**
207 	Output structured text to a file in plain-text UTF-8 format.
208 */
209 void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
210 
211 /**
212 	Search for occurrence of 'needle' in text page.
213 
214 	Return the number of hits and store hit quads in the passed in
215 	array.
216 
217 	NOTE: This is an experimental interface and subject to change
218 	without notice.
219 */
220 int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_quad *quads, int max_quads);
221 
222 /**
223 	Return a list of quads to highlight lines inside the selection
224 	points.
225 */
226 int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads);
227 
/*
	Selection modes.

	NOTE(review): presumably the values taken by the 'mode'
	argument of fz_snap_selection - confirm.
*/
enum
{
	FZ_SELECT_CHARS = 0,
	FZ_SELECT_WORDS = 1,
	FZ_SELECT_LINES = 2
};
234 
235 fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode);
236 
237 /**
238 	Return a newly allocated UTF-8 string with the text for a given
239 	selection.
240 
241 	crlf: If true, write "\r\n" style line endings (otherwise "\n"
242 	only).
243 */
244 char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
245 
246 /**
247 	Return a newly allocated UTF-8 string with the text for a given
248 	selection rectangle.
249 
250 	crlf: If true, write "\r\n" style line endings (otherwise "\n"
251 	only).
252 */
253 char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf);
254 
/**
	Options for creating structured text (used to configure the
	stext device).
*/
typedef struct
{
	/* NOTE(review): presumably a combination of the FZ_STEXT_*
	option flags - confirm against fz_new_stext_device. */
	int flags;
} fz_stext_options;
262 
263 /**
264 	Parse stext device options from a comma separated key-value
265 	string.
266 */
267 fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
268 
269 /**
270 	Create a device to extract the text on a page.
271 
272 	Gather the text on a page into blocks and lines.
273 
274 	The reading order is taken from the order the text is drawn in
275 	the source file, so may not be accurate.
276 
277 	page: The text page to which content should be added. This will
278 	usually be a newly created (empty) text page, but it can be one
279 	containing data already (for example when merging multiple
280 	pages, or watermarking).
281 
282 	options: Options to configure the stext device.
283 */
284 fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
285 
286 /**
287 	Create a device to OCR the text on the page.
288 
289 	Renders the page internally to a bitmap that is then OCRd. Text
290 	is then forwarded onto the target device.
291 
292 	target: The target device to receive the OCRd text.
293 
294 	ctm: The transform to apply to the mediabox to get the size for
295 	the rendered page image. Also used to calculate the resolution
296 	for the page image. In general, this will be the same as the CTM
297 	that you pass to fz_run_page (or fz_run_display_list) to feed
298 	this device.
299 
300 	mediabox: The mediabox (in points). Combined with the CTM to get
301 	the bounds of the pixmap used internally for the rendered page
302 	image.
303 
304 	with_list: If with_list is false, then all non-text operations
305 	are forwarded instantly to the target device. This results in
306 	the target device seeing all NON-text operations, followed by
307 	all the text operations (derived from OCR).
308 
309 	If with_list is true, then all the marking operations are
310 	collated into a display list which is then replayed to the
311 	target device at the end.
312 
313 	language: NULL (for "eng"), or a pointer to a string to describe
314 	the languages/scripts that should be used for OCR (e.g.
315 	"eng,ara").
316 */
317 fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language);
318 
319 #endif
320