1 // Extracted from Bidi.cpp - version 26
2 
3 // Reference implementation for Unicode Bidirectional Algorithm
4 
5 // Bidi include file
6 #include "mupdf/fitz.h"
7 #include "bidi-imp.h"
8 
9 #include <assert.h>
10 
11 #ifndef TRUE
12 #define TRUE (1)
13 #endif
14 #ifndef FALSE
15 #define FALSE (0)
16 #endif
17 
18 /*------------------------------------------------------------------------
19 	File: Bidi.Cpp
20 
21 	Description
22 	-----------
23 
24 	Sample Implementation of the Unicode Bidirectional Algorithm as it
25 	was revised by Revision 5 of the Unicode Technical Report # 9
26 	(1999-8-17)
27 
28 	Verified for changes to the algorithm up through Unicode 5.2.0 (2009).
29 
30 	This implementation is organized into several passes, each implemen-
31 	ting one or more of the rules of the Unicode Bidi Algorithm. The
32 	resolution of Weak Types and of Neutrals each use a state table
33 	approach.
34 
35 	Both a printf based interface and a Windows DlgProc are provided for
36 	interactive testing.
37 
38 	A stress harness comparing this implementation (v24) to a Java based
39 	implementation was used by Doug Felt to verify that the two
40 	implementations produce identical results for all strings up to six
41 	bidi classes and stochastic strings up to length 20.
42 
43 	Version 26 was verified by the author against the Unicode 5.2.0
44 	file BidiTest.txt, which provides an exhaustive text of strings of
45 	length 4 or less, but covers some important cases where the language
46 	in UAX#9 had been clarified.
47 
48 	To see this code running in an actual Windows program,
49 	download the free Unibook utility from http://unicode.org/unibook
50 	The bidi demo is executed from the tools menu. It is build from
51 	this source file.
52 
53 	Build Notes
54 	-----------
55 
56 	To compile the sample implementation please set the #define
57 	directives above so the correct headers get included. Not all the
58 	files are needed for all purposes. For the commandline version
59 	only bidi.h and bidi.cpp are needed.
60 
61 	The Win32 version is provided as a dialog procedure. To use as a
62 	standalone program compile with the lbmain.cpp file. If all you
63 	need is the ability to run the code "as is", you can instead download
64 	the unibook utility from http://uincode.org/unibook/ which contains
65 	the bidi demo compiled from this source file.
66 
67 	This code uses an extension to C++ that gives variables declared in
68 	a for() statement function the same scope as the for() statement.
69 	If your compiler does not support this extension, you may need to
70 	move the declaration, e.g. int ich = 0; in front of the for statement.
71 
72 	Implementation Note
73 	-------------------
74 
75 	NOTE: The Unicode Bidirectional Algorithm removes all explicit
76 		formatting codes in rule X9, but states that this can be
77 		simulated by conformant implementations. This implementation
78 		attempts to demonstrate such a simulation
79 
80 		To demonstrate this, the current implementation does the
81 		following:
82 
83 		in resolveExplicit()
84 			- change LRE, LRO, RLE, RLO, PDF to BN
85 			- assign nested levels to BN
86 
87 		in resolveWeak and resolveNeutrals
88 			- assign L and R to BN's where they exist in place of
89 			  sor and eor by changing the last BN in front of a
90 			  level change to a strong type
91 			- skip over BN's for the purpose of determining actions
92 			- include BN in the count of deferred runs
93 				which will resolve some of them to EN, AN and N
94 
95 		in resolveWhiteSpace
96 			- set the level of any surviving BN to the base level,
97 				or the level of the preceding character
98 			- include LRE,LRO, RLE, RLO, PDF and BN in the count
99 			   whitespace to be reset
100 
101 		This will result in the same order for non-BN characters as
102 		if the BN characters had been removed.
103 
104 		The clean() function can be used to remove boundary marks for
105 		verification purposes.
106 
107 	Notation
108 	--------
109 	Pointer variables generally start with the letter p
110 	Counter variables generally start with the letter c
111 	Index variables generally start with the letter i
112 	Boolean variables generally start with the letter f
113 
114 	The enumerated bidirectional types have the same name as in the
115 	description for the Unicode Bidirectional Algorithm
116 
117 	Using this code outside a demo context
118 	--------------------------------------
119 
120 	The way the functions are broken down in this demo code is based
121 	on the needs of the demo to show the evolution in internal state
122 	as the algorithm proceeds. This obscures how the algorithm would
123 	be used in practice. These are the steps:
124 
125 	1. Allocate a pair of arrays large enough to hold bidi class
126 	   and calculated levels (one for each input character)
127 
128 	2. Provide your own function to assign directional types (bidi
129 	   classes) corresponding to each character in the input,
130 	   conflating ON, WS, S to N. Unlike the classify function in this
131 	   demo, the input would be actual Unicode characters.
132 
133 	3. Process the first paragraph by calling BidiParagraph. That
134 	   function changes B into BN and returns a length including the
135 	   paragraph separator. (The iteration over multiple paragraphs
136 	   is left as exercise for the reader).
137 
138 	4. Assign directional types again, but now assign specific types
139 	   to whitespace characters.
140 
141 	5. Instead of reordering the input in place it is often desirable
142 	   to calculate an array of offsets indicating the reordering.
143 	   To that end, allocate such an array here and use it instead
144 	   of the input array in the next step.
145 
146 	6. Resolve and reorder the lines by calling BidiLines. That
147 	   function 'breaks' lines on LS characters. Provide an optional
148 	   array of flags indicating the location of other line breaks as
149 	   needed.
150 
151 	Update History
152 	--------------
153 	Version 24 is the initial published and verified version of this
154 	reference implementation. Version 25 and its updates fix various
155 	minor issues with the scaffolding used for demonstrating the
156 	algorithm using pseudo-alphabets from the command line or dialog
157 	box. No changes to the implementation of the actual bidi algorithm
158 	are made in any of the minor updates to version 25. Version 26
159 	also makes no change to the actual algorithm but was verified
160 	against the official BidiTest.txt file for Unicode 5.2.0.
161 
162 	- updated pseudo-alphabet
163 
164 	- Last Revised 12-10-99 (25)
165 
166 	- enable demo mode for release builds - no other changes
167 
168 	- Last Revised 12-10-00 (25a)
169 
170 	- fix regression in pseudo alphabet use for Windows UI
171 
172 	- Last Revised 02-01-01 (25b)
173 
174 	- fixed a few comments, renamed a variable
175 
176 	- Last Revised 03-04-01 (25c)
177 
178 	- make base level settable, enable mirror by default,
179 	fix dialog size
180 
181 	- Last Revised 06-02-01 (25e)
182 
183 	- fixed some comments
184 
185 	- Last Revised 09-29-01 (25f)
186 
187 	- fixed classification for LS,RLM,LRM in pseudo alphabet,
188 	focus issues in UI, regression fix to commandline from 25(e)
189 	fix DEMO switch
190 
191 	- Last Revised 11-07-01 (25g)
192 
193 	- fixed classification for plus/minus in pseudo alphabet
194 	to track changes made in Unicode 4.0.1
195 
196 	- Last Revised 12-03-04 (25h)
197 
198 	- now compiles as dialog-only program for WINDOWS_UI==1
199 	using new bidimain.cpp
200 
201 	- Last Revised 12-02-07 (25i)
202 
203 	- cleaned up whitespace and indenting in the source,
204 	fixed two comments (table headers)
205 
206 	- Last Revised 15-03-07 (25j)
207 
208 	- named enumerations
209 
210 	- Last Revised 30-05-07 (25k)
211 
212 	- added usage notes, minor edits to comments, indentation, etc
213 	throughout. Added the bidiParagraph function. Checked against
214 	changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No
215 	changes needed to this implementation to match the values in
216 	the BidiTest.txt file in the Unicode Character Database.
217 	Minor fixes to dialog/windows proc, updated preprocessor directives.
218 
219 	- Last Revised 03-08-09 (26)
220 
221 	Credits:
222 	-------
223 	Written by: Asmus Freytag
224 	Command line interface by: Rick McGowan
225 	Verification (v24): Doug Felt
226 
227 	Disclaimer and legal rights:
228 	---------------------------
229 	Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved.
230 	Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
231 
232 	THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
233 	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
234 	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
235 	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
236 	BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
237 	OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
238 	WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
239 	ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.
240 
241 	The file bid.rc is included in the software covered by the above.
242 ------------------------------------------------------------------------*/
243 
244 /* === HELPER FUNCTIONS AND DECLARATIONS ================================= */
245 
246 #define odd(x) ((x) & 1)
247 
248 /*----------------------------------------------------------------------
249 	The following array maps character codes to types for the purpose of
250 	this sample implementation. The legend string gives a human readable
251 	explanation of the pseudo alphabet.
252 
253 	For simplicity, characters entered by buttons are given a 1:1 mapping
254 	between their type and pseudo character value. Pseudo characters that
255 	can be typed from the keyboard are explained in the legend string.
256 
257 	Use the Unicode Character Database for the real values in real use.
258  ---------------------------------------------------------------------*/
259 
260 enum
261 {
262 	chLS = 0x15
263 };
264 
265 #if 0
266 static const fz_bidi_chartype types_from_char[] =
267 {
268 // 0	   1	   2	   3	   4	   5	   6	   7	   8	   9	   a	   b	   c	   d	   e	   f
269 BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L,  BDI_R,  BDI_BN, BDI_BN, BDI_BN, BDI_S,  BDI_B,  BDI_S,  BDI_WS, BDI_B,  BDI_BN, BDI_BN, /*00-0f*/
270 BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L,  BDI_R,  BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B,  BDI_B,  BDI_B,  BDI_S,  /*10-1f*/
271 BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/
272 BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_AN, BDI_AN, BDI_AN, BDI_AN, BDI_CS, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, /*30-3f*/
273 BDI_ON, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  /*40-4f*/
274 BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_R,  BDI_LRE,BDI_ON, BDI_RLE,BDI_PDF,BDI_S,  /*50-5f*/
275 BDI_NSM,BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  /*60-6f*/
276 BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_L,  BDI_LRO,BDI_B,  BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/
277 };
278 #endif
279 
280 /***************************************
281 	Reverse, human readable reference:
282 
283 	LRM:	0x4
284 	RLM:	0x5
285 	  L:	0x16,a-z
286 	LRE:	0x11,[
287 	LRO:	0x10,{
288 	  R:	0x17,G-Z
289 	 AL:	A-F
290 	RLE:	0x14,]
291 	RLO:	0x13,}
292 	PDF:	0x12,^
293 	 EN:	0-5
294 	 ES:	/,+,[hyphen]
295 	 ET:	#,$,%
296 	 AN:	6-9
297 	 CS:	[comma],.,:
298 	NSM:	`
299 	 BN:	0x0-0x8,0xe,0xf,0x18-0x1b,~
300 	  B:	0xa,0xd,0x1c-0x1e,|
301 	  S:	0x9,0xb,0x1f,_
302 	 WS:	0xc,0x15,[space]
303 	 ON:	!,",&,',(,),*,;,<,=,>,?,@,\,0x7f
304 ****************************************/
305 
306 // === HELPER FUNCTIONS ================================================
307 
308 #ifdef BIDI_LINE_AT_A_TIME
309 // reverse cch characters
310 static
reverse(uint32_t * psz,int cch)311 void reverse(uint32_t *psz, int cch)
312 {
313 	uint32_t ch_temp;
314 	int ich;
315 
316 	for (ich = 0; ich < --cch; ich++)
317 	{
318 		ch_temp = psz[ich];
319 		psz[ich] = psz[cch];
320 		psz[cch] = ch_temp;
321 	}
322 }
323 #endif
324 
325 // Set a run of cval values at locations all prior to, but not including
326 // iStart, to the new value nval.
327 static
set_deferred_run(fz_bidi_chartype * pval,size_t cval,size_t iStart,fz_bidi_chartype nval)328 void set_deferred_run(fz_bidi_chartype *pval, size_t cval, size_t iStart, fz_bidi_chartype nval)
329 {
330 	size_t i;
331 
332 	for (i = iStart; i > iStart - cval; )
333 	{
334 		pval[--i] = nval;
335 	}
336 }
337 
338 static
set_deferred_level_run(fz_bidi_level * pval,size_t cval,size_t iStart,fz_bidi_level nval)339 void set_deferred_level_run(fz_bidi_level *pval, size_t cval, size_t iStart, fz_bidi_level nval)
340 {
341 	size_t i;
342 
343 	for (i = iStart; i > iStart - cval; )
344 	{
345 		pval[--i] = nval;
346 	}
347 }
348 
349 // === ASSIGNING BIDI CLASSES ============================================
350 
351 // === THE PARAGRAPH LEVEL ===============================================
352 
353 /*------------------------------------------------------------------------
354 	Function: resolve_paragraphs
355 
356 	Resolves the input strings into blocks over which the algorithm
357 	is then applied.
358 
359 	Implements Rule P1 of the Unicode Bidi Algorithm
360 
361 	Input: Text string
362 		   Character count
363 
364 	Output: revised character count
365 
366 	Note:	This is a very simplistic function. In effect it restricts
367 			the action of the algorithm to the first paragraph in the input
368 			where a paragraph ends at the end of the first block separator
369 			or at the end of the input text.
370 
371 ------------------------------------------------------------------------*/
fz_bidi_resolve_paragraphs(fz_bidi_chartype * types,size_t cch)372 size_t fz_bidi_resolve_paragraphs(fz_bidi_chartype *types, size_t cch)
373 {
374 	size_t ich;
375 
376 	// skip characters not of type B
377 	for(ich = 0; ich < cch && types[ich] != BDI_B; ich++)
378 		;
379 	// stop after first B, make it a BN for use in the next steps
380 	if (ich < cch && types[ich] == BDI_B)
381 		types[ich++] = BDI_BN;
382 	return ich;
383 }
384 
385 #if 0
386 /*------------------------------------------------------------------------
387 	Function: base_level
388 
389 	Determines the base level
390 
391 	Implements rule P2 of the Unicode Bidi Algorithm.
392 
393 	Input: Array of directional classes
394 		   Character count
395 
396 	Note: Ignores explicit embeddings
397 ------------------------------------------------------------------------*/
398 static int base_level(const fz_bidi_chartype *pcls, int cch)
399 {
400 	int ich;
401 
402 	for (ich = 0; ich < cch; ich++)
403 	{
404 		switch (pcls[ich])
405 		{
406 		// strong left
407 		case BDI_L:
408 			return 0;
409 
410 		// strong right
411 		case BDI_R:
412 		case BDI_AL:
413 			return 1;
414 		}
415 	}
416 	return 0;
417 }
418 #endif
419 
420 //====== RESOLVE EXPLICIT ================================================
421 
greater_even(fz_bidi_level i)422 static fz_bidi_level greater_even(fz_bidi_level i)
423 {
424 	return odd(i) ? i + 1 : i + 2;
425 }
426 
greater_odd(fz_bidi_level i)427 static fz_bidi_level greater_odd(fz_bidi_level i)
428 {
429 	return odd(i) ? i + 2 : i + 1;
430 }
431 
embedding_direction(fz_bidi_chartype level)432 static fz_bidi_chartype embedding_direction(fz_bidi_chartype level)
433 {
434 	return odd(level) ? BDI_R : BDI_L;
435 }
436 
437 /*------------------------------------------------------------------------
438 	Function: resolveExplicit
439 
440 	Recursively resolves explicit embedding levels and overrides.
441 	Implements rules X1-X9, of the Unicode Bidirectional Algorithm.
442 
443 	Input: Base embedding level and direction
444 		   Character count
445 
446 	Output: Array of embedding levels
447 		  Caller must allocate (one level per input character)
448 
449 	In/Out: Array of direction classes
450 
451 	Note: The function uses two simple counters to keep track of
452 		  matching explicit codes and PDF. Use the default argument for
453 		  the outermost call. The nesting counter counts the recursion
454 		  depth and not the embedding level.
455 ------------------------------------------------------------------------*/
fz_bidi_resolve_explicit(fz_bidi_level level,fz_bidi_chartype dir,fz_bidi_chartype * pcls,fz_bidi_level * plevel,size_t cch,fz_bidi_level n_nest)456 size_t fz_bidi_resolve_explicit(fz_bidi_level level, fz_bidi_chartype dir, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch,
457 				fz_bidi_level n_nest)
458 {
459 	size_t ich;
460 
461 	// always called with a valid nesting level
462 	// nesting levels are != embedding levels
463 	int nLastValid = n_nest;
464 
465 	// check input values
466 	assert(n_nest >= 0 && level >= 0 && level <= BIDI_LEVEL_MAX);
467 
468 	// process the text
469 	for (ich = 0; ich < cch; ich++)
470 	{
471 		fz_bidi_chartype cls = pcls[ich];
472 		switch (cls)
473 		{
474 		case BDI_LRO:
475 		case BDI_LRE:
476 			n_nest++;
477 			if (greater_even(level) <= BIDI_LEVEL_MAX)
478 			{
479 				plevel[ich] = greater_even(level);
480 				pcls[ich] = BDI_BN;
481 				ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L),
482 							&pcls[ich+1], &plevel[ich+1],
483 							 cch - (ich+1), n_nest);
484 				n_nest--;
485 				continue;
486 			}
487 			cls = pcls[ich] = BDI_BN;
488 			break;
489 
490 		case BDI_RLO:
491 		case BDI_RLE:
492 			n_nest++;
493 			if (greater_odd(level) <= BIDI_LEVEL_MAX)
494 			{
495 				plevel[ich] = greater_odd(level);
496 				pcls[ich] = BDI_BN;
497 				ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R),
498 								&pcls[ich+1], &plevel[ich+1],
499 								 cch - (ich+1), n_nest);
500 				n_nest--;
501 				continue;
502 			}
503 			cls = pcls[ich] = BDI_BN;
504 			break;
505 
506 		case BDI_PDF:
507 			cls = pcls[ich] = BDI_BN;
508 			if (n_nest)
509 			{
510 				if (nLastValid < n_nest)
511 				{
512 					n_nest--;
513 				}
514 				else
515 				{
516 					cch = ich; // break the loop, but complete body
517 				}
518 			}
519 			break;
520 		}
521 
522 		// Apply the override
523 		if (dir != BDI_N)
524 		{
525 			cls = dir;
526 		}
527 		plevel[ich] = level;
528 		if (pcls[ich] != BDI_BN)
529 			pcls[ich] = cls;
530 	}
531 
532 	return ich;
533 }
534 
535 // === RESOLVE WEAK TYPES ================================================
536 
537 enum bidi_state // possible states
538 {
539 	xa,	//	arabic letter
540 	xr,	//	right letter
541 	xl,	//	left letter
542 
543 	ao,	//	arabic lett. foll by ON
544 	ro,	//	right lett. foll by ON
545 	lo,	//	left lett. foll by ON
546 
547 	rt,	//	ET following R
548 	lt,	//	ET following L
549 
550 	cn,	//	EN, AN following AL
551 	ra,	//	arabic number foll R
552 	re,	//	european number foll R
553 	la,	//	arabic number foll L
554 	le,	//	european number foll L
555 
556 	ac,	//	CS following cn
557 	rc,	//	CS following ra
558 	rs,	//	CS,ES following re
559 	lc,	//	CS following la
560 	ls,	//	CS,ES following le
561 
562 	ret,	//	ET following re
563 	let	//	ET following le
564 } ;
565 
566 const unsigned char state_weak[][10] =
567 {
568 	//	N,  L,  R,  AN, EN, AL,NSM, CS, ES, ET,
569 /*xa*/  { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter		  */
570 /*xr*/  { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter		   */
571 /*xl*/  { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter			  */
572 
573 /*ao*/  { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/
574 /*ro*/  { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */
575 /*lo*/  { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON  */
576 
577 /*rt*/  { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R		  */
578 /*lt*/  { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L		  */
579 
580 /*cn*/  { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL	  */
581 /*ra*/  { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R   */
582 /*re*/  { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */
583 /*la*/  { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L   */
584 /*le*/  { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */
585 
586 /*ac*/  { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn		  */
587 /*rc*/  { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra		  */
588 /*rs*/  { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re	  */
589 /*lc*/  { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la		  */
590 /*ls*/  { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le	  */
591 
592 /*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re		  */
593 /*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let }  /* ET following le		  */
594 
595 };
596 
597 enum bidi_action // possible actions
598 {
599 	// primitives
600 	IX = 0x100,				  // increment
601 	XX = 0xF,					// no-op
602 
603 	// actions
604 	xxx = (XX << 4) + XX,		// no-op
605 	xIx = IX + xxx,			// increment run
606 	xxN = (XX << 4) + BDI_ON,	// set current to N
607 	xxE = (XX << 4) + BDI_EN,	// set current to EN
608 	xxA = (XX << 4) + BDI_AN,	// set current to AN
609 	xxR = (XX << 4) + BDI_R,	// set current to R
610 	xxL = (XX << 4) + BDI_L,	// set current to L
611 	Nxx = (BDI_ON << 4) + 0xF,	// set run to neutral
612 	Axx = (BDI_AN << 4) + 0xF,	// set run to AN
613 	ExE = (BDI_EN << 4) + BDI_EN,	// set run to EN, set current to EN
614 	NIx = (BDI_ON << 4) + 0xF + IX,	// set run to N, increment
615 	NxN = (BDI_ON << 4) + BDI_ON,	// set run to N, set current to N
616 	NxR = (BDI_ON << 4) + BDI_R,	// set run to N, set current to R
617 	NxE = (BDI_ON << 4) + BDI_EN,	// set run to N, set current to EN
618 
619 	AxA = (BDI_AN << 4) + BDI_AN,	// set run to AN, set current to AN
620 	NxL = (BDI_ON << 4) + BDI_L,	// set run to N, set current to L
621 	LxL = (BDI_L << 4) + BDI_L	// set run to L, set current to L
622 };
623 
624 typedef uint16_t fz_bidi_action;
625 
626 const fz_bidi_action action_weak[][10] =
627 {
628 	//   N,.. L,   R,  AN,  EN,  AL, NSM,  CS,..ES,  ET,
629 /*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter			*/
630 /*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right letter			 */
631 /*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter			 */
632 
633 /*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON	*/
634 /*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON	*/
635 /*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON	*/
636 
637 /*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R			*/
638 /*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L			*/
639 
640 /*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following  AL	*/
641 /*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R	*/
642 /*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R	*/
643 /*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L	*/
644 /*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L	*/
645 
646 /*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn		 */
647 /*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra		 */
648 /*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re		*/
649 /*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la		 */
650 /*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le		*/
651 
652 /*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re			*/
653 /*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL }  /* ET following le			*/
654 };
655 
656 static
get_deferred_type(fz_bidi_action action)657 fz_bidi_chartype get_deferred_type(fz_bidi_action action)
658 {
659 	return (action >> 4) & 0xF;
660 }
661 
662 static
get_resolved_type(fz_bidi_action action)663 fz_bidi_chartype get_resolved_type(fz_bidi_action action)
664 {
665 	return action & 0xF;
666 }
667 
668 /* Note on action table:
669 
670 	States can be of two kinds:
671 	 - Immediate Resolution State, where each input token
672 	   is resolved as soon as it is seen. These states have
673 	   only single action codes (xxN) or the no-op (xxx)
674 	   for static input tokens.
675 	 - Deferred Resolution State, where input tokens either
676 	   either extend the run (xIx) or resolve its Type (e.g. Nxx).
677 
678 	Input classes are of three kinds
679 	 - Static Input Token, where the class of the token remains
680 	   unchanged on output (AN, L, N, R)
681 	 - Replaced Input Token, where the class of the token is
682 	   always replaced on output (AL, BDI_BN, NSM, CS, ES, ET)
683 	 - Conditional Input Token, where the class of the token is
684 	   changed on output in some but not all cases (EN)
685 
686 	 Where tokens are subject to change, a double action
687 	 (e.g. NxA, or NxN) is _required_ after deferred states,
688 	 resolving both the deferred state and changing the current token.
689 
690 	These properties of the table are verified by assertions below.
691 	This code is needed only during debugging and maintenance
692 */
693 
694 /*------------------------------------------------------------------------
695 	Function: resolveWeak
696 
697 	Resolves the directionality of numeric and other weak character types
698 
699 	Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm.
700 
701 	Input: Array of embedding levels
702 		   Character count
703 
704 	In/Out: Array of directional classes
705 
706 	Note: On input only these directional classes are expected
707 		  AL, HL, R, L,  ON, BDI_BN, NSM, AN, EN, ES, ET, CS,
708 ------------------------------------------------------------------------*/
fz_bidi_resolve_weak(fz_context * ctx,fz_bidi_level baselevel,fz_bidi_chartype * pcls,fz_bidi_level * plevel,size_t cch)709 void fz_bidi_resolve_weak(fz_context *ctx, fz_bidi_level baselevel, fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
710 {
711 	int state = odd(baselevel) ? xr : xl;
712 	fz_bidi_chartype cls;
713 	size_t ich;
714 	fz_bidi_action action;
715 	fz_bidi_chartype cls_run;
716 	fz_bidi_chartype cls_new;
717 
718 	fz_bidi_level level = baselevel;
719 
720 	size_t cch_run = 0;
721 
722 	for (ich = 0; ich < cch; ich++)
723 	{
724 		if (pcls[ich] > BDI_BN) {
725 			fz_warn(ctx, "error: pcls[%zu] > BN (%d)\n", ich, pcls[ich]);
726 		}
727 
728 		// ignore boundary neutrals
729 		if (pcls[ich] == BDI_BN)
730 		{
731 			// must flatten levels unless at a level change;
732 			plevel[ich] = level;
733 
734 			// lookahead for level changes
735 			if (ich + 1 == cch && level != baselevel)
736 			{
737 				// have to fixup last BN before end of the loop, since
738 				// its fix-upped value will be needed below the assert
739 				pcls[ich] = embedding_direction(level);
740 			}
741 			else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN)
742 			{
743 				// fixup LAST BN in front / after a level run to make
744 				// it act like the SOR/EOR in rule X10
745 				int newlevel = plevel[ich+1];
746 				if (level > newlevel) {
747 					newlevel = level;
748 				}
749 				plevel[ich] = newlevel;
750 
751 				// must match assigned level
752 				pcls[ich] = embedding_direction(newlevel);
753 				level = plevel[ich+1];
754 			}
755 			else
756 			{
757 				// don't interrupt runs
758 				if (cch_run)
759 				{
760 					cch_run++;
761 				}
762 				continue;
763 			}
764 		}
765 
766 		assert(pcls[ich] <= BDI_BN);
767 		cls = pcls[ich];
768 
769 		action = action_weak[state][cls];
770 
771 		// resolve the directionality for deferred runs
772 		cls_run = get_deferred_type(action);
773 		if (cls_run != XX)
774 		{
775 			set_deferred_run(pcls, cch_run, ich, cls_run);
776 			cch_run = 0;
777 		}
778 
779 		// resolve the directionality class at the current location
780 		cls_new = get_resolved_type(action);
781 		if (cls_new != XX)
782 			pcls[ich] = cls_new;
783 
784 		// increment a deferred run
785 		if (IX & action)
786 			cch_run++;
787 
788 		state = state_weak[state][cls];
789 	}
790 
791 	// resolve any deferred runs
792 	// use the direction of the current level to emulate PDF
793 	cls = embedding_direction(level);
794 
795 	// resolve the directionality for deferred runs
796 	cls_run = get_deferred_type(action_weak[state][cls]);
797 	if (cls_run != XX)
798 		set_deferred_run(pcls, cch_run, ich, cls_run);
799 }
800 
801 // === RESOLVE NEUTRAL TYPES ==============================================
802 
803 // action values
804 enum neutral_action
805 {
806 	// action to resolve previous input
807 	nL = BDI_L,		// resolve EN to L
808 	En = 3 << 4,		// resolve neutrals run to embedding level direction
809 	Rn = BDI_R << 4,	// resolve neutrals run to strong right
810 	Ln = BDI_L << 4,	// resolved neutrals run to strong left
811 	In = (1<<8),		// increment count of deferred neutrals
812 	LnL = (1<<4)+BDI_L	// set run and EN to L
813 };
814 
815 static fz_bidi_chartype
get_deferred_neutrals(fz_bidi_action action,fz_bidi_level level)816 get_deferred_neutrals(fz_bidi_action action, fz_bidi_level level)
817 {
818 	action = (action >> 4) & 0xF;
819 	if (action == (En >> 4))
820 		return embedding_direction(level);
821 	else
822 		return action;
823 }
824 
get_resolved_neutrals(fz_bidi_action action)825 static fz_bidi_chartype get_resolved_neutrals(fz_bidi_action action)
826 {
827 	action = action & 0xF;
828 	if (action == In)
829 		return 0;
830 	else
831 		return action;
832 }
833 
834 // state values
835 enum neutral_state
836 {
837 	// new temporary class
838 	r,	// R and characters resolved to R
839 	l,	// L and characters resolved to L
840 	rn,	// N preceded by right
841 	ln,	// N preceded by left
842 	a,	// AN preceded by left (the abbrev 'la' is used up above)
843 	na	// N preceded by a
844 } ;
845 
846 /*------------------------------------------------------------------------
847 	Notes:
848 
849 	By rule W7, whenever a EN is 'dominated' by an L (including start of
850 	run with embedding direction = L) it is resolved to, and further treated
851 	as L.
852 
853 	This leads to the need for 'a' and 'na' states.
854 ------------------------------------------------------------------------*/
855 
856 const int action_neutrals[][5] =
857 {
858 //	N,	L,	R, AN, EN, = cls
859 					// state =
860 	{In,  0,  0,  0,  0},		// r	right
861 	{In,  0,  0,  0,  BDI_L},	// l	left
862 
863 	{In, En, Rn, Rn, Rn},		// rn	N preceded by right
864 	{In, Ln, En, En, LnL},		// ln	N preceded by left
865 
866 	{In,  0,  0,  0,  BDI_L},	// a   AN preceded by left
867 	{In, En, Rn, Rn, En}		// na	N  preceded by a
868 } ;
869 
870 const int state_neutrals[][5] =
871 {
872 //	 N, L,	R,	AN, EN = cls
873 					// state =
874 	{rn, l,	r,	r,	r},	// r   right
875 	{ln, l,	r,	a,	l},	// l   left
876 
877 	{rn, l,	r,	r,	r},	// rn  N preceded by right
878 	{ln, l,	r,	a,	l},	// ln  N preceded by left
879 
880 	{na, l,	r,	a,	l},	// a  AN preceded by left
881 	{na, l,	r,	a,	l}	// na  N preceded by la
882 } ;
883 
884 /*------------------------------------------------------------------------
885 	Function: resolveNeutrals
886 
887 	Resolves the directionality of neutral character types.
888 
889 	Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm.
890 
891 	Input: Array of embedding levels
892 		   Character count
893 		   Baselevel
894 
895 	In/Out: Array of directional classes
896 
897 	Note: On input only these directional classes are expected
898 		  R,  L,  N, AN, EN and BN
899 
900 		  W8 resolves a number of ENs to L
901 ------------------------------------------------------------------------*/
fz_bidi_resolve_neutrals(fz_bidi_level baselevel,fz_bidi_chartype * pcls,const fz_bidi_level * plevel,size_t cch)902 void fz_bidi_resolve_neutrals(fz_bidi_level baselevel, fz_bidi_chartype *pcls, const fz_bidi_level *plevel, size_t cch)
903 {
904 	// the state at the start of text depends on the base level
905 	int state = odd(baselevel) ? r : l;
906 	fz_bidi_chartype cls;
907 	size_t ich;
908 	fz_bidi_chartype cls_run;
909 
910 	size_t cch_run = 0;
911 	fz_bidi_level level = baselevel;
912 
913 	for (ich = 0; ich < cch; ich++)
914 	{
915 		int action;
916 		fz_bidi_chartype cls_new;
917 
918 		// ignore boundary neutrals
919 		if (pcls[ich] == BDI_BN)
920 		{
921 			// include in the count for a deferred run
922 			if (cch_run)
923 				cch_run++;
924 
925 			// skip any further processing
926 			continue;
927 		}
928 
929 		assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed"
930 		cls = pcls[ich];
931 
932 		action = action_neutrals[state][cls];
933 
934 		// resolve the directionality for deferred runs
935 		cls_run = get_deferred_neutrals(action, level);
936 		if (cls_run != BDI_N)
937 		{
938 			set_deferred_run(pcls, cch_run, ich, cls_run);
939 			cch_run = 0;
940 		}
941 
942 		// resolve the directionality class at the current location
943 		cls_new = get_resolved_neutrals(action);
944 		if (cls_new != BDI_N)
945 			pcls[ich] = cls_new;
946 
947 		if (In & action)
948 			cch_run++;
949 
950 		state = state_neutrals[state][cls];
951 		level = plevel[ich];
952 	}
953 
954 	// resolve any deferred runs
955 	cls = embedding_direction(level);	// eor has type of current level
956 
957 	// resolve the directionality for deferred runs
958 	cls_run = get_deferred_neutrals(action_neutrals[state][cls], level);
959 	if (cls_run != BDI_N)
960 		set_deferred_run(pcls, cch_run, ich, cls_run);
961 }
962 
963 // === RESOLVE IMPLICITLY =================================================
964 
965 /*------------------------------------------------------------------------
966 	Function: resolveImplicit
967 
968 	Recursively resolves implicit embedding levels.
969 	Implements rules I1 and I2 of the Unicode Bidirectional Algorithm.
970 
971 	Input: Array of direction classes
972 		   Character count
973 		   Base level
974 
975 	In/Out: Array of embedding levels
976 
977 	Note: levels may exceed 15 on output.
978 		  Accepted subset of direction classes
979 		  R, L, AN, EN
980 ------------------------------------------------------------------------*/
981 const fz_bidi_level add_level[][4] =
982 {
983 	// L,  R,	AN, EN = cls
984 					// level =
985 /* even */ { 0,	1,	2,	2 },	// EVEN
986 /* odd	*/ { 1,	0,	1,	1 }	// ODD
987 
988 };
989 
fz_bidi_resolve_implicit(const fz_bidi_chartype * pcls,fz_bidi_level * plevel,size_t cch)990 void fz_bidi_resolve_implicit(const fz_bidi_chartype *pcls, fz_bidi_level *plevel, size_t cch)
991 {
992 	size_t ich;
993 
994 	for (ich = 0; ich < cch; ich++)
995 	{
996 		// cannot resolve bn here, since some bn were resolved to strong
997 		// types in resolveWeak. To remove these we need the original
998 		// types, which are available again in resolveWhiteSpace
999 		if (pcls[ich] == BDI_BN)
1000 		{
1001 			continue;
1002 		}
1003 		assert(pcls[ich] > 0); // "No Neutrals allowed to survive here."
1004 		assert(pcls[ich] < 5); // "Out of range."
1005 		plevel[ich] += add_level[odd(plevel[ich])][pcls[ich] - 1];
1006 	}
1007 }
1008 
1009 #if 0
1010 // === REORDER ===========================================================
1011 /*------------------------------------------------------------------------
1012 	Function: resolve_lines
1013 
1014 	Breaks a paragraph into lines
1015 
1016 	Input:	Character count
1017 			Array of line break flags
1018 	In/Out:	Array of characters
1019 
1020 	Returns the count of characters on the first line
1021 
1022 	Note: This function only breaks lines at hard line breaks. Other
1023 	line breaks can be passed in. If pbrk[n] is true, then a break
1024 	occurs after the character in psz_input[n]. Breaks before the first
1025 	character are not allowed.
1026 ------------------------------------------------------------------------*/
1027 static int resolve_lines(uint32_t *psz_input, int *pbrk, int cch)
1028 {
1029 	int ich;
1030 
1031 	// skip characters not of type LS
1032 	for(ich = 0; ich < cch; ich++)
1033 	{
1034 		if (psz_input[ich] == chLS || (pbrk && pbrk[ich]))
1035 		{
1036 			ich++;
1037 			break;
1038 		}
1039 	}
1040 
1041 	return ich;
1042 }
1043 #endif
1044 
1045 /*------------------------------------------------------------------------
1046 	Function: fz_bidi_resolve_whitespace
1047 
1048 	Resolves levels for WS and S
1049 	Implements rule L1 of the Unicode bidi Algorithm.
1050 
1051 	Input:	Base embedding level
1052 			Character count
1053 			Array of direction classes (for one line of text)
1054 
1055 	In/Out: Array of embedding levels (for one line of text)
1056 
1057 	Note: this should be applied a line at a time. The default driver
1058 		  code supplied in this file assumes a single line of text; for
1059 		  a real implementation, cch and the initial pointer values
1060 		  would have to be adjusted.
1061 ------------------------------------------------------------------------*/
fz_bidi_resolve_whitespace(fz_bidi_level baselevel,const fz_bidi_chartype * pcls,fz_bidi_level * plevel,size_t cch)1062 void fz_bidi_resolve_whitespace(fz_bidi_level baselevel, const fz_bidi_chartype *pcls, fz_bidi_level *plevel,
1063 				size_t cch)
1064 {
1065 	size_t cchrun = 0;
1066 	fz_bidi_level oldlevel = baselevel;
1067 	size_t ich;
1068 
1069 	for (ich = 0; ich < cch; ich++)
1070 	{
1071 		switch(pcls[ich])
1072 		{
1073 		default:
1074 			cchrun = 0; // any other character breaks the run
1075 			break;
1076 		case BDI_WS:
1077 			cchrun++;
1078 			break;
1079 
1080 		case BDI_RLE:
1081 		case BDI_LRE:
1082 		case BDI_LRO:
1083 		case BDI_RLO:
1084 		case BDI_PDF:
1085 		case BDI_BN:
1086 			plevel[ich] = oldlevel;
1087 			cchrun++;
1088 			break;
1089 
1090 		case BDI_S:
1091 		case BDI_B:
1092 			// reset levels for WS before eot
1093 			set_deferred_level_run(plevel, cchrun, ich, baselevel);
1094 			cchrun = 0;
1095 			plevel[ich] = baselevel;
1096 			break;
1097 		}
1098 		oldlevel = plevel[ich];
1099 	}
1100 	// reset level before eot
1101 	set_deferred_level_run(plevel, cchrun, ich, baselevel);
1102 }
1103 
1104 #ifdef BIDI_LINE_AT_A_TIME
1105 /*------------------------------------------------------------------------
1106 	Functions: reorder/reorderLevel
1107 
1108 	Recursively reorders the display string
1109 	"From the highest level down, reverse all characters at that level and
1110 	higher, down to the lowest odd level"
1111 
1112 	Implements rule L2 of the Unicode bidi Algorithm.
1113 
1114 	Input: Array of embedding levels
1115 		   Character count
1116 		   Flag enabling reversal (set to false by initial caller)
1117 
1118 	In/Out: Text to reorder
1119 
1120 	Note: levels may exceed 15 resp. 61 on input.
1121 
1122 	Rule L3 - reorder combining marks is not implemented here
1123 	Rule L4 - glyph mirroring is implemented as a display option below
1124 
1125 	Note: this should be applied a line at a time
1126 -------------------------------------------------------------------------*/
reorderLevel(fz_bidi_level level,uint32_t * psz_text,const fz_bidi_level * plevel,int cch,int f_reverse)1127 static int reorderLevel(fz_bidi_level level, uint32_t *psz_text, const fz_bidi_level *plevel, int cch,
1128 				 int f_reverse)
1129 {
1130 	int ich;
1131 
1132 	// true as soon as first odd level encountered
1133 	f_reverse = f_reverse || odd(level);
1134 
1135 	for (ich = 0; ich < cch; ich++)
1136 	{
1137 		if (plevel[ich] < level)
1138 		{
1139 			break;
1140 		}
1141 		else if (plevel[ich] > level)
1142 		{
1143 			ich += reorderLevel(level + 1, psz_text + ich, plevel + ich,
1144 				cch - ich, f_reverse) - 1;
1145 		}
1146 	}
1147 	if (f_reverse)
1148 	{
1149 		reverse(psz_text, ich);
1150 	}
1151 	return ich;
1152 }
1153 
Bidi_reorder(fz_bidi_level baselevel,uint32_t * psz_text,const fz_bidi_level * plevel,int cch)1154 int Bidi_reorder(fz_bidi_level baselevel, uint32_t *psz_text, const fz_bidi_level *plevel, int cch)
1155 {
1156 	int ich = 0;
1157 
1158 	while (ich < cch)
1159 	{
1160 		ich += reorderLevel(baselevel, psz_text + ich, plevel + ich,
1161 			cch - ich, FALSE);
1162 	}
1163 	return ich;
1164 }
1165 #endif
1166