1 /*******************************************************************************
2  * Copyright (c) 2010, 2011 IBM Corporation and others.
3  *
4  * This program and the accompanying materials
5  * are made available under the terms of the Eclipse Public License 2.0
6  * which accompanies this distribution, and is available at
7  * https://www.eclipse.org/legal/epl-2.0/
8  *
9  * SPDX-License-Identifier: EPL-2.0
10  *
11  * Contributors:
12  *     IBM Corporation - initial API and implementation
13  ******************************************************************************/
14 package org.eclipse.equinox.bidi.internal;
15 
16 import org.eclipse.equinox.bidi.advanced.*;
17 import org.eclipse.equinox.bidi.custom.*;
18 
19 /**
20  * Implementation for IStructuredTextExpert.
21  */
22 public class StructuredTextImpl implements IStructuredTextExpert {
23 
24 	static final String EMPTY_STRING = ""; //$NON-NLS-1$
25 
26 	// In the following lines, B, L, R and AL represent bidi categories
27 	// as defined in the Unicode Bidirectional Algorithm
28 	// ( http://www.unicode.org/reports/tr9/ ).
29 	// B  represents the category Block Separator.
30 	// L  represents the category Left to Right character.
31 	// R  represents the category Right to Left character.
32 	// AL represents the category Arabic Letter.
33 	// AN represents the category Arabic Number.
34 	// EN  represents the category European Number.
35 	static final byte B = Character.DIRECTIONALITY_PARAGRAPH_SEPARATOR;
36 	static final byte L = Character.DIRECTIONALITY_LEFT_TO_RIGHT;
37 	static final byte R = Character.DIRECTIONALITY_RIGHT_TO_LEFT;
38 	static final byte AL = Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC;
39 	static final byte AN = Character.DIRECTIONALITY_ARABIC_NUMBER;
40 	static final byte EN = Character.DIRECTIONALITY_EUROPEAN_NUMBER;
41 
42 	static final char LRM = 0x200E;
43 	static final char RLM = 0x200F;
44 	static final char LRE = 0x202A;
45 	static final char RLE = 0x202B;
46 	static final char PDF = 0x202C;
47 	static final char[] MARKS = {LRM, RLM};
48 	static final char[] EMBEDS = {LRE, RLE};
49 	static final int PREFIX_LENGTH = 2;
50 	static final int SUFFIX_LENGTH = 2;
51 	static final int FIXES_LENGTH = PREFIX_LENGTH + SUFFIX_LENGTH;
52 	static final int[] EMPTY_INT_ARRAY = new int[0];
53 
54 	/**
55 	 * The structured text handler utilized by this expert.
56 	 */
57 	protected final StructuredTextTypeHandler handler;
58 	/**
59 	 * The environment associated with the expert.
60 	 */
61 	protected final StructuredTextEnvironment environment;
62 	/**
63 	 * Flag which is true if the expert is stateful.
64 	 */
65 	protected final boolean sharedExpert;
66 	/**
67 	 * Last state value set by {@link #setState} or {@link #clearState}.
68 	 */
69 	protected Object state;
70 
71 	/**
72 	 * Constructor used in {@link StructuredTextExpertFactory}.
73 	 *
74 	 * @param structuredTextHandler the structured text handler used by this expert.
75 	 * @param environment the environment associated with this expert.
76 	 * @param shared flag which is true if the expert is stateful.
77 	 */
StructuredTextImpl(StructuredTextTypeHandler structuredTextHandler, StructuredTextEnvironment environment, boolean shared)78 	public StructuredTextImpl(StructuredTextTypeHandler structuredTextHandler, StructuredTextEnvironment environment, boolean shared) {
79 		this.handler = structuredTextHandler;
80 		this.environment = environment;
81 		sharedExpert = shared;
82 	}
83 
84 	@Override
getTypeHandler()85 	public StructuredTextTypeHandler getTypeHandler() {
86 		return handler;
87 	}
88 
89 	@Override
getEnvironment()90 	public StructuredTextEnvironment getEnvironment() {
91 		return environment;
92 	}
93 
94 	@Override
getTextDirection(String text)95 	public int getTextDirection(String text) {
96 		return handler.getDirection(this, text);
97 	}
98 
99 	@Override
clearState()100 	public void clearState() {
101 		if (sharedExpert)
102 			state = null;
103 	}
104 
105 	@Override
setState(Object newState)106 	public void setState(Object newState) {
107 		if (sharedExpert)
108 			state = newState;
109 	}
110 
111 	@Override
getState()112 	public Object getState() {
113 		return state;
114 	}
115 
computeNextLocation(String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int[] locations, int curPos)116 	long computeNextLocation(String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int[] locations, int curPos) {
117 		String separators = handler.getSeparators(this);
118 		int separCount = separators.length();
119 		int specialsCount = handler.getSpecialsCount(this);
120 		int len = text.length();
121 		int nextLocation = len;
122 		int idxLocation = 0;
123 		// Start with special sequences to give them precedence over simple
124 		// separators. This may apply to cases like slash+asterisk versus slash.
125 		for (int i = 0; i < specialsCount; i++) {
126 			int location = locations[separCount + i];
127 			if (location < curPos) {
128 				location = handler.indexOfSpecial(this, text, charTypes, offsets, i + 1, curPos);
129 				if (location < 0)
130 					location = len;
131 				locations[separCount + i] = location;
132 			}
133 			if (location < nextLocation) {
134 				nextLocation = location;
135 				idxLocation = separCount + i;
136 			}
137 		}
138 		for (int i = 0; i < separCount; i++) {
139 			int location = locations[i];
140 			if (location < curPos) {
141 				location = text.indexOf(separators.charAt(i), curPos);
142 				if (location < 0)
143 					location = len;
144 				locations[i] = location;
145 			}
146 			if (location < nextLocation) {
147 				nextLocation = location;
148 				idxLocation = i;
149 			}
150 		}
151 		return nextLocation + (((long) idxLocation) << 32);
152 	}
153 
154 	/**
155 	 * @see StructuredTextTypeHandler#processSeparator StructuredTextTypeHandler.processSeparator
156 	 */
processSeparator(String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int separLocation)157 	static public void processSeparator(String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int separLocation) {
158 		int len = text.length();
159 		int direction = charTypes.getDirection();
160 		if (direction == DIR_RTL) {
161 			// the structured text base direction is RTL
162 			for (int i = separLocation - 1; i >= 0; i--) {
163 				byte charType = charTypes.getBidiTypeAt(i);
164 				if (charType == R || charType == AL)
165 					return;
166 				if (charType == L) {
167 					for (int j = separLocation; j < len; j++) {
168 						charType = charTypes.getBidiTypeAt(j);
169 						if (charType == R || charType == AL)
170 							return;
171 						if (charType == L || charType == EN) {
172 							offsets.insertOffset(charTypes, separLocation);
173 							return;
174 						}
175 					}
176 					return;
177 				}
178 			}
179 			return;
180 		}
181 
182 		// the structured text base direction is LTR
183 		boolean doneAN = false;
184 		for (int i = separLocation - 1; i >= 0; i--) {
185 			byte charType = charTypes.getBidiTypeAt(i);
186 			if (charType == L)
187 				return;
188 			if (charType == R || charType == AL) {
189 				for (int j = separLocation; j < len; j++) {
190 					charType = charTypes.getBidiTypeAt(j);
191 					if (charType == L)
192 						return;
193 					if (charType == R || charType == EN || charType == AL || charType == AN) {
194 						offsets.insertOffset(charTypes, separLocation);
195 						return;
196 					}
197 				}
198 				return;
199 			}
200 			if (charType == AN && !doneAN) {
201 				for (int j = separLocation; j < len; j++) {
202 					charType = charTypes.getBidiTypeAt(j);
203 					if (charType == L)
204 						return;
205 					if (charType == AL || charType == AN || charType == R) {
206 						offsets.insertOffset(charTypes, separLocation);
207 						return;
208 					}
209 				}
210 				doneAN = true;
211 			}
212 		}
213 	}
214 
215 	/**
216 	 * When the orientation is <code>ORIENT_LTR</code> and the
217 	 * structured text has a RTL base direction,
218 	 * {@link IStructuredTextExpert#leanToFullText leanToFullText}
219 	 * adds RLE+RLM at the head of the <i>full</i> text and RLM+PDF at its
220 	 * end.
221 	 * <p>
222 	 * When the orientation is <code>ORIENT_RTL</code> and the
223 	 * structured text has a LTR base direction,
224 	 * {@link IStructuredTextExpert#leanToFullText leanToFullText}
225 	 * adds LRE+LRM at the head of the <i>full</i> text and LRM+PDF at its
226 	 * end.
227 	 * <p>
228 	 * When the orientation is <code>ORIENT_CONTEXTUAL_LTR</code> or
229 	 * <code>ORIENT_CONTEXTUAL_RTL</code> and the data content would resolve
230 	 * to a RTL orientation while the structured text has a LTR base
231 	 * direction, {@link IStructuredTextExpert#leanToFullText leanToFullText}
232 	 * adds LRM at the head of the <i>full</i> text.
233 	 * <p>
234 	 * When the orientation is <code>ORIENT_CONTEXTUAL_LTR</code> or
235 	 * <code>ORIENT_CONTEXTUAL_RTL</code> and the data content would resolve
236 	 * to a LTR orientation while the structured text has a RTL base
237 	 * direction, {@link IStructuredTextExpert#leanToFullText leanToFullText}
238 	 * adds RLM at the head of the <i>full</i> text.
239 	 * <p>
240 	 * When the orientation is <code>ORIENT_UNKNOWN</code> and the
241 	 * structured text has a LTR base direction,
242 	 * {@link IStructuredTextExpert#leanToFullText leanToFullText}
243 	 * adds LRE+LRM at the head of the <i>full</i> text and LRM+PDF at its
244 	 * end.
245 	 * <p>
246 	 * When the orientation is <code>ORIENT_UNKNOWN</code> and the
247 	 * structured text has a RTL base direction,
248 	 * {@link IStructuredTextExpert#leanToFullText leanToFullText}
249 	 * adds RLE+RLM at the head of the <i>full</i> text and RLM+PDF at its
250 	 * end.
251 	 * <p>
252 	 * When the orientation is <code>ORIENT_IGNORE</code>,
253 	 * {@link IStructuredTextExpert#leanToFullText leanToFullText} does not add any directional
254 	 * formatting characters as either prefix or suffix of the <i>full</i> text.
255 	 * <p>
256 	 */
257 	@Override
leanToFullText(String text)258 	public String leanToFullText(String text) {
259 		int len = text.length();
260 		if (len == 0)
261 			return text;
262 		StructuredTextCharTypes charTypes = new StructuredTextCharTypes(this, text);
263 		StructuredTextOffsets offsets = leanToFullCommon(text, charTypes);
264 		int prefixLength = offsets.getPrefixLength();
265 		int direction = charTypes.getDirection();
266 		return insertMarks(text, offsets.getOffsets(), direction, prefixLength);
267 	}
268 
269 	@Override
leanToFullMap(String text)270 	public int[] leanToFullMap(String text) {
271 		int len = text.length();
272 		if (len == 0)
273 			return EMPTY_INT_ARRAY;
274 		StructuredTextCharTypes charTypes = new StructuredTextCharTypes(this, text);
275 		StructuredTextOffsets offsets = leanToFullCommon(text, charTypes);
276 		int prefixLength = offsets.getPrefixLength();
277 		int[] map = new int[len];
278 		int count = offsets.getCount(); // number of used entries
279 		int added = prefixLength;
280 		for (int pos = 0, i = 0; pos < len; pos++) {
281 			if (i < count && pos == offsets.getOffset(i)) {
282 				added++;
283 				i++;
284 			}
285 			map[pos] = pos + added;
286 		}
287 		return map;
288 	}
289 
290 	@Override
leanBidiCharOffsets(String text)291 	public int[] leanBidiCharOffsets(String text) {
292 		int len = text.length();
293 		if (len == 0)
294 			return EMPTY_INT_ARRAY;
295 		StructuredTextCharTypes charTypes = new StructuredTextCharTypes(this, text);
296 		StructuredTextOffsets offsets = leanToFullCommon(text, charTypes);
297 		return offsets.getOffsets();
298 	}
299 
leanToFullCommon(String text, StructuredTextCharTypes charTypes)300 	private StructuredTextOffsets leanToFullCommon(String text, StructuredTextCharTypes charTypes) {
301 		int len = text.length();
302 		int direction = handler.getDirection(this, text, charTypes);
303 		StructuredTextOffsets offsets = new StructuredTextOffsets();
304 		if (!handler.skipProcessing(this, text, charTypes)) {
305 			// initialize locations
306 			int separCount = handler.getSeparators(this).length();
307 			int[] locations = new int[separCount + handler.getSpecialsCount(this)];
308 			for (int i = 0, k = locations.length; i < k; i++) {
309 				locations[i] = -1;
310 			}
311 			// current position
312 			int curPos = 0;
313 			if (state != null) {
314 				curPos = handler.processSpecial(this, text, charTypes, offsets, 0, -1);
315 			}
316 			while (true) {
317 				// location of next token to handle
318 				int nextLocation;
319 				// index of next token to handle (if < separCount, this is a separator; otherwise a special case
320 				int idxLocation;
321 				long res = computeNextLocation(text, charTypes, offsets, locations, curPos);
322 				nextLocation = (int) (res & 0x00000000FFFFFFFF); /* low word */
323 				if (nextLocation >= len)
324 					break;
325 				idxLocation = (int) (res >> 32); /* high word */
326 				if (idxLocation < separCount) {
327 					processSeparator(text, charTypes, offsets, nextLocation);
328 					curPos = nextLocation + 1;
329 				} else {
330 					idxLocation -= (separCount - 1); // because caseNumber starts from 1
331 					curPos = handler.processSpecial(this, text, charTypes, offsets, idxLocation, nextLocation);
332 				}
333 				if (curPos >= len)
334 					break;
335 			} // end while
336 		} // end if (!handler.skipProcessing())
337 		int prefixLength;
338 		int orientation = environment.getOrientation();
339 		if (orientation == StructuredTextEnvironment.ORIENT_IGNORE)
340 			prefixLength = 0;
341 		else {
342 			int resolvedOrientation = charTypes.resolveOrientation();
343 			if (orientation != StructuredTextEnvironment.ORIENT_UNKNOWN && resolvedOrientation == direction)
344 				prefixLength = 0;
345 			else if ((orientation & StructuredTextEnvironment.ORIENT_CONTEXTUAL) != 0)
346 				prefixLength = 1;
347 			else
348 				prefixLength = 2;
349 		}
350 		offsets.setPrefixLength(prefixLength);
351 		return offsets;
352 	}
353 
354 	@Override
fullToLeanText(String full)355 	public String fullToLeanText(String full) {
356 		if (full.length() == 0)
357 			return full;
358 		int dir = handler.getDirection(this, full);
359 		char curMark = MARKS[dir];
360 		char curEmbed = EMBEDS[dir];
361 		int i; // used as loop index
362 		// remove any prefix and leading mark
363 		int lenFull = full.length();
364 		for (i = 0; i < lenFull; i++) {
365 			char c = full.charAt(i);
366 			if (c != curEmbed && c != curMark)
367 				break;
368 		}
369 		if (i > 0) { // found at least one prefix or leading mark
370 			full = full.substring(i);
371 			lenFull = full.length();
372 		}
373 		// remove any suffix and trailing mark
374 		for (i = lenFull - 1; i >= 0; i--) {
375 			char c = full.charAt(i);
376 			if (c != PDF && c != curMark)
377 				break;
378 		}
379 		if (i < 0) // only suffix and trailing marks, no real data
380 			return EMPTY_STRING;
381 		if (i < (lenFull - 1)) { // found at least one suffix or trailing mark
382 			full = full.substring(0, i + 1);
383 			lenFull = full.length();
384 		}
385 		char[] chars = full.toCharArray();
386 		// remove marks from chars
387 		int cnt = 0;
388 		for (i = 0; i < lenFull; i++) {
389 			char c = chars[i];
390 			if (c == curMark)
391 				cnt++;
392 			else if (cnt > 0)
393 				chars[i - cnt] = c;
394 		}
395 		String lean = new String(chars, 0, lenFull - cnt);
396 		String full2 = leanToFullText(lean);
397 		// strip prefix and suffix
398 		int beginIndex = 0, endIndex = full2.length();
399 		if (full2.charAt(0) == curMark)
400 			beginIndex = 1;
401 		else {
402 			if (full2.charAt(0) == curEmbed) {
403 				beginIndex = 1;
404 				if (full2.charAt(0) == curMark)
405 					beginIndex = 2;
406 			}
407 			if (full2.charAt(endIndex - 1) == PDF) {
408 				endIndex--;
409 				if (full2.charAt(endIndex - 1) == curMark)
410 					endIndex--;
411 			}
412 		}
413 		if (beginIndex > 0 || endIndex < full2.length())
414 			full2 = full2.substring(beginIndex, endIndex);
415 		if (full2.equals(full))
416 			return lean;
417 
418 		// There are some marks in full which are not in full2 and/or vice versa.
419 		// We need to add to lean any mark appearing in full and not in full2.
420 		// The completed lean can never be longer than full itself.
421 		char[] newChars = new char[lenFull];
422 		char cFull, cFull2;
423 		int idxFull, idxFull2, idxLean, newCharsPos;
424 		int lenFull2 = full2.length();
425 		idxFull = idxFull2 = idxLean = newCharsPos = 0;
426 		while (idxFull < lenFull && idxFull2 < lenFull2) {
427 			cFull2 = full2.charAt(idxFull2);
428 			cFull = full.charAt(idxFull);
429 			if (cFull2 == cFull) { /* chars are equal, proceed */
430 				if (cFull2 != curMark)
431 					newChars[newCharsPos++] = chars[idxLean++];
432 				idxFull++;
433 				idxFull2++;
434 				continue;
435 			}
436 			if (cFull2 == curMark) { /* extra Mark in full2 text */
437 				idxFull2++;
438 				continue;
439 			}
440 			if (cFull == curMark) { /* extra Mark in source full text */
441 				idxFull++;
442 				// idxFull-2 always >= 0 since leading Marks were removed from full
443 				if (full.charAt(idxFull - 2) == curMark)
444 					continue; // ignore successive Marks in full after the first one
445 				newChars[newCharsPos++] = curMark;
446 				continue;
447 			}
448 			// we should never get here (extra char which is not a Mark)
449 			throw new IllegalStateException("Internal error: extra character not a Mark."); //$NON-NLS-1$
450 		}
451 		if (idxFull < lenFull) /* full2 ended before full - this should never happen since
452 								              we removed all marks and PDFs at the end of full */
453 			throw new IllegalStateException("Internal error: unexpected EOL."); //$NON-NLS-1$
454 
455 		lean = new String(newChars, 0, newCharsPos);
456 		return lean;
457 	}
458 
459 	@Override
fullToLeanMap(String full)460 	public int[] fullToLeanMap(String full) {
461 		int lenFull = full.length();
462 		if (lenFull == 0)
463 			return EMPTY_INT_ARRAY;
464 		String lean = fullToLeanText(full);
465 		int lenLean = lean.length();
466 		int dir = handler.getDirection(this, lean);
467 		char curMark = MARKS[dir];
468 		char curEmbed = EMBEDS[dir];
469 		int[] map = new int[lenFull];
470 		int idxFull, idxLean;
471 		// skip any prefix and leading mark
472 		for (idxFull = 0; idxFull < lenFull; idxFull++) {
473 			char c = full.charAt(idxFull);
474 			if (c != curEmbed && c != curMark)
475 				break;
476 			map[idxFull] = -1;
477 		}
478 		// lean must be a subset of Full, so we only check on iLean < leanLen
479 		for (idxLean = 0; idxLean < lenLean; idxFull++) {
480 			if (full.charAt(idxFull) == lean.charAt(idxLean)) {
481 				map[idxFull] = idxLean;
482 				idxLean++;
483 			} else
484 				map[idxFull] = -1;
485 		}
486 		for (; idxFull < lenFull; idxFull++)
487 			map[idxFull] = -1;
488 		return map;
489 	}
490 
491 	@Override
fullBidiCharOffsets(String full)492 	public int[] fullBidiCharOffsets(String full) {
493 		int lenFull = full.length();
494 		if (lenFull == 0)
495 			return EMPTY_INT_ARRAY;
496 		String lean = fullToLeanText(full);
497 		StructuredTextOffsets offsets = new StructuredTextOffsets();
498 		int lenLean = lean.length();
499 		int idxLean, idxFull;
500 		// lean must be a subset of Full, so we only check on iLean < leanLen
501 		for (idxLean = idxFull = 0; idxLean < lenLean; idxFull++) {
502 			if (full.charAt(idxFull) == lean.charAt(idxLean))
503 				idxLean++;
504 			else
505 				offsets.insertOffset(null, idxFull);
506 		}
507 		for (; idxFull < lenFull; idxFull++)
508 			offsets.insertOffset(null, idxFull);
509 		return offsets.getOffsets();
510 	}
511 
512 	@Override
insertMarks(String text, int[] offsets, int direction, int affixLength)513 	public String insertMarks(String text, int[] offsets, int direction, int affixLength) {
514 		if (direction != DIR_LTR && direction != DIR_RTL)
515 			throw new IllegalArgumentException("Invalid direction"); //$NON-NLS-1$
516 		if (affixLength < 0 || affixLength > 2)
517 			throw new IllegalArgumentException("Invalid affix length"); //$NON-NLS-1$
518 		int count = offsets == null ? 0 : offsets.length;
519 		if (count == 0 && affixLength == 0)
520 			return text;
521 		int textLength = text.length();
522 		if (textLength == 0)
523 			return text;
524 		int newLen = textLength + count;
525 		if (affixLength == 1)
526 			newLen++; /* +1 for a mark char */
527 		else if (affixLength == 2)
528 			newLen += FIXES_LENGTH;
529 		char[] fullChars = new char[newLen];
530 		int added = affixLength;
531 		// add marks at offsets
532 		char curMark = MARKS[direction];
533 		for (int i = 0, j = 0; i < textLength; i++) {
534 			char c = text.charAt(i);
535 			if (j < count && i == offsets[j]) {
536 				fullChars[i + added] = curMark;
537 				added++;
538 				j++;
539 			}
540 			fullChars[i + added] = c;
541 		}
542 		if (affixLength > 0) { /* add prefix/suffix ? */
543 			if (affixLength == 1) { /* contextual orientation */
544 				fullChars[0] = curMark;
545 			} else {
546 				// When the orientation is RTL, we need to add EMBED at the
547 				// start of the text and PDF at its end.
548 				// However, because of a bug in Windows' handling of LRE/RLE/PDF,
549 				// we add LRM or RLM (according to the direction) after the
550 				// LRE/RLE and again before the PDF.
551 				char curEmbed = EMBEDS[direction];
552 				fullChars[0] = curEmbed;
553 				fullChars[1] = curMark;
554 				fullChars[newLen - 1] = PDF;
555 				fullChars[newLen - 2] = curMark;
556 			}
557 		}
558 		return new String(fullChars);
559 	}
560 
561 	@Override
toString()562 	public String toString() {
563 		return super.toString() + " [handler=" + handler.toString() + "]"; //$NON-NLS-1$ //$NON-NLS-2$
564 	}
565 }
566