1 /*******************************************************************************
2  * Copyright (c) 2010, 2011 IBM Corporation and others.
3  *
4  * This program and the accompanying materials
5  * are made available under the terms of the Eclipse Public License 2.0
6  * which accompanies this distribution, and is available at
7  * https://www.eclipse.org/legal/epl-2.0/
8  *
9  * SPDX-License-Identifier: EPL-2.0
10  *
11  * Contributors:
12  *     IBM Corporation - initial API and implementation
13  ******************************************************************************/
14 package org.eclipse.equinox.bidi.internal.consumable;
15 
16 import org.eclipse.equinox.bidi.advanced.IStructuredTextExpert;
17 import org.eclipse.equinox.bidi.advanced.StructuredTextEnvironment;
18 import org.eclipse.equinox.bidi.custom.*;
19 
20 /**
21  *  Handler for regular expressions.
22  *  Such expressions may span multiple lines.
23  *  <p>
24  *  In applications like an editor where parts of the text might be modified
25  *  while other parts are not, the user may want to call
26  *  {@link IStructuredTextExpert#leanToFullText}
27  *  separately on each line and save the initial state of each line (this is
28  *  the final state of the previous line which can be retrieved using
 *  {@link IStructuredTextExpert#getState()}).
30  *  If both the content
31  *  of a line and its initial state have not changed, the user can be sure that
32  *  the last <i>full</i> text computed for this line has not changed either.
33  *
34  *  @see IStructuredTextExpert explanation of state
35  */
36 public class StructuredTextRegex extends StructuredTextTypeHandler {
37 	static final String[] startStrings = {"", /*  0 *//* dummy *///$NON-NLS-1$
38 			"(?#", /*  1 *//* comment (?#...) *///$NON-NLS-1$
39 			"(?<", /*  2 *//* named group (?<name> *///$NON-NLS-1$
40 			"(?'", /*  3 *//* named group (?'name' *///$NON-NLS-1$
41 			"(?(<", /*  4 *//* conditional named back reference (?(<name>) *///$NON-NLS-1$
42 			"(?('", /*  5 *//* conditional named back reference (?('name') *///$NON-NLS-1$
43 			"(?(", /*  6 *//* conditional named back reference (?(name) *///$NON-NLS-1$
44 			"(?&", /*  7 *//* named parentheses reference (?&name) *///$NON-NLS-1$
45 			"(?P<", /*  8 *//* named group (?P<name> *///$NON-NLS-1$
46 			"\\k<", /*  9 *//* named back reference \k<name> *///$NON-NLS-1$
47 			"\\k'", /* 10 *//* named back reference \k'name' *///$NON-NLS-1$
48 			"\\k{", /* 11 *//* named back reference \k{name} *///$NON-NLS-1$
49 			"(?P=", /* 12 *//* named back reference (?P=name) *///$NON-NLS-1$
50 			"\\g{", /* 13 *//* named back reference \g{name} *///$NON-NLS-1$
51 			"\\g<", /* 14 *//* subroutine call \g<name> *///$NON-NLS-1$
52 			"\\g'", /* 15 *//* subroutine call \g'name' *///$NON-NLS-1$
53 			"(?(R&", /* 16 *//* named back reference recursion (?(R&name) *///$NON-NLS-1$
54 			"\\Q" /* 17 *//* quoted sequence \Q...\E *///$NON-NLS-1$
55 	};
56 	static final char[] endChars = {
57 			// 0    1    2    3     4    5    6    7    8    9    10   11   12   13   14    15   16
58 			'.', ')', '>', '\'', ')', ')', ')', ')', '>', '>', '\'', '}', ')', '}', '>', '\'', ')'};
59 	static final int numberOfStrings = startStrings.length; /* 18 */
60 	static final int maxSpecial = numberOfStrings;
61 	static final byte L = Character.DIRECTIONALITY_LEFT_TO_RIGHT;
62 	static final byte R = Character.DIRECTIONALITY_RIGHT_TO_LEFT;
63 	static final byte AL = Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC;
64 	static final byte AN = Character.DIRECTIONALITY_ARABIC_NUMBER;
65 	static final byte EN = Character.DIRECTIONALITY_EUROPEAN_NUMBER;
66 	private static final Integer STATE_COMMENT = Integer.valueOf(1);
67 	private static final Integer STATE_QUOTED_SEQUENCE = Integer.valueOf(17);
68 
69 	/**
70 	 *  Retrieves the number of special cases handled by this handler.
71 	 *
72 	 *  @return the number of special cases for this handler.
73 	 */
74 	@Override
getSpecialsCount(IStructuredTextExpert expert)75 	public int getSpecialsCount(IStructuredTextExpert expert) {
76 		return maxSpecial;
77 	}
78 
79 	/**
80 	 *  Locates occurrences of the syntactic strings and of
81 	 *  R, AL, EN, AN characters.
82 	 */
83 	@Override
indexOfSpecial(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int caseNumber, int fromIndex)84 	public int indexOfSpecial(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int caseNumber, int fromIndex) {
85 		// In this method, L, R, AL, AN and EN represent bidi categories
86 		// as defined in the Unicode Bidirectional Algorithm
87 		// ( http://www.unicode.org/reports/tr9/ ).
88 		// L  represents the category Left to Right character.
89 		// R  represents the category Right to Left character.
90 		// AL represents the category Arabic Letter.
91 		// AN represents the category Arabic Number.
92 		// EN  represents the category European Number.
93 		byte charType;
94 
95 		if (caseNumber < numberOfStrings) {
96 			/*  1 *//* comment (?#...) */
97 			/*  2 *//* named group (?<name> */
98 			/*  3 *//* named group (?'name' */
99 			/*  4 *//* conditional named back reference (?(name) */
100 			/*  5 *//* conditional named back reference (?(<name>) */
101 			/*  6 *//* conditional named back reference (?('name') */
102 			/*  7 *//* named parentheses reference (?&name) */
103 			/*  8 *//* named group (?P<name> */
104 			/*  9 *//* named back reference \k<name> */
105 			/* 10 *//* named back reference \k'name' */
106 			/* 11 *//* named back reference \k{name} */
107 			/* 12 *//* named back reference (?P=name) */
108 			/* 13 *//* named back reference \g{name} */
109 			/* 14 *//* subroutine call \g<name> */
110 			/* 15 *//* subroutine call \g'name' */
111 			/* 16 *//* named back reference recursion (?(R&name) */
112 			/* 17 *//* quoted sequence \Q...\E */
113 			return text.indexOf(startStrings[caseNumber], fromIndex);
114 		}
115 		// there never is a need for a mark before the first char
116 		if (fromIndex <= 0)
117 			fromIndex = 1;
118 		// look for R, AL, AN, EN which are potentially needing a mark
119 		for (; fromIndex < text.length(); fromIndex++) {
120 			charType = charTypes.getBidiTypeAt(fromIndex);
121 			// R and AL will always be examined using processSeparator()
122 			if (charType == R || charType == AL)
123 				return fromIndex;
124 
125 			if (charType == EN || charType == AN) {
126 				// no need for a mark after the first digit in a number
127 				if (charTypes.getBidiTypeAt(fromIndex - 1) == charType)
128 					continue;
129 
130 				for (int i = fromIndex - 1; i >= 0; i--) {
131 					charType = charTypes.getBidiTypeAt(i);
132 					// after a L char, no need for a mark
133 					if (charType == L)
134 						continue;
135 
136 					// digit after R or AL or AN need a mark, except for EN
137 					//   following AN, but this is a contrived case, so we
138 					//   don't check for it (and calling processSeparator()
139 					//   for it will do no harm)
140 					if (charType == R || charType == AL || charType == AN)
141 						return fromIndex;
142 				}
143 				continue;
144 			}
145 		}
146 		return -1;
147 	}
148 
149 	/**
150 	 *  Processes the special cases.
151 	 */
152 	@Override
processSpecial(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int caseNumber, int separLocation)153 	public int processSpecial(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes, StructuredTextOffsets offsets, int caseNumber, int separLocation) {
154 		int location;
155 
156 		if (separLocation < 0) {
157 			caseNumber = ((Integer) expert.getState()).intValue(); // TBD guard against "undefined"
158 			expert.clearState();
159 		}
160 		switch (caseNumber) {
161 			case 1 : /* comment (?#...) */
162 				if (separLocation < 0) {
163 					// initial state from previous line
164 					location = 0;
165 				} else {
166 					StructuredTextTypeHandler.processSeparator(text, charTypes, offsets, separLocation);
167 					// skip the opening "(?#"
168 					location = separLocation + 3;
169 				}
170 				location = text.indexOf(')', location);
171 				if (location < 0) {
172 					expert.setState(STATE_COMMENT);
173 					return text.length();
174 				}
175 				return location + 1;
176 			case 2 : /* named group (?<name> */
177 			case 3 : /* named group (?'name' */
178 			case 4 : /* conditional named back reference (?(name) */
179 			case 5 : /* conditional named back reference (?(<name>) */
180 			case 6 : /* conditional named back reference (?('name') */
181 			case 7 : /* named parentheses reference (?&name) */
182 				StructuredTextTypeHandler.processSeparator(text, charTypes, offsets, separLocation);
183 				// no need for calling processSeparator() for the following cases
184 				//   since the starting string contains a L char
185 			case 8 : /* named group (?P<name> */
186 			case 9 : /* named back reference \k<name> */
187 			case 10 : /* named back reference \k'name' */
188 			case 11 : /* named back reference \k{name} */
189 			case 12 : /* named back reference (?P=name) */
190 			case 13 : /* named back reference \g{name} */
191 			case 14 : /* subroutine call \g<name> */
192 			case 15 : /* subroutine call \g'name' */
193 			case 16 : /* named back reference recursion (?(R&name) */
194 				// skip the opening string
195 				location = separLocation + startStrings[caseNumber].length();
196 				// look for ending character
197 				location = text.indexOf(endChars[caseNumber], location);
198 				if (location < 0)
199 					return text.length();
200 				return location + 1;
201 			case 17 : /* quoted sequence \Q...\E */
202 				if (separLocation < 0) {
203 					// initial state from previous line
204 					location = 0;
205 				} else {
206 					StructuredTextTypeHandler.processSeparator(text, charTypes, offsets, separLocation);
207 					// skip the opening "\Q"
208 					location = separLocation + 2;
209 				}
210 				location = text.indexOf("\\E", location); //$NON-NLS-1$
211 				if (location < 0) {
212 					expert.setState(STATE_QUOTED_SEQUENCE);
213 					return text.length();
214 				}
215 				// set the charType for the "E" to L (Left to Right character)
216 				charTypes.setBidiTypeAt(location + 1, L);
217 				return location + 2;
218 			case 18 : /* R, AL, AN, EN */
219 				StructuredTextTypeHandler.processSeparator(text, charTypes, offsets, separLocation);
220 				return separLocation + 1;
221 
222 		}
223 		// we should never get here
224 		return text.length();
225 	}
226 
227 	@Override
getDirection(IStructuredTextExpert expert, String text)228 	public int getDirection(IStructuredTextExpert expert, String text) {
229 		return getDirection(expert, text, new StructuredTextCharTypes(expert, text));
230 	}
231 
232 	/**
233 	 *  @return {@link IStructuredTextExpert#DIR_RTL DIR_RTL} if the following
234 	 *          conditions are satisfied:
235 	 *          <ul>
236 	 *            <li>The current locale (as expressed by the environment
237 	 *                language) is Arabic.</li>
238 	 *            <li>The first strong character has an RTL direction.</li>
239 	 *            <li>If there is no strong character in the text, the
240 	 *                GUI is mirrored.
241 	 *          </ul>
242 	 *          Otherwise, returns {@link IStructuredTextExpert#DIR_LTR DIR_LTR}.
243 	 */
244 	@Override
getDirection(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes)245 	public int getDirection(IStructuredTextExpert expert, String text, StructuredTextCharTypes charTypes) {
246 		StructuredTextEnvironment environment = expert.getEnvironment();
247 		String language = environment.getLanguage();
248 		if (!language.equals("ar")) //$NON-NLS-1$
249 			return IStructuredTextExpert.DIR_LTR;
250 		for (int i = 0; i < text.length(); i++) {
251 			byte charType = charTypes.getBidiTypeAt(i);
252 			if (charType == AL || charType == R)
253 				return IStructuredTextExpert.DIR_RTL;
254 			if (charType == L)
255 				return IStructuredTextExpert.DIR_LTR;
256 		}
257 		if (environment.getMirrored())
258 			return IStructuredTextExpert.DIR_RTL;
259 		return IStructuredTextExpert.DIR_LTR;
260 	}
261 
262 }
263