internal/misc/StringMatcher.java

/*******************************************************************************
 * Copyright (c) 2000, 2020 IBM Corporation and others.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Lucas Bullen (Red Hat Inc.) - [Bug 203792] filter should support multiple keywords
 *     Mickael Istria (Red Hat Inc.) - [534277] erroneous filtering with multiple words
 *******************************************************************************/
package org.eclipse.ui.internal.misc;

import java.util.ArrayList;
import java.util.regex.Pattern;

/**
 * A string pattern matcher, supporting "*" and "?" wildcards.
 * <p>
 * The trimmed pattern is matched as a whole, either against the complete text
 * region or against any single word of it. When the pattern consists of several
 * whitespace-separated words, the text also matches if every pattern word
 * matches the region or one of its words.
 * </p>
 */
public class StringMatcher {
	protected String fPattern;

	protected int fLength; // pattern length

	protected boolean fIgnoreWildCards;

	protected boolean fIgnoreCase;

	/**
	 * The trimmed pattern split on whitespace; stays <code>null</code> when the
	 * trimmed pattern is empty.
	 */
	protected String[] patternWords;

	/** The complete pattern; <code>null</code> when the trimmed pattern is empty. */
	protected Word wholePatternWord;

	/** One entry per pattern word; only set when the pattern has several words. */
	protected Word[] splittedPatternWords;

	/** Internal stand-in (the NUL character) for the "?" wildcard inside parsed segments. */
	protected static final char fSingleWildCard = '\u0000';
	private static final Pattern NON_WORD = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS); //$NON-NLS-1$

	/**
	 * A single pattern (the whole pattern or one of its words) together with its
	 * parsed wildcard structure. Instances read the ignore-case and
	 * ignore-wildcards settings of the enclosing matcher.
	 */
	class Word {
		private boolean hasTrailingStar = false;
		private boolean hasLeadingStar = false;
		/** Total number of characters in all fragments, i.e. the minimum length of a matching text. */
		private int bound = 0;
		/** The literal pieces between "*" wildcards; "?" is stored as {@link #fSingleWildCard}. */
		private String[] fragments = null;
		private final String pattern;

		Word(String pattern) {
			this.pattern = pattern;
		}

		public Word(String pattern, int bound, String[] fragments) {
			this(pattern);
			this.bound = bound;
			this.fragments = fragments;
		}

		/**
		 * Splits the pattern into the fragments separated by unescaped "*" wildcards
		 * and records whether the pattern starts or ends with such a wildcard.
		 */
		private void parseWildcards() {
			// the first character can never be escaped, so a plain prefix test is enough
			this.hasLeadingStar = this.pattern.startsWith("*"); //$NON-NLS-1$

			ArrayList<String> segments = new ArrayList<>();
			StringBuilder buf = new StringBuilder();
			int literalChars = 0;
			// Tracks whether the most recently consumed token was an unescaped '*'.
			// Deriving the trailing star from the scan itself keeps it consistent
			// with the escape handling below (e.g. an escaped backslash followed
			// by a real wildcard).
			boolean lastTokenWasStar = false;
			int length = this.pattern.length();
			int pos = 0;
			while (pos < length) {
				char c = this.pattern.charAt(pos++);
				lastTokenWasStar = false;
				switch (c) {
				case '\\':
					if (pos >= length) {
						/* lone backslash at the end is taken literally */
						buf.append(c);
					} else {
						char next = this.pattern.charAt(pos++);
						if (next == '*' || next == '?' || next == '\\') {
							/* escape sequence: keep only the escaped character */
							buf.append(next);
						} else {
							/* not an escape sequence, just insert literally */
							buf.append(c);
							buf.append(next);
						}
					}
					break;
				case '*':
					lastTokenWasStar = true;
					if (buf.length() > 0) {
						/* new segment */
						segments.add(buf.toString());
						literalChars += buf.length();
						buf.setLength(0);
					}
					break;
				case '?':
					/* append special character representing single match wildcard */
					buf.append(fSingleWildCard);
					break;
				default:
					buf.append(c);
				}
			}

			/* add last buffer to segment list */
			if (buf.length() > 0) {
				segments.add(buf.toString());
				literalChars += buf.length();
			}
			this.hasTrailingStar = lastTokenWasStar;
			this.bound = literalChars;
			this.fragments = segments.toArray(new String[segments.size()]);
		}

		/**
		 * Tests whether this word matches the complete region
		 * <code>[start, end)</code> of <code>text</code>.
		 *
		 * @param text  the text to examine
		 * @param start start of the region, inclusive
		 * @param end   end of the region, exclusive
		 * @return <code>true</code> if the whole region matches this word
		 */
		boolean match(String text, int start, int end) {
			if (fIgnoreWildCards) {
				// literal mode: the region must equal the pattern character by character
				return (end - start == this.pattern.length())
						&& this.pattern.regionMatches(fIgnoreCase, 0, text, start, this.pattern.length());
			}
			String[] segments = this.fragments;
			int segCount = segments.length;
			if (segCount == 0 && (this.hasLeadingStar || this.hasTrailingStar)) {
				// pattern consists of stars only
				return true;
			}
			if (start == end) {
				return this.pattern.isEmpty();
			}
			if (this.pattern.isEmpty()) {
				return false;
			}
			// The region must be able to hold every literal character. Measuring the
			// region (not just 'end') keeps the segment comparisons below from
			// reading past 'end' when 'start' is greater than zero.
			if (end - start < this.bound) {
				return false;
			}

			int tCurPos = start;
			int i = 0;
			String current = segments[0];

			/* process first segment: anchored at the start unless a star precedes it */
			if (!hasLeadingStar) {
				int segLength = current.length();
				if (!regExpRegionMatches(text, start, current, 0, segLength)) {
					return false;
				}
				++i;
				tCurPos += segLength;
			}
			if (segCount == 1 && !hasLeadingStar && !hasTrailingStar) {
				// only one segment to match, no wildcards specified
				return tCurPos == end;
			}

			/* process middle segments: each one is searched after the previous match */
			while (i < segCount) {
				current = segments[i];
				int currentMatch = current.indexOf(fSingleWildCard) < 0
						? textPosIn(text, tCurPos, end, current)
						: regExpPosIn(text, tCurPos, end, current);
				if (currentMatch < 0) {
					return false;
				}
				tCurPos = currentMatch + current.length();
				i++;
			}

			/* process final segment: without a trailing star it must sit at the very end */
			if (!hasTrailingStar && tCurPos != end) {
				int clen = current.length();
				return regExpRegionMatches(text, end - clen, current, 0, clen);
			}
			return true;
		}

		/**
		 * Splits the region <code>[start, end)</code> of <code>text</code> into words
		 * (see {@link StringMatcher#getWords(String)}) and matches each of them
		 * against this pattern word.
		 *
		 * @param text  the text to examine
		 * @param start start of the region, inclusive
		 * @param end   end of the region, exclusive
		 * @return whether the current pattern word matches at least one word in the
		 *         given text; if the region yields no words at all, whether the
		 *         pattern is empty
		 */
		public boolean matchTextWord(String text, int start, int end) {
			String[] textWords = getWords(text.substring(start, end));
			if (textWords.length == 0) {
				return pattern.isEmpty();
			}
			for (String textWord : textWords) {
				if (match(textWord, 0, textWord.length())) {
					return true;
				}
			}
			return false;
		}

	}

	/**
	 * A pair of text offsets.
	 */
	public static class Position {
		int start; // inclusive

		int end; // exclusive

		public Position(int start, int end) {
			this.start = start;
			this.end = end;
		}

		public int getStart() {
			return start;
		}

		public int getEnd() {
			return end;
		}
	}

	/**
	 * StringMatcher constructor takes in a String object that is a simple pattern
	 * which may contain '*' for 0 and many characters and '?' for exactly one
	 * character.
	 *
	 * Literal '*' and '?' characters must be escaped in the pattern e.g., "\*"
	 * means literal "*", etc. "\\" means a literal "\".
	 *
	 * A backslash in front of any other character is not an escape sequence: both
	 * the backslash and the character are kept, e.g., "\a" matches the two
	 * characters "\a". A single backslash at the end of the pattern is taken
	 * literally as well.
	 *
	 * If invoking the StringMatcher with string literals in Java, don't forget
	 * escape characters are represented by "\\".
	 *
	 * @param pattern         the pattern to match text against
	 * @param ignoreCase      if true, case is ignored
	 * @param ignoreWildCards if true, wild cards and their escape sequences are
	 *                        ignored (everything is taken literally).
	 * @throws IllegalArgumentException if <code>pattern</code> is
	 *                                  <code>null</code>
	 */
	public StringMatcher(String pattern, boolean ignoreCase, boolean ignoreWildCards) {
		if (pattern == null) {
			throw new IllegalArgumentException();
		}
		fIgnoreCase = ignoreCase;
		fIgnoreWildCards = ignoreWildCards;
		fPattern = pattern;
		fLength = pattern.length();

		parsePatternIntoWords();

		if (fIgnoreWildCards) {
			parseNoWildCards();
		} else {
			if (wholePatternWord != null) {
				wholePatternWord.parseWildcards();
			}
			if (splittedPatternWords != null && splittedPatternWords.length > 1) {
				for (Word word : splittedPatternWords) {
					word.parseWildcards();
				}
			}
		}
	}

	/**
	 * match the given <code>text</code> with the pattern
	 *
	 * @return true if matched otherwise false; always false for a
	 *         <code>null</code> text
	 * @param text a String object
	 */
	public boolean match(String text) {
		if (text == null) {
			return false;
		}
		return match(text, 0, text.length());
	}

	/**
	 * Given the starting (inclusive) and the ending (exclusive) positions in the
	 * <code>text</code>, determine if the given substring matches with aPattern
	 * <p>
	 * Positions outside the text are clamped to <code>[0, text.length()]</code>, so
	 * a range lying completely outside the text is treated as an empty region.
	 * </p>
	 *
	 * @return true if the specified portion of the text matches the pattern
	 * @param text  a String object that contains the substring to match
	 * @param start marks the starting position (inclusive) of the substring
	 * @param end   marks the ending index (exclusive) of the substring
	 * @throws IllegalArgumentException if <code>text</code> is <code>null</code>
	 */
	public boolean match(String text, int start, int end) {
		if (null == text) {
			throw new IllegalArgumentException();
		}
		if (start > end) {
			return false;
		}
		// Clamp BOTH ends into the text. Clamping only one side each (start from
		// below, end from above) could leave start > end and make the code below
		// index outside the text.
		int tlen = text.length();
		start = Math.min(Math.max(0, start), tlen);
		end = Math.max(0, Math.min(end, tlen));

		if (wholePatternWord != null
				&& (wholePatternWord.match(text, start, end) || wholePatternWord.matchTextWord(text, start, end))) {
			return true;
		}
		if (splittedPatternWords != null && splittedPatternWords.length > 0) {
			// every pattern word has to match the region or one of its words
			for (Word word : splittedPatternWords) {
				if (!word.match(text, start, end) && !word.matchTextWord(text, start, end)) {
					return false;
				}
			}
			return true;
		}
		return false;
	}

	/**
	 * This method parses the trimmed pattern into words separated by whitespace
	 * characters. The trimmed pattern as a whole always becomes
	 * {@link #wholePatternWord}; if it contains more than one word, each word
	 * additionally becomes an entry of {@link #splittedPatternWords}, with a '*'
	 * appended unless the word already ends with one. Nothing is set when the
	 * trimmed pattern is empty.
	 */
	private void parsePatternIntoWords() {
		String trimmedPattern = fPattern.trim();
		if (trimmedPattern.isEmpty()) {
			return;
		}
		this.wholePatternWord = new Word(trimmedPattern);
		patternWords = trimmedPattern.split("\\s+"); //$NON-NLS-1$
		if (patternWords.length <= 1) {
			return;
		}
		// NOTE(review): in ignore-wildcards mode the '*' appended here is compared
		// literally by Word.match - confirm whether multi-word matching is meant
		// to work in that mode.
		this.splittedPatternWords = new Word[patternWords.length];
		for (int i = 0; i < patternWords.length; i++) {
			String patternWord = patternWords[i];
			if (!patternWord.endsWith("*")) { //$NON-NLS-1$
				// words may be found anywhere in the line
				patternWord += '*';
			}
			this.splittedPatternWords[i] = new Word(patternWord);
		}
	}

	/**
	 * Sets up the whole-pattern word for the case where wildcards are ignored: the
	 * untrimmed pattern is used as is, its length is the bound and the
	 * whitespace-separated pattern words are stored as its fragments.
	 */
	private void parseNoWildCards() {
		this.wholePatternWord = new Word(fPattern, fLength, patternWords);
	}

	/**
	 * Searches for the complete pattern, taken literally, in the given text.
	 *
	 * @param text  the text to search in
	 * @param start the starting index in the text for search, inclusive
	 * @param end   the stopping point of search, exclusive
	 * @return the starting index in the text of the pattern , or -1 if not found
	 */
	protected int posIn(String text, int start, int end) {// no wild card in pattern
		int max = end - fLength;

		if (!fIgnoreCase) {
			int i = text.indexOf(fPattern, start);
			if (i == -1 || i > max) {
				return -1;
			}
			return i;
		}

		for (int i = start; i <= max; ++i) {
			if (text.regionMatches(true, i, fPattern, 0, fLength)) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * Searches for the first position at which <code>p</code> matches the text.
	 *
	 * @param text  the text to search in
	 * @param start the starting index in the text for search, inclusive
	 * @param end   the stopping point of search, exclusive
	 * @param p     a parsed pattern segment that may contain the single-character
	 *              wildcard {@link #fSingleWildCard}
	 * @return the starting index in the text of the pattern , or -1 if not found
	 */
	protected int regExpPosIn(String text, int start, int end, String p) {
		int plen = p.length();

		int max = end - plen;
		for (int i = start; i <= max; ++i) {
			if (regExpRegionMatches(text, i, p, 0, plen)) {
				return i;
			}
		}
		return -1;
	}

	/**
	 * Compares <code>plen</code> characters of the text with the same number of
	 * characters of <code>p</code>. Unless wildcards are ignored, the
	 * single-character wildcard {@link #fSingleWildCard} in <code>p</code> matches
	 * any text character. Case is ignored if the matcher was created that way.
	 *
	 * @return boolean whether all compared characters match
	 * @param text   a String to match
	 * @param tStart int that indicates the starting index in the text, inclusive
	 * @param p      String, a parsed pattern segment that may contain the
	 *               single-character wildcard
	 * @param pStart int that indicates the starting index in <code>p</code>,
	 *               inclusive
	 * @param plen   the number of characters to compare
	 */
	protected boolean regExpRegionMatches(String text, int tStart, String p, int pStart, int plen) {
		while (plen-- > 0) {
			char tchar = text.charAt(tStart++);
			char pchar = p.charAt(pStart++);

			/* skip single wild cards */
			if (!fIgnoreWildCards && pchar == fSingleWildCard) {
				continue;
			}
			if (pchar == tchar) {
				continue;
			}
			if (fIgnoreCase) {
				if (Character.toUpperCase(tchar) == Character.toUpperCase(pchar)) {
					continue;
				}
				// comparing after converting to upper case doesn't handle all cases;
				// also compare after converting to lower case
				if (Character.toLowerCase(tchar) == Character.toLowerCase(pchar)) {
					continue;
				}
			}
			return false;
		}
		return true;
	}

	/**
	 * Searches for the first occurrence of <code>p</code> that lies completely
	 * inside the given range of the text.
	 *
	 * @param text  the string to match
	 * @param start the starting index in the text for search, inclusive
	 * @param end   the stopping point of search, exclusive
	 * @param p     a pattern string that has no wildcard
	 * @return the starting index in the text of the pattern , or -1 if not found
	 */
	protected int textPosIn(String text, int start, int end, String p) {

		int plen = p.length();
		int max = end - plen;

		if (!fIgnoreCase) {
			int i = text.indexOf(p, start);
			if (i == -1 || i > max) {
				return -1;
			}
			return i;
		}

		for (int i = start; i <= max; ++i) {
			if (text.regionMatches(true, i, p, 0, plen)) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * Take the given filter text and break it down into words by splitting it at
	 * every run of non-word characters (Unicode-aware "\W+").
	 *
	 * @param text the text to split
	 * @return an array of words
	 */
	public static String[] getWords(String text) {

		return NON_WORD.split(text, 0);
	}

}