activation/registries/MailcapTokenizer.java

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1997-2017 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * https://oss.oracle.com/licenses/CDDL+GPL-1.1
 * or LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package	com.sun.activation.registries;

/**
 * A tokenizer for strings in the form of "foo/bar; prop1=val1; ... ".
 * Useful for parsing MIME content types.
 *
 * <p>Call {@link #nextToken()} repeatedly until it returns
 * {@link #EOI_TOKEN}; after each call {@link #getCurrentToken()} and
 * {@link #getCurrentTokenValue()} describe the token just read.
 * Instances hold mutable scanning state and perform no synchronization.
 */
public class MailcapTokenizer {

    public static final int UNKNOWN_TOKEN = 0;
    public static final int START_TOKEN = 1;
    public static final int STRING_TOKEN = 2;
    public static final int EOI_TOKEN = 5;
    public static final int SLASH_TOKEN = '/';
    public static final int SEMICOLON_TOKEN = ';';
    public static final int EQUALS_TOKEN = '=';

    /** Characters that can never be part of a (non-autoquoted) string token. */
    private static final String SPECIAL_CHARS = "()<>@,;:\\\"/[]?=";

    private final String  data;
    private int           dataIndex;
    private final int     dataLength;
    private int           currentToken;
    private String        currentTokenValue;
    private boolean       isAutoquoting;
    private final char    autoquoteChar;

    /**
     * Creates a tokenizer positioned at the start of the given string.
     * The current token is {@link #START_TOKEN} with an empty value until
     * {@link #nextToken()} is called for the first time.
     *
     * @param  inputString the string to tokenize; must not be null
     */
    public MailcapTokenizer(String inputString) {
        data = inputString;
        dataIndex = 0;
        dataLength = inputString.length();

        currentToken = START_TOKEN;
        currentTokenValue = "";

        isAutoquoting = false;
        autoquoteChar = ';';
    }

    /**
     * Set whether auto-quoting is on or off.
     *
     * <p>With auto-quoting on, a token that does not start with ';' or '='
     * consists of all characters from the first non-whitespace character up
     * to (but not including) the first unescaped auto-quote terminator
     * character (';') or the end of input. A backslash escapes the
     * character that follows it, so "\;" does not terminate the token;
     * escape sequences are resolved in the returned token value.
     * Whitespace immediately preceding the terminator is kept in the value.
     *
     * <p>This is required for handling command strings in a mailcap entry.
     *
     * @param  value true to turn auto-quoting on, false to turn it off
     */
    public void setIsAutoquoting(boolean value) {
        isAutoquoting = value;
    }

    /**
     * Retrieve current token.
     *
     * @return    the type of the current token (one of the *_TOKEN constants)
     */
    public int getCurrentToken() {
        return currentToken;
    }

    /**
     * Get a String that describes the given token.
     *
     * @param  token a token type
     * @return    a short name for the token type, or "really unknown" if
     *            the value is not one of the *_TOKEN constants
     */
    public static String nameForToken(int token) {
        switch (token) {
            case UNKNOWN_TOKEN:
                return "unknown";
            case START_TOKEN:
                return "start";
            case STRING_TOKEN:
                return "string";
            case EOI_TOKEN:
                return "EOI";
            case SLASH_TOKEN:
                return "'/'";
            case SEMICOLON_TOKEN:
                return "';'";
            case EQUALS_TOKEN:
                return "'='";
            default:
                return "really unknown";
        }
    }

    /**
     * Retrieve current token value.
     *
     * @return    a String containing the current token value; the empty
     *            string before the first call to {@link #nextToken()},
     *            and null once the end of input has been reached
     */
    public String getCurrentTokenValue() {
        return currentTokenValue;
    }

    /**
     * Process the next token.
     *
     * <p>Leading whitespace is skipped. When the input is exhausted the
     * token is {@link #EOI_TOKEN} and the token value is null.
     *
     * @return    the type of the token that was read
     */
    public int nextToken() {
        //  skip white space
        while ((dataIndex < dataLength) &&
                isWhiteSpaceChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        if (dataIndex >= dataLength) {
            currentToken = EOI_TOKEN;
            currentTokenValue = null;
            return currentToken;
        }

        //  examine the current character and see what kind of token we have
        char c = data.charAt(dataIndex);
        if (isAutoquoting) {
            if (c == ';' || c == '=') {
                processSingleCharToken(c, c);
            } else {
                processAutoquoteToken();
            }
        } else if (isStringTokenChar(c)) {
            processStringToken();
        } else if ((c == '/') || (c == ';') || (c == '=')) {
            //  the token type of a separator is its character code
            processSingleCharToken(c, c);
        } else {
            processSingleCharToken(UNKNOWN_TOKEN, c);
        }

        return currentToken;
    }

    /** Records a one-character token of the given type and consumes it. */
    private void processSingleCharToken(int token, char c) {
        currentToken = token;
        currentTokenValue = String.valueOf(c);
        ++dataIndex;
    }

    /** Consumes the longest run of string-token characters as a STRING_TOKEN. */
    private void processStringToken() {
        //  capture the initial index
        int initialIndex = dataIndex;

        //  skip to 1st non string token character
        while ((dataIndex < dataLength) &&
                isStringTokenChar(data.charAt(dataIndex))) {
            ++dataIndex;
        }

        currentToken = STRING_TOKEN;
        currentTokenValue = data.substring(initialIndex, dataIndex);
    }

    /**
     * Consumes everything up to the first unescaped auto-quote terminator
     * (or the end of input) as a STRING_TOKEN, resolving escape sequences
     * in the resulting value. The terminator itself is not consumed.
     */
    private void processAutoquoteToken() {
        //  capture the initial index
        int initialIndex = dataIndex;

        //  now skip to the 1st non-escaped autoquote termination character
        while (dataIndex < dataLength) {
            char c = data.charAt(dataIndex);
            if (c == autoquoteChar) {
                break;
            }
            if (c == '\\' && dataIndex + 1 < dataLength) {
                //  a backslash quotes the next character, so step over both;
                //  this keeps an escaped terminator inside the token
                dataIndex += 2;
            } else {
                ++dataIndex;
            }
        }

        currentToken = STRING_TOKEN;
        currentTokenValue =
            fixEscapeSequences(data.substring(initialIndex, dataIndex));
    }

    private static boolean isSpecialChar(char c) {
        return SPECIAL_CHARS.indexOf(c) >= 0;
    }

    private static boolean isControlChar(char c) {
        return Character.isISOControl(c);
    }

    private static boolean isWhiteSpaceChar(char c) {
        return Character.isWhitespace(c);
    }

    private static boolean isStringTokenChar(char c) {
        return !isSpecialChar(c) && !isControlChar(c) && !isWhiteSpaceChar(c);
    }

    /**
     * Removes backslash escaping: each backslash is dropped and the
     * character following it is copied literally. A lone backslash at the
     * very end of the input is kept as-is.
     */
    private static String fixEscapeSequences(String inputString) {
        int inputLength = inputString.length();
        StringBuilder buffer = new StringBuilder(inputLength);

        for (int i = 0; i < inputLength; ++i) {
            char currentChar = inputString.charAt(i);
            if (currentChar == '\\' && i < inputLength - 1) {
                //  emit the escaped character and skip over it
                ++i;
                buffer.append(inputString.charAt(i));
            } else {
                buffer.append(currentChar);
            }
        }

        return buffer.toString();
    }
}