javadoc/main/JavaScriptScanner.java

/*
 * Copyright (c) 2012,2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.tools.javadoc.main;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import com.sun.tools.javadoc.main.JavaScriptScanner.TagParser.Kind;

import static com.sun.tools.javac.util.LayoutCharacters.EOI;

/**
 * Parser to detect use of JavaScript in documentation comments.
 */
@Deprecated(since="9", forRemoval=true)
@SuppressWarnings("removal")
public class JavaScriptScanner {
    public static interface Reporter {
        void report();
    }

    static class ParseException extends Exception {
        private static final long serialVersionUID = 0;
        ParseException(String key) {
            super(key);
        }
    }

    private Reporter reporter;

    /** The input buffer, index of most recent character read,
     *  index of one past last character in buffer.
     */
    protected char[] buf;
    protected int bp;
    protected int buflen;

    /** The current character.
     */
    protected char ch;

    private boolean newline = true;

    Map<String, TagParser> tagParsers;
    Set<String> uriAttrs;

    public JavaScriptScanner() {
        initTagParsers();
        initURIAttrs();
    }

    public void parse(String comment, Reporter r) {
        reporter = r;
        String c = comment;
        buf = new char[c.length() + 1];
        c.getChars(0, c.length(), buf, 0);
        buf[buf.length - 1] = EOI;
        buflen = buf.length - 1;
        bp = -1;
        newline = true;
        nextChar();

        blockContent();
        blockTags();
    }

    private void checkHtmlTag(String tag) {
        if (tag.equalsIgnoreCase("script")) {
            reporter.report();
        }
    }

    private void checkHtmlAttr(String name, String value) {
        String n = name.toLowerCase(Locale.ENGLISH);
        // See https://www.w3.org/TR/html52/webappapis.html#events-event-handlers
        // An event handler has a name, which always starts with "on" and is followed by
        // the name of the event for which it is intended.
        if (n.startsWith("on")
                || uriAttrs.contains(n)
                    && value != null && value.toLowerCase(Locale.ENGLISH).trim().startsWith("javascript:")) {
            reporter.report();
        }
    }

    void nextChar() {
        ch = buf[bp < buflen ? ++bp : buflen];
        switch (ch) {
            case '\f': case '\n': case '\r':
                newline = true;
        }
    }

    /**
     * Read block content, consisting of text, html and inline tags.
     * Terminated by the end of input, or the beginning of the next block tag:
     * i.e. @ as the first non-whitespace character on a line.
     */
    @SuppressWarnings("fallthrough")
    protected void blockContent() {

        loop:
        while (bp < buflen) {
            switch (ch) {
                case '\n': case '\r': case '\f':
                    newline = true;
                    // fallthrough

                case ' ': case '\t':
                    nextChar();
                    break;

                case '&':
                    entity(null);
                    break;

                case '<':
                    html();
                    break;

                case '>':
                    newline = false;
                    nextChar();
                    break;

                case '{':
                    inlineTag(null);
                    break;

                case '@':
                    if (newline) {
                        break loop;
                    }
                    // fallthrough

                default:
                    newline = false;
                    nextChar();
            }
        }
    }

    /**
     * Read a series of block tags, including their content.
     * Standard tags parse their content appropriately.
     * Non-standard tags are represented by {@link UnknownBlockTag}.
     */
    protected void blockTags() {
        while (ch == '@')
            blockTag();
    }

    /**
     * Read a single block tag, including its content.
     * Standard tags parse their content appropriately.
     * Non-standard tags are represented by {@link UnknownBlockTag}.
     */
    protected void blockTag() {
        int p = bp;
        try {
            nextChar();
            if (isIdentifierStart(ch)) {
                String name = readTagName();
                TagParser tp = tagParsers.get(name);
                if (tp == null) {
                    blockContent();
                } else {
                    switch (tp.getKind()) {
                        case BLOCK:
                            tp.parse(p);
                            return;
                        case INLINE:
                            return;
                    }
                }
            }
            blockContent();
        } catch (ParseException e) {
            blockContent();
        }
    }

    protected void inlineTag(Void list) {
        newline = false;
        nextChar();
        if (ch == '@') {
            inlineTag();
        }
    }

    /**
     * Read a single inline tag, including its content.
     * Standard tags parse their content appropriately.
     * Non-standard tags are represented by {@link UnknownBlockTag}.
     * Malformed tags may be returned as {@link Erroneous}.
     */
    protected void inlineTag() {
        int p = bp - 1;
        try {
            nextChar();
            if (isIdentifierStart(ch)) {
                String name = readTagName();
                TagParser tp = tagParsers.get(name);

                if (tp == null) {
                    skipWhitespace();
                    inlineText(WhitespaceRetentionPolicy.REMOVE_ALL);
                    nextChar();
                } else {
                    skipWhitespace();
                    if (tp.getKind() == TagParser.Kind.INLINE) {
                        tp.parse(p);
                    } else { // handle block tags (ex: @see) in inline content
                        inlineText(WhitespaceRetentionPolicy.REMOVE_ALL); // skip content
                        nextChar();
                    }
                }
            }
        } catch (ParseException e) {
        }
    }

    private static enum WhitespaceRetentionPolicy {
        RETAIN_ALL,
        REMOVE_FIRST_SPACE,
        REMOVE_ALL
    }

    /**
     * Read plain text content of an inline tag.
     * Matching pairs of { } are skipped; the text is terminated by the first
     * unmatched }. It is an error if the beginning of the next tag is detected.
     */
    private void inlineText(WhitespaceRetentionPolicy whitespacePolicy) throws ParseException {
        switch (whitespacePolicy) {
            case REMOVE_ALL:
                skipWhitespace();
                break;
            case REMOVE_FIRST_SPACE:
                if (ch == ' ')
                    nextChar();
                break;
            case RETAIN_ALL:
            default:
                // do nothing
                break;

        }
        int pos = bp;
        int depth = 1;

        loop:
        while (bp < buflen) {
            switch (ch) {
                case '\n': case '\r': case '\f':
                    newline = true;
                    break;

                case ' ': case '\t':
                    break;

                case '{':
                    newline = false;
                    depth++;
                    break;

                case '}':
                    if (--depth == 0) {
                        return;
                    }
                    newline = false;
                    break;

                case '@':
                    if (newline)
                        break loop;
                    newline = false;
                    break;

                default:
                    newline = false;
                    break;
            }
            nextChar();
        }
        throw new ParseException("dc.unterminated.inline.tag");
    }

    /**
     * Read Java class name, possibly followed by member
     * Matching pairs of {@literal < >} are skipped. The text is terminated by the first
     * unmatched }. It is an error if the beginning of the next tag is detected.
     */
    // TODO: boolean allowMember should be enum FORBID, ALLOW, REQUIRE
    // TODO: improve quality of parse to forbid bad constructions.
    // TODO: update to use ReferenceParser
    @SuppressWarnings("fallthrough")
    protected void reference(boolean allowMember) throws ParseException {
        int pos = bp;
        int depth = 0;

        // scan to find the end of the signature, by looking for the first
        // whitespace not enclosed in () or <>, or the end of the tag
        loop:
        while (bp < buflen) {
            switch (ch) {
                case '\n': case '\r': case '\f':
                    newline = true;
                    // fallthrough

                case ' ': case '\t':
                    if (depth == 0)
                        break loop;
                    break;

                case '(':
                case '<':
                    newline = false;
                    depth++;
                    break;

                case ')':
                case '>':
                    newline = false;
                    --depth;
                    break;

                case '}':
                    if (bp == pos)
                        return;
                    newline = false;
                    break loop;

                case '@':
                    if (newline)
                        break loop;
                    // fallthrough

                default:
                    newline = false;

            }
            nextChar();
        }

        if (depth != 0)
            throw new ParseException("dc.unterminated.signature");
    }

    /**
     * Read Java identifier
     * Matching pairs of { } are skipped; the text is terminated by the first
     * unmatched }. It is an error if the beginning of the next tag is detected.
     */
    @SuppressWarnings("fallthrough")
    protected void identifier() throws ParseException {
        skipWhitespace();
        int pos = bp;

        if (isJavaIdentifierStart(ch)) {
            readJavaIdentifier();
            return;
        }

        throw new ParseException("dc.identifier.expected");
    }

    /**
     * Read a quoted string.
     * It is an error if the beginning of the next tag is detected.
     */
    @SuppressWarnings("fallthrough")
    protected void quotedString() {
        int pos = bp;
        nextChar();

        loop:
        while (bp < buflen) {
            switch (ch) {
                case '\n': case '\r': case '\f':
                    newline = true;
                    break;

                case ' ': case '\t':
                    break;

                case '"':
                    nextChar();
                    // trim trailing white-space?
                    return;

                case '@':
                    if (newline)
                        break loop;

            }
            nextChar();
        }
    }

    /**
     * Read a term ie. one word.
     * It is an error if the beginning of the next tag is detected.
     */
    @SuppressWarnings("fallthrough")
    protected void inlineWord() {
        int pos = bp;
        int depth = 0;
        loop:
        while (bp < buflen) {
            switch (ch) {
                case '\n':
                    newline = true;
                    // fallthrough

                case '\r': case '\f': case ' ': case '\t':
                    return;

                case '@':
                    if (newline)
                        break loop;

                case '{':
                    depth++;
                    break;

                case '}':
                    if (depth == 0 || --depth == 0)
                        return;
                    break;
            }
            newline = false;
            nextChar();
        }
    }

    /**
     * Read general text content of an inline tag, including HTML entities and elements.
     * Matching pairs of { } are skipped; the text is terminated by the first
     * unmatched }. It is an error if the beginning of the next tag is detected.
     */
    @SuppressWarnings("fallthrough")
    private void inlineContent() {

        skipWhitespace();
        int pos = bp;
        int depth = 1;

        loop:
        while (bp < buflen) {

            switch (ch) {
                case '\n': case '\r': case '\f':
                    newline = true;
                    // fall through

                case ' ': case '\t':
                    nextChar();
                    break;

                case '&':
                    entity(null);
                    break;

                case '<':
                    newline = false;
                    html();
                    break;

                case '{':
                    newline = false;
                    depth++;
                    nextChar();
                    break;

                case '}':
                    newline = false;
                    if (--depth == 0) {
                        nextChar();
                        return;
                    }
                    nextChar();
                    break;

                case '@':
                    if (newline)
                        break loop;
                    // fallthrough

                default:
                    nextChar();
                    break;
            }
        }

    }

    protected void entity(Void list) {
        newline = false;
        entity();
    }

    /**
     * Read an HTML entity.
     * {@literal &identifier; } or {@literal &#digits; } or {@literal &#xhex-digits; }
     */
    protected void entity() {
        nextChar();
        String name = null;
        if (ch == '#') {
            int namep = bp;
            nextChar();
            if (isDecimalDigit(ch)) {
                nextChar();
                while (isDecimalDigit(ch))
                    nextChar();
                name = new String(buf, namep, bp - namep);
            } else if (ch == 'x' || ch == 'X') {
                nextChar();
                if (isHexDigit(ch)) {
                    nextChar();
                    while (isHexDigit(ch))
                        nextChar();
                    name = new String(buf, namep, bp - namep);
                }
            }
        } else if (isIdentifierStart(ch)) {
            name = readIdentifier();
        }

        if (name != null) {
            if (ch != ';')
                return;
            nextChar();
        }
    }

    /**
     * Read the start or end of an HTML tag, or an HTML comment
     * {@literal <identifier attrs> } or {@literal </identifier> }
     */
    protected void html() {
        int p = bp;
        nextChar();
        if (isIdentifierStart(ch)) {
            String name = readIdentifier();
            checkHtmlTag(name);
            htmlAttrs();
            if (ch == '/') {
                nextChar();
            }
            if (ch == '>') {
                nextChar();
                return;
            }
        } else if (ch == '/') {
            nextChar();
            if (isIdentifierStart(ch)) {
                readIdentifier();
                skipWhitespace();
                if (ch == '>') {
                    nextChar();
                    return;
                }
            }
        } else if (ch == '!') {
            nextChar();
            if (ch == '-') {
                nextChar();
                if (ch == '-') {
                    nextChar();
                    while (bp < buflen) {
                        int dash = 0;
                        while (ch == '-') {
                            dash++;
                            nextChar();
                        }
                        // Strictly speaking, a comment should not contain "--"
                        // so dash > 2 is an error, dash == 2 implies ch == '>'
                        // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
                        // for more details.
                        if (dash >= 2 && ch == '>') {
                            nextChar();
                            return;
                        }

                        nextChar();
                    }
                }
            }
        }

        bp = p + 1;
        ch = buf[bp];
    }

    /**
     * Read a series of HTML attributes, terminated by {@literal > }.
     * Each attribute is of the form {@literal identifier[=value] }.
     * "value" may be unquoted, single-quoted, or double-quoted.
     */
    protected void htmlAttrs() {
        skipWhitespace();

        loop:
        while (isIdentifierStart(ch)) {
            int namePos = bp;
            String name = readAttributeName();
            skipWhitespace();
            StringBuilder value = new StringBuilder();
            if (ch == '=') {
                nextChar();
                skipWhitespace();
                if (ch == '\'' || ch == '"') {
                    char quote = ch;
                    nextChar();
                    while (bp < buflen && ch != quote) {
                        if (newline && ch == '@') {
                            // No point trying to read more.
                            // In fact, all attrs get discarded by the caller
                            // and superseded by a malformed.html node because
                            // the html tag itself is not terminated correctly.
                            break loop;
                        }
                        value.append(ch);
                        nextChar();
                    }
                    nextChar();
                } else {
                    while (bp < buflen && !isUnquotedAttrValueTerminator(ch)) {
                        value.append(ch);
                        nextChar();
                    }
                }
                skipWhitespace();
            }
            checkHtmlAttr(name, value.toString());
        }
    }

    protected void attrValueChar(Void list) {
        switch (ch) {
            case '&':
                entity(list);
                break;

            case '{':
                inlineTag(list);
                break;

            default:
                nextChar();
        }
    }

    protected boolean isIdentifierStart(char ch) {
        return Character.isUnicodeIdentifierStart(ch);
    }

    protected String readIdentifier() {
        int start = bp;
        nextChar();
        while (bp < buflen && Character.isUnicodeIdentifierPart(ch))
            nextChar();
        return new String(buf, start, bp - start);
    }

    protected String readAttributeName() {
        int start = bp;
        nextChar();
        while (bp < buflen && (Character.isUnicodeIdentifierPart(ch) || ch == '-'))
            nextChar();
        return new String(buf, start, bp - start);
    }

    protected String readTagName() {
        int start = bp;
        nextChar();
        while (bp < buflen
                && (Character.isUnicodeIdentifierPart(ch) || ch == '.'
                || ch == '-' || ch == ':')) {
            nextChar();
        }
        return new String(buf, start, bp - start);
    }

    protected boolean isJavaIdentifierStart(char ch) {
        return Character.isJavaIdentifierStart(ch);
    }

    protected String readJavaIdentifier() {
        int start = bp;
        nextChar();
        while (bp < buflen && Character.isJavaIdentifierPart(ch))
            nextChar();
        return new String(buf, start, bp - start);
    }

    protected boolean isDecimalDigit(char ch) {
        return ('0' <= ch && ch <= '9');
    }

    protected boolean isHexDigit(char ch) {
        return ('0' <= ch && ch <= '9')
                || ('a' <= ch && ch <= 'f')
                || ('A' <= ch && ch <= 'F');
    }

    protected boolean isUnquotedAttrValueTerminator(char ch) {
        switch (ch) {
            case '\f': case '\n': case '\r': case '\t':
            case ' ':
            case '"': case '\'': case '`':
            case '=': case '<': case '>':
                return true;
            default:
                return false;
        }
    }

    protected boolean isWhitespace(char ch) {
        return Character.isWhitespace(ch);
    }

    protected void skipWhitespace() {
        while (isWhitespace(ch)) {
            nextChar();
        }
    }

    /**
     * @param start position of first character of string
     * @param end position of character beyond last character to be included
     */
    String newString(int start, int end) {
        return new String(buf, start, end - start);
    }

    static abstract class TagParser {
        enum Kind { INLINE, BLOCK }

        final Kind kind;
        final String name;


        TagParser(Kind k, String tk) {
            kind = k;
            name = tk;
        }

        TagParser(Kind k, String tk, boolean retainWhiteSpace) {
            this(k, tk);
        }

        Kind getKind() {
            return kind;
        }

        String getName() {
            return name;
        }

        abstract void parse(int pos) throws ParseException;
    }

    /**
     * @see <a href="http://docs.oracle.com/javase/8/docs/technotes/tools/unix/javadoc.html#CHDJGIJB">Javadoc Tags</a>
     */
    @SuppressWarnings("deprecation")
    private void initTagParsers() {
        TagParser[] parsers = {
            // @author name-text
            new TagParser(Kind.BLOCK, "author") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // {@code text}
            new TagParser(Kind.INLINE, "code", true) {
                @Override
                public void parse(int pos) throws ParseException {
                    inlineText(WhitespaceRetentionPolicy.REMOVE_FIRST_SPACE);
                    nextChar();
                }
            },

            // @deprecated deprecated-text
            new TagParser(Kind.BLOCK, "deprecated") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // {@docRoot}
            new TagParser(Kind.INLINE, "docRoot") {
                @Override
                public void parse(int pos) throws ParseException {
                    if (ch == '}') {
                        nextChar();
                        return;
                    }
                    inlineText(WhitespaceRetentionPolicy.REMOVE_ALL); // skip unexpected content
                    nextChar();
                    throw new ParseException("dc.unexpected.content");
                }
            },

            // @exception class-name description
            new TagParser(Kind.BLOCK, "exception") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();
                    reference(false);
                    blockContent();
                }
            },

            // @hidden hidden-text
            new TagParser(Kind.BLOCK, "hidden") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // @index search-term options-description
            new TagParser(Kind.INLINE, "index") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();
                    if (ch == '}') {
                        throw new ParseException("dc.no.content");
                    }
                    if (ch == '"') quotedString(); else inlineWord();
                    skipWhitespace();
                    if (ch != '}') {
                        inlineContent();
                    } else {
                        nextChar();
                    }
                }
            },

            // {@inheritDoc}
            new TagParser(Kind.INLINE, "inheritDoc") {
                @Override
                public void parse(int pos) throws ParseException {
                    if (ch == '}') {
                        nextChar();
                        return;
                    }
                    inlineText(WhitespaceRetentionPolicy.REMOVE_ALL); // skip unexpected content
                    nextChar();
                    throw new ParseException("dc.unexpected.content");
                }
            },

            // {@link package.class#member label}
            new TagParser(Kind.INLINE, "link") {
                @Override
                public void parse(int pos) throws ParseException {
                    reference(true);
                    inlineContent();
                }
            },

            // {@linkplain package.class#member label}
            new TagParser(Kind.INLINE, "linkplain") {
                @Override
                public void parse(int pos) throws ParseException {
                    reference(true);
                    inlineContent();
                }
            },

            // {@literal text}
            new TagParser(Kind.INLINE, "literal", true) {
                @Override
                public void parse(int pos) throws ParseException {
                    inlineText(WhitespaceRetentionPolicy.REMOVE_FIRST_SPACE);
                    nextChar();
                }
            },

            // @param parameter-name description
            new TagParser(Kind.BLOCK, "param") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();

                    boolean typaram = false;
                    if (ch == '<') {
                        typaram = true;
                        nextChar();
                    }

                    identifier();

                    if (typaram) {
                        if (ch != '>')
                            throw new ParseException("dc.gt.expected");
                        nextChar();
                    }

                    skipWhitespace();
                    blockContent();
                }
            },

            // @return description
            new TagParser(Kind.BLOCK, "return") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // @see reference | quoted-string | HTML
            new TagParser(Kind.BLOCK, "see") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();
                    switch (ch) {
                        case '"':
                            quotedString();
                            skipWhitespace();
                            if (ch == '@'
                                    || ch == EOI && bp == buf.length - 1) {
                                return;
                            }
                            break;

                        case '<':
                            blockContent();
                            return;

                        case '@':
                            if (newline)
                                throw new ParseException("dc.no.content");
                            break;

                        case EOI:
                            if (bp == buf.length - 1)
                                throw new ParseException("dc.no.content");
                            break;

                        default:
                            if (isJavaIdentifierStart(ch) || ch == '#') {
                                reference(true);
                                blockContent();
                            }
                    }
                    throw new ParseException("dc.unexpected.content");
                }
            },

            // @serialData data-description
            new TagParser(Kind.BLOCK, "@serialData") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // @serialField field-name field-type description
            new TagParser(Kind.BLOCK, "serialField") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();
                    identifier();
                    skipWhitespace();
                    reference(false);
                    if (isWhitespace(ch)) {
                        skipWhitespace();
                        blockContent();
                    }
                }
            },

            // @serial field-description | include | exclude
            new TagParser(Kind.BLOCK, "serial") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // @since since-text
            new TagParser(Kind.BLOCK, "since") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },

            // @throws class-name description
            new TagParser(Kind.BLOCK, "throws") {
                @Override
                public void parse(int pos) throws ParseException {
                    skipWhitespace();
                    reference(false);
                    blockContent();
                }
            },

            // {@value package.class#field}
            new TagParser(Kind.INLINE, "value") {
                @Override
                public void parse(int pos) throws ParseException {
                    reference(true);
                    skipWhitespace();
                    if (ch == '}') {
                        nextChar();
                        return;
                    }
                    nextChar();
                    throw new ParseException("dc.unexpected.content");
                }
            },

            // @version version-text
            new TagParser(Kind.BLOCK, "version") {
                @Override
                public void parse(int pos) {
                    blockContent();
                }
            },
        };

        tagParsers = new HashMap<>();
        for (TagParser p: parsers)
            tagParsers.put(p.getName(), p);

    }

    private void initURIAttrs() {
        uriAttrs = new HashSet<>(Arrays.asList(
            // See https://www.w3.org/TR/html4/sgml/dtd.html
            //     https://www.w3.org/TR/html5/
            // These are all the attributes that take a %URI or a valid URL potentially surrounded
            // by spaces
            "action",  "cite",  "classid",  "codebase",  "data",
            "datasrc",  "for",  "href",  "longdesc",  "profile",
            "src",  "usemap"
        ));
    }

}