/*
 * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package tidystats;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Generate statistics from the files generated by tidy.sh.
 *
 * <p>The tidy.sh script is used to run tidy on all the HTML files
 * in a directory, creating files in a new directory, and for each
 * HTML file, it writes the console output from tidy into a file
 * beside the fixed up file, with an additional .tidy extension.
 *
 * <p>This program will scan a directory for *.tidy files and
 * analyze the messages reported by tidy, in order to generate a
 * report with statistics on the various messages that were
 * reported by tidy.
 *
 * <p>Typical usage:
 * <pre>
 * $ bash /path/to/tidy.sh /path/to/htmldir
 * $ javac -d /path/to/classes /path/to/Main.java
 * $ java -cp /path/to/classes tidystats.Main /path/to/htmldir.tidy
 * </pre>
 *
 * <p>Internally, the program works by matching lines in the *.tidy
 * files against a series of regular expressions that are used to
 * categorize the messages.  The set of regular expressions was
 * empirically determined by running the program on the output from
 * running tidy.sh on all the generated JDK documentation.  It is
 * possible that tidy may generate more/different messages on other
 * doc sets, in which case, the set of regexes in the program should
 * be updated.
 */
public class Main {
    /**
     * Command-line entry point.
     *
     * @param args the files and/or directories to scan
     * @throws IOException if a file or directory cannot be read
     */
    public static void main(String... args) throws IOException {
        new Main().run(args);
    }

    /**
     * Scans every path named on the command line and prints the report
     * to standard output.
     *
     * @param args the files and/or directories to scan
     * @throws IllegalArgumentException if the first argument starts with
     *         {@code -}; no options are supported
     * @throws IOException if a file or directory cannot be read
     */
    void run(String... args) throws IOException {
        FileSystem fs = FileSystems.getDefault();
        List<Path> paths = new ArrayList<>();

        // Option parsing: there are no supported options, so any leading
        // argument that looks like one is rejected.
        int i;
        for (i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.startsWith("-")) {
                throw new IllegalArgumentException(arg);
            } else {
                break;
            }
        }

        for ( ; i < args.length; i++) {
            paths.add(fs.getPath(args[i]));
        }

        for (Path p : paths) {
            scan(p);
        }

        printReport();
    }

    /** Prints the overall totals followed by the per-message counts. */
    private void printReport() {
        print("%6d files read", files);
        print("%6d files had no errors or warnings", ok);
        print("%6d files reported \"Not all warnings/errors were shown.\"", overflow);
        print("%6d errors found", errs);
        print("%6d warnings found", warns);
        print("%6d recommendations to use CSS", css);
        print("");

        // Group the pattern texts by hit count, highest count first;
        // patterns with the same count are listed alphabetically.
        Map<Integer, Set<String>> sortedCounts = new TreeMap<>(
                new Comparator<Integer>() {
                    @Override
                    public int compare(Integer o1, Integer o2) {
                        return o2.compareTo(o1);
                    }
                });

        for (Map.Entry<Pattern, Integer> e : counts.entrySet()) {
            Integer n = e.getValue();
            Set<String> names = sortedCounts.get(n);
            if (names == null) {
                names = new TreeSet<>();
                sortedCounts.put(n, names);
            }
            names.add(e.getKey().toString());
        }

        for (Map.Entry<Integer, Set<String>> e : sortedCounts.entrySet()) {
            for (String name : e.getValue()) {
                // Drop the leading ".*" so the report shows just the message text.
                if (name.startsWith(".*")) {
                    name = name.substring(2);
                }
                print("%6d: %s", e.getKey(), name);
            }
        }
    }

    /**
     * Scans a path: directories are traversed recursively and regular
     * files whose name ends in {@code .tidy} are analyzed.  Anything else
     * is ignored.
     *
     * @param p the file or directory to scan
     * @throws IOException if a file or directory cannot be read
     */
    void scan(Path p) throws IOException {
        if (Files.isDirectory(p)) {
            // A DirectoryStream holds an open directory handle and must be
            // closed, otherwise deep trees leak one handle per directory.
            try (DirectoryStream<Path> children = Files.newDirectoryStream(p)) {
                for (Path c : children) {
                    scan(c);
                }
            }
        } else if (isTidyFile(p)) {
            scan(Files.readAllLines(p, Charset.defaultCharset()));
        }
    }

    /**
     * Returns whether a path is a regular file whose name ends in
     * {@code .tidy}.
     *
     * @param p the path to check
     * @return {@code true} if the path should be analyzed
     */
    boolean isTidyFile(Path p) {
        return Files.isRegularFile(p) && p.getFileName().toString().endsWith(".tidy");
    }

    /**
     * Analyzes the lines of one file and updates the running totals.
     * Lines that look like a tidy diagnostic but match none of the known
     * message patterns are reported on standard error.
     *
     * @param lines the lines of the file
     */
    void scan(List<String> lines) {
        files++;
        for (String line : lines) {
            Matcher m;
            if (okPattern.matcher(line).matches()) {
                ok++;
            } else if ((m = countPattern.matcher(line)).matches()) {
                addTotals(m);
            } else if ((m = countPattern2.matcher(line)).matches()) {
                addTotals(m);
            } else if (guardPattern.matcher(line).matches()) {
                if (!countMessage(line)) {
                    System.err.println("Unrecognized line: " + line);
                }
            } else if (cssPattern.matcher(line).matches()) {
                css++;
            }
        }
    }

    /**
     * Adds the totals from a matched summary line.  Group 1 is the warning
     * count, group 2 the error count, and group 3 is present only when the
     * line says that not all warnings/errors were shown.
     */
    private void addTotals(Matcher m) {
        warns += Integer.parseInt(m.group(1));
        errs += Integer.parseInt(m.group(2));
        if (m.group(3) != null) {
            overflow++;
        }
    }

    /**
     * Counts a diagnostic line against the first message pattern it matches.
     *
     * @return {@code false} if no pattern matches the line
     */
    private boolean countMessage(String line) {
        for (Pattern p : patterns) {
            if (p.matcher(line).matches()) {
                count(p);
                return true;
            }
        }
        return false;
    }

    /** The number of lines matched by each message pattern. */
    Map<Pattern, Integer> counts = new HashMap<>();

    /**
     * Increments the count for a message pattern.
     *
     * @param p the pattern that matched
     */
    void count(Pattern p) {
        Integer i = counts.get(p);
        counts.put(p, (i == null) ? 1 : i + 1);
    }

    /**
     * Prints a formatted line on standard output.
     *
     * @param format the format string, as for {@link String#format}
     * @param args the format arguments
     */
    void print(String format, Object... args) {
        System.out.println(String.format(format, args));
    }

    Pattern okPattern = Pattern.compile("No warnings or errors were found.");
    Pattern countPattern = Pattern.compile("([0-9]+) warnings, ([0-9]+) errors were found!.*?(Not all warnings/errors were shown.)?");
    Pattern countPattern2 = Pattern.compile("Tidy found ([0-9]+) warning[s]? and ([0-9]+) error[s]?!.*?(Not all warnings/errors were shown.)?");
    Pattern cssPattern = Pattern.compile("You are recommended to use CSS.*");
    Pattern guardPattern = Pattern.compile("line [0-9]+ column [0-9]+ - (Error|Warning):.*");

    /*
     * The known messages.  A line is counted against the FIRST pattern it
     * matches, so a pattern must be listed before any more general pattern
     * that would also match the same lines (for example, "</.*>" before
     * "<.*>").
     */
    Pattern[] patterns = {
        Pattern.compile(".*Error: <.*> is not recognized!"),
        Pattern.compile(".*Error: missing quote mark for attribute value"),
        Pattern.compile(".*Warning: <.*> anchor \".*\" already defined"),
        Pattern.compile(".*Warning: <.*> attribute \".*\" has invalid value \".*\""),
        Pattern.compile(".*Warning: <.*> attribute \".*\" lacks value"),
        Pattern.compile(".*Warning: <.*> attribute with missing trailing quote mark"),
        Pattern.compile(".*Warning: <.*> dropping value \".*\" for repeated attribute \".*\""),
        Pattern.compile(".*Warning: <.*> inserting \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> is probably intended as </.*>"),
        Pattern.compile(".*Warning: <.*> isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: <.*> lacks \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> missing '>' for end of tag"),
        // Must precede the generic "proprietary attribute" pattern.
        Pattern.compile(".*Warning: <blockquote> proprietary attribute \"pre\""),
        Pattern.compile(".*Warning: <.*> proprietary attribute \".*\""),
        Pattern.compile(".*Warning: <.*> unexpected or duplicate quote mark"),
        Pattern.compile(".*Warning: <a> cannot copy name attribute to id"),
        Pattern.compile(".*Warning: <a> escaping malformed URI reference"),
        // End tags must precede the generic "discarding unexpected" pattern.
        Pattern.compile(".*Warning: discarding unexpected </.*>"),
        Pattern.compile(".*Warning: discarding unexpected <.*>"),
        Pattern.compile(".*Warning: entity \".*\" doesn't end in ';'"),
        Pattern.compile(".*Warning: inserting implicit <.*>"),
        Pattern.compile(".*Warning: inserting missing 'title' element"),
        Pattern.compile(".*Warning: missing <!DOCTYPE> declaration"),
        // Must precede the generic "missing <.*>" pattern.
        Pattern.compile(".*Warning: missing </.*> before <.*>"),
        Pattern.compile(".*Warning: missing <.*>"),
        Pattern.compile(".*Warning: nested emphasis <.*>"),
        Pattern.compile(".*Warning: plain text isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: replacing <p> by <br>"),
        Pattern.compile(".*Warning: replacing invalid numeric character reference .*"),
        Pattern.compile(".*Warning: replacing unexpected .* by </.*>"),
        Pattern.compile(".*Warning: trimming empty <.*>"),
        Pattern.compile(".*Warning: unescaped & or unknown entity \".*\""),
        // The optional suffix accepts the message whether it ends in
        // "&" or in "&amp;".
        Pattern.compile(".*Warning: unescaped & which should be written as &(amp;)?"),
        Pattern.compile(".*Warning: using <br> in place of <p>"),
        Pattern.compile(".*Warning: <.*> element removed from HTML5"),
        Pattern.compile(".*Warning: <.*> attribute \".*\" not allowed for HTML5"),
        Pattern.compile(".*Warning: The summary attribute on the <table> element is obsolete in HTML5"),
        Pattern.compile(".*Warning: replacing invalid UTF-8 bytes \\(char. code U\\+.*\\)")
    };

    /** Number of files analyzed. */
    int files;
    /** Number of lines reporting that no warnings or errors were found. */
    int ok;
    /** Total number of warnings, summed from the summary lines. */
    int warns;
    /** Total number of errors, summed from the summary lines. */
    int errs;
    /** Number of lines recommending the use of CSS. */
    int css;
    /** Number of summary lines saying that not all warnings/errors were shown. */
    int overflow;
}