1 /*
2  * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  */
23 
24 package tidystats;
25 
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
42 
/**
 * Generate statistics from the files generated by tidy.sh.
 *
 * <p>The tidy.sh script is used to run tidy on all the HTML files
 * in a directory, creating files in a new directory, and for each
 * HTML file, it writes the console output from tidy into a file
 * beside the fixed up file, with an additional .tidy extension.
 *
 * <p>This program will scan a directory for *.tidy files and
 * analyze the messages reported by tidy, in order to generate a
 * report with statistics on the various messages that were
 * reported by tidy.
 *
 * <p>Typical usage:
 * <pre>
 * $ bash /path/to/tidy.sh /path/to/htmldir
 * $ javac -d /path/to/classes /path/to/Main.java
 * $ java -cp /path/to/classes tidystats.Main /path/to/htmldir.tidy
 * </pre>
 *
 * <p>Internally, the program works by matching lines in the *.tidy
 * files against a series of regular expressions that are used to
 * categorize the messages.  The set of regular expressions was
 * empirically determined by running the program on the output from
 * running tidy.sh on all the generated JDK documentation. It is
 * possible that tidy may generate more/different messages on other
 * doc sets, in which case, the set of regexes in the program should
 * be updated.
 *
 * <p>A message line is attributed to the <em>first</em> regex in
 * {@link #patterns} that matches the whole line, so a regex that is a
 * special case of another one must be listed before the more general
 * one.
 */
public class Main {
    /**
     * Program entry point.
     *
     * @param args the directories and/or files to analyze
     * @throws IOException if a directory or file cannot be read
     */
    public static void main(String... args) throws IOException {
        new Main().run(args);
    }

    /**
     * Scans every path given on the command line and then prints the
     * accumulated statistics to standard output.
     *
     * @param args the directories and/or files to analyze
     * @throws IllegalArgumentException if the first argument starts with
     *         "-"; no options are currently recognized
     * @throws IOException if a directory or file cannot be read
     */
    void run(String... args) throws IOException {
        FileSystem fs = FileSystems.getDefault();
        List<Path> paths = new ArrayList<>();

        // Option-parsing loop: there are no supported options, so any
        // leading argument that looks like one is rejected.
        int i;
        for (i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.startsWith("-")) {
                throw new IllegalArgumentException(arg);
            } else {
                break;
            }
        }

        // Everything else is a path to scan.
        for ( ; i < args.length; i++) {
            paths.add(fs.getPath(args[i]));
        }

        for (Path p : paths) {
            scan(p);
        }

        report();
    }

    /**
     * Prints the overall totals, followed by the per-message counts in
     * descending order of count. Messages with the same count are listed
     * in the natural order of their regex strings.
     */
    private void report() {
        print("%6d files read", files);
        print("%6d files had no errors or warnings", ok);
        print("%6d files reported \"Not all warnings/errors were shown.\"", overflow);
        print("%6d errors found", errs);
        print("%6d warnings found", warns);
        print("%6d recommendations to use CSS", css);
        print("");

        // Invert counts into (count -> regexes), highest count first.
        Map<Integer, Set<String>> sortedCounts = new TreeMap<>(
                new Comparator<Integer>() {
                    @Override
                    public int compare(Integer o1, Integer o2) {
                        return o2.compareTo(o1);
                    }
                });

        for (Map.Entry<Pattern, Integer> e : counts.entrySet()) {
            Integer n = e.getValue();
            Set<String> set = sortedCounts.get(n);
            if (set == null) {
                set = new TreeSet<>();
                sortedCounts.put(n, set);
            }
            set.add(e.getKey().toString());
        }

        for (Map.Entry<Integer, Set<String>> e : sortedCounts.entrySet()) {
            for (String p : e.getValue()) {
                // Drop the leading ".*" that each message regex starts with.
                if (p.startsWith(".*")) {
                    p = p.substring(2);
                }
                print("%6d: %s", e.getKey(), p);
            }
        }
    }

    /**
     * Scans a path: directories are traversed recursively, *.tidy files
     * are read (using the platform default charset) and analyzed, and
     * anything else is ignored.
     *
     * @param p the directory or file to scan
     * @throws IOException if a directory or file cannot be read
     */
    void scan(Path p) throws IOException {
        if (Files.isDirectory(p)) {
            // A directory stream holds an open handle on the directory and
            // must be closed once the iteration is finished.
            try (DirectoryStream<Path> children = Files.newDirectoryStream(p)) {
                for (Path c : children) {
                    scan(c);
                }
            }
        } else if (isTidyFile(p)) {
            scan(Files.readAllLines(p, Charset.defaultCharset()));
        }
    }

    /**
     * Returns whether a path is a regular file whose name ends with ".tidy".
     *
     * @param p the path to check
     * @return true if the path is a regular file with a ".tidy" suffix
     */
    boolean isTidyFile(Path p) {
        return Files.isRegularFile(p) && p.getFileName().toString().endsWith(".tidy");
    }

    /**
     * Analyzes the lines of one *.tidy file, updating the totals and the
     * per-message counts. Each line is classified by the first of the
     * following that matches the whole line: the "no warnings or errors"
     * line, one of the two summary-count lines, a
     * "line N column N - Error/Warning" message, or the CSS recommendation.
     * Message lines that match none of the {@link #patterns} are reported
     * on standard error. All other lines are ignored.
     *
     * @param lines the content of the file
     */
    void scan(List<String> lines) {
        files++;
        for (String line : lines) {
            Matcher m;
            if (okPattern.matcher(line).matches()) {
                ok++;
            } else if ((m = countPattern.matcher(line)).matches()) {
                addTotals(m);
            } else if ((m = countPattern2.matcher(line)).matches()) {
                addTotals(m);
            } else if (guardPattern.matcher(line).matches()) {
                boolean found = false;
                for (Pattern p : patterns) {
                    if (p.matcher(line).matches()) {
                        found = true;
                        count(p);
                        break;
                    }
                }
                if (!found) {
                    System.err.println("Unrecognized line: " + line);
                }
            } else if (cssPattern.matcher(line).matches()) {
                css++;
            }
        }
    }

    /**
     * Adds the numbers captured from a summary-count line to the totals.
     * Group 1 is the warning count, group 2 the error count, and group 3
     * is non-null when the line also says that not all warnings/errors
     * were shown.
     *
     * @param m a successful match of countPattern or countPattern2
     */
    private void addTotals(Matcher m) {
        warns += Integer.parseInt(m.group(1));
        errs += Integer.parseInt(m.group(2));
        if (m.group(3) != null) {
            overflow++;
        }
    }

    /** The number of message lines attributed to each message regex. */
    final Map<Pattern, Integer> counts = new HashMap<>();

    /**
     * Increments the count for a message regex.
     *
     * @param p the regex that matched a message line
     */
    void count(Pattern p) {
        Integer i = counts.get(p);
        counts.put(p, (i == null) ? 1 : i + 1);
    }

    /**
     * Formats a message and writes it as a line on standard output.
     *
     * @param format the format string, as for {@link String#format}
     * @param args the arguments for the format string
     */
    void print(String format, Object... args) {
        System.out.println(String.format(format, args));
    }

    /** Matches the line reported when tidy found nothing to report. */
    final Pattern okPattern = Pattern.compile("No warnings or errors were found.");
    /** Matches a summary line of the form "N warnings, M errors were found!". */
    final Pattern countPattern = Pattern.compile("([0-9]+) warnings, ([0-9]+) errors were found!.*?(Not all warnings/errors were shown.)?");
    /** Matches a summary line of the form "Tidy found N warning(s) and M error(s)!". */
    final Pattern countPattern2 = Pattern.compile("Tidy found ([0-9]+) warning[s]? and ([0-9]+) error[s]?!.*?(Not all warnings/errors were shown.)?");
    /** Matches the recommendation to use CSS. */
    final Pattern cssPattern = Pattern.compile("You are recommended to use CSS.*");
    /** Matches any individual error or warning message line. */
    final Pattern guardPattern = Pattern.compile("line [0-9]+ column [0-9]+ - (Error|Warning):.*");

    /**
     * The regexes used to categorize individual message lines.
     * The first regex that matches a line wins, so where one regex is a
     * special case of another, the special case must come first.
     */
    final Pattern[] patterns = {
        Pattern.compile(".*Error: <.*> is not recognized!"),
        Pattern.compile(".*Error: missing quote mark for attribute value"),
        Pattern.compile(".*Warning: <.*> anchor \".*\" already defined"),
        Pattern.compile(".*Warning: <.*> attribute \".*\" has invalid value \".*\""),
        Pattern.compile(".*Warning: <.*> attribute \".*\" lacks value"),
        Pattern.compile(".*Warning: <.*> attribute with missing trailing quote mark"),
        Pattern.compile(".*Warning: <.*> dropping value \".*\" for repeated attribute \".*\""),
        Pattern.compile(".*Warning: <.*> inserting \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> is probably intended as </.*>"),
        Pattern.compile(".*Warning: <.*> isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: <.*> lacks \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> missing '>' for end of tag"),
        // Special case of the general "proprietary attribute" regex below.
        Pattern.compile(".*Warning: <blockquote> proprietary attribute \"pre\""),
        Pattern.compile(".*Warning: <.*> proprietary attribute \".*\""),
        Pattern.compile(".*Warning: <.*> unexpected or duplicate quote mark"),
        Pattern.compile(".*Warning: <a> cannot copy name attribute to id"),
        Pattern.compile(".*Warning: <a> escaping malformed URI reference"),
        // End tags must be tried first: "<.*>" also matches "</...>".
        Pattern.compile(".*Warning: discarding unexpected </.*>"),
        Pattern.compile(".*Warning: discarding unexpected <.*>"),
        Pattern.compile(".*Warning: entity \".*\" doesn't end in ';'"),
        Pattern.compile(".*Warning: inserting implicit <.*>"),
        Pattern.compile(".*Warning: inserting missing 'title' element"),
        Pattern.compile(".*Warning: missing <!DOCTYPE> declaration"),
        // Must be tried first: "missing <.*>" also matches "missing </x> before <y>".
        Pattern.compile(".*Warning: missing </.*> before <.*>"),
        Pattern.compile(".*Warning: missing <.*>"),
        Pattern.compile(".*Warning: nested emphasis <.*>"),
        Pattern.compile(".*Warning: plain text isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: replacing <p> by <br>"),
        Pattern.compile(".*Warning: replacing invalid numeric character reference .*"),
        Pattern.compile(".*Warning: replacing unexpected .* by </.*>"),
        Pattern.compile(".*Warning: trimming empty <.*>"),
        Pattern.compile(".*Warning: unescaped & or unknown entity \".*\""),
        Pattern.compile(".*Warning: unescaped & which should be written as &amp;"),
        Pattern.compile(".*Warning: using <br> in place of <p>"),
        Pattern.compile(".*Warning: <.*> element removed from HTML5"),
        Pattern.compile(".*Warning: <.*> attribute \".*\" not allowed for HTML5"),
        Pattern.compile(".*Warning: The summary attribute on the <table> element is obsolete in HTML5"),
        Pattern.compile(".*Warning: replacing invalid UTF-8 bytes \\(char. code U\\+.*\\)")
    };

    /** Number of line lists (files) scanned. */
    int files;
    /** Number of lines matching okPattern; reported as files with no errors or warnings. */
    int ok;
    /** Sum of the warning counts taken from the summary lines. */
    int warns;
    /** Sum of the error counts taken from the summary lines. */
    int errs;
    /** Number of lines matching cssPattern. */
    int css;
    /** Number of summary lines saying that not all warnings/errors were shown. */
    int overflow;
}
244 
245