%% =====================================================================
%% Licensed under the Apache License, Version 2.0 (the "License"); you may
%% not use this file except in compliance with the License. You may obtain
%% a copy of the License at <http://www.apache.org/licenses/LICENSE-2.0>
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% Alternatively, you may use this file under the terms of the GNU Lesser
%% General Public License (the "LGPL") as published by the Free Software
%% Foundation; either version 2.1, or (at your option) any later version.
%% If you wish to allow use of your version of this file only under the
%% terms of the LGPL, you should delete the provisions above and replace
%% them with the notice and other provisions required by the LGPL; see
%% <http://www.gnu.org/licenses/>.  If you do not delete the provisions
%% above, a recipient may use your version of this file under the terms of
%% either the Apache License or the LGPL.
%%
%% =====================================================================
%% @copyright 1997-2006 Richard Carlsson
%% @author Richard Carlsson <carlsson.richard@gmail.com>
%% @end
%% =====================================================================

%% @doc Functions for reading comment lines from Erlang source code.

-module(erl_comment_scan).

-export([file/1, join_lines/1, scan_lines/1, string/1]).

-export_type([comment/0]).

%% =====================================================================

%% A comment entry: line number, column, indentation, and the text of
%% the comment as a list with one string per comment line.
-type comment() :: {Line:: integer(),
                    Column:: integer(),
                    Indentation :: integer(),
                    Text :: [string()]}.
%% A single comment line: line number, column, indentation, and the text
%% of that one line as a string.
-type commentLine() :: {Line :: integer(),
                        Column :: integer(),
                        Indent :: integer(),
                        Text :: string()}.

%% =====================================================================
%% @spec file(FileName::file:filename()) -> [Comment]
%%
%%     Comment = {Line, Column, Indentation, Text}
%%     Line = integer()
%%     Column = integer()
%%     Indentation = integer()
%%     Text = [string()]
%%
%% @doc Extracts comments from an Erlang source code file. Returns a
%% list of entries representing <em>multi-line</em> comments, listed in
%% order of increasing line-numbers. For each entry, `Text'
%% is a list of strings representing the consecutive comment lines in
%% top-down order; the strings contain <em>all</em> characters following
%% (but not including) the first comment-introducing `%'
%% character on the line, up to (but not including) the line-terminating
%% newline.
%%
%% Furthermore, `Line' is the line number and
%% `Column' the left column of the comment (i.e., the column
%% of the comment-introducing `%' character).
%% `Indentation' is the indentation (or padding), measured in
%% character positions between the last non-whitespace character before
%% the comment (or the left margin), and the left column of the comment.
%% `Line' and `Column' are always positive
%% integers, and `Indentation' is a nonnegative integer.
%%
%% Evaluation exits with reason `{read, Reason}' if a read
%% error occurred, where `Reason' is an atom corresponding to
%% a Posix error code; see the module {@link //kernel/file} for details.

-spec file(file:filename()) -> [comment()].
file(Name) ->
    %% Validate the file name, read the file, decode it to a character
    %% list and hand the text to string/1. Every failure path logs via
    %% error_read_file/1 before terminating.
    Name1 = filename(Name),
    %% Old-style `catch' is kept on purpose: it determines the exact
    %% exit/throw reasons that callers observe, which must not change.
    case catch {ok, file:read_file(Name1)} of
        {ok, V} ->
            string(read_result(V, Name1));
        {'EXIT', E} ->
            error_read_file(Name1),
            exit(E);
        R ->
            error_read_file(Name1),
            throw(R)
    end.

%% Turns the result of file:read_file/1 into a character list, or logs
%% the failure and exits with `{read, Reason}'.
read_result({ok, B}, Name) ->
    decode_source(B, Name);
read_result({error, E}, Name) ->
    error_read_file(Name),
    exit({read, E}).

%% Decodes the binary file contents. The encoding reported by
%% epp:read_encoding_from_binary/1 is used; when that is `none', the epp
%% default encoding is tried first and Latin-1 is used as a fallback.
decode_source(B, Name) ->
    Encoding = epp:read_encoding_from_binary(B),
    Enc = case Encoding of
              none -> epp:default_encoding();
              _ -> Encoding
          end,
    case catch unicode:characters_to_list(B, Enc) of
        String when is_list(String) ->
            String;
        R when Encoding =:= none ->
            decode_as_latin1(B, Name, R);
        R ->
            error_read_file(Name),
            exit(R)
    end.

%% Fallback decoding as Latin-1. If this fails as well, the process exits
%% with `Reason', i.e., the result of the first decoding attempt.
decode_as_latin1(B, Name, Reason) ->
    case catch unicode:characters_to_list(B, latin1) of
        String when is_list(String) ->
            String;
        _ ->
            error_read_file(Name),
            exit(Reason)
    end.


%% =====================================================================
%% @spec string(string()) -> [Comment]
%%
%%     Comment = {Line, Column, Indentation, Text}
%%     Line = integer()
%%     Column = integer()
%%     Indentation = integer()
%%     Text = [string()]
%%
%% @doc Extracts comments from a string containing Erlang source code.
%% Except for reading directly from a string, the behaviour is the same
%% as for {@link file/1}.
%%
%% @see file/1

-spec string(string()) -> [comment()].

string(Text) ->
    %% join_lines/1 yields the comments last-first; reverse them so that
    %% they come out in order of increasing line numbers.
    lists:reverse(join_lines(scan_lines(Text))).


%% =====================================================================
%% @spec scan_lines(string()) -> [CommentLine]
%%
%%     CommentLine = {Line, Column, Indent, Text}
%%     Line = integer()
%%     Column = integer()
%%     Indent = integer()
%%     Text = string()
%%
%% @doc Extracts individual comment lines from a source code string.
%% Returns a list of comment lines found in the text, listed in order of
%% <em>decreasing</em> line-numbers, i.e., the last comment line in the
%% input is first in the resulting list.
%% `Text' is a single
%% string, containing all characters following (but not including) the
%% first comment-introducing `%' character on the line, up
%% to (but not including) the line-terminating newline. For details on
%% `Line', `Column' and `Indent', see {@link file/1}.

-spec scan_lines(string()) -> [commentLine()].

scan_lines(Text) ->
    scan_lines(Text, 1, 0, 0, []).

%% Main scanning loop. `Line' is the current line number, `Col' the
%% zero-based column of the next character, `Mark' the column just past
%% the last non-whitespace character seen on the current line (0 if there
%% has been none), and `Acc' the comment lines found so far, latest first.
scan_lines([$\s | Rest], Line, Col, Mark, Acc) ->
    scan_lines(Rest, Line, Col + 1, Mark, Acc);
scan_lines([$\t | Rest], Line, Col, Mark, Acc) ->
    scan_lines(Rest, Line, tab(Col), Mark, Acc);
scan_lines([$\n | Rest], Line, _Col, _Mark, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_lines([$\r, $\n | Rest], Line, _Col, _Mark, Acc) ->
    %% CR LF counts as a single line break.
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_lines([$\r | Rest], Line, _Col, _Mark, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_lines([$% | Rest], Line, Col, Mark, Acc) ->
    %% Start of a comment: collect its text up to the end of the line.
    scan_comment(Rest, "", Line, Col, Mark, Acc);
scan_lines([$$ | Rest], Line, Col, _Mark, Acc) ->
    scan_char(Rest, Line, Col + 1, Acc);
scan_lines([$" | Rest], Line, Col, _Mark, Acc) ->
    scan_string(Rest, $", Line, Col + 1, Acc);
scan_lines([$' | Rest], Line, Col, _Mark, Acc) ->
    scan_string(Rest, $', Line, Col + 1, Acc);
scan_lines([_Other | Rest], Line, Col, _Mark, Acc) ->
    %% Any other character is non-whitespace: move the mark past it.
    Next = Col + 1,
    scan_lines(Rest, Line, Next, Next, Acc);
scan_lines([], _Line, _Col, _Mark, Acc) ->
    Acc.

%% Column of the next tab stop after `Col'; tab stops are 8 columns apart.
tab(Col) ->
    8 * (Col div 8) + 8.

%% Collects the characters of a comment, in reverse order, in `RevText'
%% until a line break or the end of the input is reached.
scan_comment([$\n | Rest], RevText, Line, Col, Mark, Acc) ->
    seen_comment(Rest, RevText, Line, Col, Mark, Acc);
scan_comment([$\r, $\n | Rest], RevText, Line, Col, Mark, Acc) ->
    seen_comment(Rest, RevText, Line, Col, Mark, Acc);
scan_comment([$\r | Rest], RevText, Line, Col, Mark, Acc) ->
    seen_comment(Rest, RevText, Line, Col, Mark, Acc);
scan_comment([Char | Rest], RevText, Line, Col, Mark, Acc) ->
    scan_comment(Rest, [Char | RevText], Line, Col, Mark, Acc);
scan_comment([], RevText, Line, Col, Mark, Acc) ->
    seen_comment([], RevText, Line, Col, Mark, Acc).

%% Add a comment line to the accumulator and return to normal
%% scanning.
%% Note that we compute column positions starting at 0
%% internally, but the column values in the comment descriptors
%% should start at 1.

seen_comment(Rest, RevText, Line, Col, Mark, Acc) ->
    %% The indentation is the distance from the last non-whitespace
    %% character before the comment (or the left margin) to the comment.
    Indent = Col - Mark,
    %% `RevText' is reversed, so trimming its leading whitespace removes
    %% the trailing whitespace of the comment text.
    Text = lists:reverse(string:trim(RevText, leading)),
    Entry = {Line, Col + 1, Indent, Text},
    scan_lines(Rest, Line + 1, 0, 0, [Entry | Acc]).

%% Skips the rest of a quoted string or quoted atom, whose closing
%% delimiter is `Quote', keeping line and column counts up to date, and
%% then resumes normal scanning.
scan_string([Quote | Rest], Quote, Line, Col, Acc) ->
    %% Closing quote: the mark moves to just past it.
    Next = Col + 1,
    scan_lines(Rest, Line, Next, Next, Acc);
scan_string([$\t | Rest], Quote, Line, Col, Acc) ->
    scan_string(Rest, Quote, Line, tab(Col), Acc);
scan_string([$\n | Rest], Quote, Line, _Col, Acc) ->
    %% Line breaks are not really expected inside strings/atoms, but
    %% malformed input is still handled gracefully.
    scan_string(Rest, Quote, Line + 1, 0, Acc);
scan_string([$\r, $\n | Rest], Quote, Line, _Col, Acc) ->
    scan_string(Rest, Quote, Line + 1, 0, Acc);
scan_string([$\r | Rest], Quote, Line, _Col, Acc) ->
    scan_string(Rest, Quote, Line + 1, 0, Acc);
scan_string([$\\, _Escaped | Rest], Quote, Line, Col, Acc) ->
    %% Skip the escaped character, so an escaped quote does not end
    %% the string.
    scan_string(Rest, Quote, Line, Col + 2, Acc);
scan_string([_Other | Rest], Quote, Line, Col, Acc) ->
    scan_string(Rest, Quote, Line, Col + 1, Acc);
scan_string([], _Quote, _Line, _Col, Acc) ->
    %% Unterminated string at end of input: stop without complaint.
    Acc.
%% Skips the character that follows a `$'. Unless it is a line break,
%% that character counts as non-whitespace, so normal scanning resumes
%% with the mark column equal to the new column.
scan_char([$\t | Rest], Line, Col, Acc) ->
    %% A tab right after `$' is a character literal, not plain whitespace.
    Next = tab(Col),
    scan_lines(Rest, Line, Next, Next, Acc);
scan_char([$\n | Rest], Line, _Col, Acc) ->
    %% Handled just in case: start a new line.
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_char([$\r, $\n | Rest], Line, _Col, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_char([$\r | Rest], Line, _Col, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_char([$\\, _Escaped | Rest], Line, Col, Acc) ->
    %% Backslash plus the escaped character: two columns, both skipped.
    Next = Col + 2,
    scan_lines(Rest, Line, Next, Next, Acc);
scan_char([_Char | Rest], Line, Col, Acc) ->
    %% The character itself is skipped, whatever it is.
    Next = Col + 1,
    scan_lines(Rest, Line, Next, Next, Acc);
scan_char([], _Line, _Col, Acc) ->
    %% `$' at the very end of the input: stop without complaint.
    Acc.


%% =====================================================================
%% @spec join_lines([CommentLine]) -> [Comment]
%%
%%     CommentLine = {Line, Column, Indent, string()}
%%     Line = integer()
%%     Column = integer()
%%     Indent = integer()
%%     Comment = {Line, Column, Indent, Text}
%%     Text = [string()]
%%
%% @doc Joins individual comment lines into multi-line comments. The
%% input is a list of entries representing individual comment lines,
%% <em>in order of decreasing line-numbers</em>; see
%% {@link scan_lines/1} for details. The result is a list of
%% entries representing <em>multi-line</em> comments, <em>still listed
%% in order of decreasing line-numbers</em>, but where for each entry,
%% `Text' is a list of consecutive comment lines in order of
%% <em>increasing</em> line-numbers (i.e., top-down).
%%
%% @see scan_lines/1

-spec join_lines([commentLine()]) -> [comment()].

join_lines([]) ->
    [];
join_lines([{Line, Col, Indent, Text} | Rest]) ->
    %% The first entry seeds the comment that is currently being built.
    join_lines(Rest, [Text], Line, Col, Indent).

%% In the following, we assume that the current `Txt' is never empty.
%% Recall that the list is in reverse line-number order.
%% `Txt', `L', `Col' and `Ind' describe the comment being built. The next
%% entry is merged into it when it sits on the line directly above, in
%% the same column, and the comment being built started alone on its
%% line (`Ind + 1 =:= Col'). Otherwise the lines are not joined; this is
%% not always what one wants, but works well in general.
join_lines([{PrevL, Col, PrevInd, PrevTxt} | Rest], Txt, L, Col, Ind)
  when PrevL =:= L - 1, Ind + 1 =:= Col ->
    join_lines(Rest, [PrevTxt | Txt], PrevL, Col, PrevInd);
join_lines([{PrevL, PrevCol, PrevInd, PrevTxt} | Rest], Txt, L, Col, Ind) ->
    %% Finish the current comment and let the new line start the
    %% next one.
    Done = {L, Col, Ind, Txt},
    [Done | join_lines(Rest, [PrevTxt], PrevL, PrevCol, PrevInd)];
join_lines([], Txt, L, Col, Ind) ->
    [{L, Col, Ind, Txt}].


%% =====================================================================
%% Utility functions for internal use

%% Checks that the file name is a proper list of positive integers and
%% returns a copy of it; anything else is logged as a bad file name and
%% the process exits with reason `error'.
filename([Char | Rest]) when is_integer(Char), Char > 0 ->
    [Char | filename(Rest)];
filename([]) ->
    [];
filename(Bad) ->
    report_error("bad filename: `~tP'.", [Bad, 25]),
    exit(error).

%% Logs that the file `Name' could not be read.
error_read_file(Name) ->
    report_error("error reading file `~ts'.", [Name]).

%% Sends an error message, prefixed with the module name, to the error
%% logger. `Format' is a format string and `Args' its arguments.
report_error(Format, Args) ->
    Message = lists:concat([?MODULE, ": ", Format, "\n"]),
    error_logger:error_msg(Message, Args).

%% =====================================================================