1%% =====================================================================
2%% Licensed under the Apache License, Version 2.0 (the "License"); you may
3%% not use this file except in compliance with the License. You may obtain
4%% a copy of the License at <http://www.apache.org/licenses/LICENSE-2.0>
5%%
6%% Unless required by applicable law or agreed to in writing, software
7%% distributed under the License is distributed on an "AS IS" BASIS,
8%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9%% See the License for the specific language governing permissions and
10%% limitations under the License.
11%%
12%% Alternatively, you may use this file under the terms of the GNU Lesser
13%% General Public License (the "LGPL") as published by the Free Software
14%% Foundation; either version 2.1, or (at your option) any later version.
15%% If you wish to allow use of your version of this file only under the
16%% terms of the LGPL, you should delete the provisions above and replace
17%% them with the notice and other provisions required by the LGPL; see
18%% <http://www.gnu.org/licenses/>. If you do not delete the provisions
19%% above, a recipient may use your version of this file under the terms of
20%% either the Apache License or the LGPL.
21%%
22%% =====================================================================
23%% @copyright 1997-2006 Richard Carlsson
24%% @author Richard Carlsson <carlsson.richard@gmail.com>
25%% @end
26%% =====================================================================
27
28%% @doc Functions for reading comment lines from Erlang source code.
29
30-module(erl_comment_scan).
31
32-export([file/1, join_lines/1, scan_lines/1, string/1]).
33
34-export_type([comment/0]).
35
36%% =====================================================================
37
%% A multi-line comment as returned by file/1, string/1 and join_lines/1:
%% the line and (one-based) column of its first `%' character, the
%% indentation in front of it, and the text of each consecutive comment
%% line in top-down order.
-type comment()     :: {Line:: integer(),
                        Column:: integer(),
                        Indentation :: integer(),
                        Text :: [string()]}.
%% A single comment line as produced by scan_lines/1 and consumed by
%% join_lines/1; same layout as comment(), except that `Text' is one
%% string rather than a list of strings.
-type commentLine() :: {Line :: integer(),
                        Column :: integer(),
                        Indent :: integer(),
                        Text :: string()}.
46
47%% =====================================================================
48%% @spec file(FileName::file:filename()) -> [Comment]
49%%
50%%	    Comment = {Line, Column, Indentation, Text}
51%%	    Line = integer()
52%%          Column = integer()
53%%          Indentation = integer()
54%%          Text = [string()]
55%%
56%% @doc Extracts comments from an Erlang source code file. Returns a
57%% list of entries representing <em>multi-line</em> comments, listed in
58%% order of increasing line-numbers. For each entry, `Text'
59%% is a list of strings representing the consecutive comment lines in
60%% top-down order; the strings contain <em>all</em> characters following
61%% (but not including) the first comment-introducing `%'
62%% character on the line, up to (but not including) the line-terminating
63%% newline.
64%%
65%% Furthermore, `Line' is the line number and
66%% `Column' the left column of the comment (i.e., the column
67%% of the comment-introducing `%' character).
%% `Indentation' is the indentation (or padding), measured in
69%% character positions between the last non-whitespace character before
70%% the comment (or the left margin), and the left column of the comment.
71%% `Line' and `Column' are always positive
72%% integers, and `Indentation' is a nonnegative integer.
73%%
74%% Evaluation exits with reason `{read, Reason}' if a read
75%% error occurred, where `Reason' is an atom corresponding to
76%% a Posix error code; see the module {@link //kernel/file} for details.
77
78-spec file(file:filename()) -> [comment()].
79
file(Name) ->
    Path = filename(Name),
    %% Only the read itself is wrapped by `catch'; the scanning done in
    %% the branches below runs unprotected.
    case catch {ok, file:read_file(Path)} of
        {ok, ReadResult} ->
            case ReadResult of
                {ok, Bin} ->
                    scan_binary(Bin, Path);
                {error, Posix} ->
                    error_read_file(Path),
                    exit({read, Posix})
            end;
        {'EXIT', Reason} ->
            error_read_file(Path),
            exit(Reason);
        Thrown ->
            error_read_file(Path),
            throw(Thrown)
    end.

%% Decodes the file contents and scans them for comments. The encoding
%% declared in the file is used when there is one, otherwise the default
%% source encoding reported by epp. When nothing was declared and
%% decoding fails, Latin-1 is tried before giving up.
scan_binary(Bin, Path) ->
    Declared = epp:read_encoding_from_binary(Bin),
    Enc = if
              Declared =:= none -> epp:default_encoding();
              true -> Declared
          end,
    case catch unicode:characters_to_list(Bin, Enc) of
        Chars when is_list(Chars) ->
            string(Chars);
        Failure when Declared =:= none ->
            retry_latin1(Bin, Path, Failure);
        Failure ->
            error_read_file(Path),
            exit(Failure)
    end.

%% Second decoding attempt, as Latin-1. If this fails as well, the
%% process exits with the result of the first attempt.
retry_latin1(Bin, Path, Failure) ->
    case catch unicode:characters_to_list(Bin, latin1) of
        Chars when is_list(Chars) ->
            string(Chars);
        _ ->
            error_read_file(Path),
            exit(Failure)
    end.
119
120
121%% =====================================================================
122%% @spec string(string()) -> [Comment]
123%%
124%%	    Comment = {Line, Column, Indentation, Text}
125%%	    Line = integer()
126%%          Column = integer()
127%%          Indentation = integer()
128%%          Text = [string()]
129%%
130%% @doc Extracts comments from a string containing Erlang source code.
131%% Except for reading directly from a string, the behaviour is the same
132%% as for {@link file/1}.
133%%
134%% @see file/1
135
136-spec string(string()) -> [comment()].
137
string(Text) ->
    %% scan_lines/1 and join_lines/1 both work in order of decreasing
    %% line numbers; reverse at the end to get increasing order.
    CommentLines = scan_lines(Text),
    Joined = join_lines(CommentLines),
    lists:reverse(Joined).
140
141
142%% =====================================================================
143%% @spec scan_lines(string()) -> [CommentLine]
144%%
145%%	    CommentLine = {Line, Column, Indent, Text}
146%%	    Line = integer()
147%%	    Column = integer()
148%%	    Indent = integer()
149%%	    Text = string()
150%%
151%% @doc Extracts individual comment lines from a source code string.
152%% Returns a list of comment lines found in the text, listed in order of
153%% <em>decreasing</em> line-numbers, i.e., the last comment line in the
154%% input is first in the resulting list. `Text' is a single
155%% string, containing all characters following (but not including) the
156%% first comment-introducing `%' character on the line, up
157%% to (but not including) the line-terminating newline. For details on
158%% `Line', `Column' and `Indent', see {@link file/1}.
159
160-spec scan_lines(string()) -> [commentLine()].
161
scan_lines(Text) ->
    %% Start on line 1, column 0, with no code seen on the line yet and
    %% an empty accumulator.
    scan_lines(Text, 1, 0, 0, []).

%% scan_lines(Chars, Line, Col, Mark, Acc): the main scanning loop.
%% `Col' is the zero-based column of the next character. `Mark' is the
%% column just after the last non-whitespace character seen so far on
%% the current line; it is later used to compute comment indentation.
%% `Acc' collects the comment lines, most recent first.
scan_lines([], _Line, _Col, _Mark, Acc) ->
    Acc;
scan_lines([$% | Rest], Line, Col, Mark, Acc) ->
    scan_comment(Rest, "", Line, Col, Mark, Acc);
scan_lines([$$ | Rest], Line, Col, _Mark, Acc) ->
    scan_char(Rest, Line, Col + 1, Acc);
scan_lines([$" | Rest], Line, Col, _Mark, Acc) ->
    scan_string(Rest, $", Line, Col + 1, Acc);
scan_lines([$' | Rest], Line, Col, _Mark, Acc) ->
    scan_string(Rest, $', Line, Col + 1, Acc);
%% Line breaks: CR LF must be tried before a lone CR.
scan_lines([$\r, $\n | Rest], Line, _Col, _Mark, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_lines([$\n | Rest], Line, _Col, _Mark, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
scan_lines([$\r | Rest], Line, _Col, _Mark, Acc) ->
    scan_lines(Rest, Line + 1, 0, 0, Acc);
%% Blanks and tabs advance the column but leave the mark alone.
scan_lines([$\s | Rest], Line, Col, Mark, Acc) ->
    scan_lines(Rest, Line, Col + 1, Mark, Acc);
scan_lines([$\t | Rest], Line, Col, Mark, Acc) ->
    scan_lines(Rest, Line, tab(Col), Mark, Acc);
%% Any other character is code: move the mark past it.
scan_lines([_Other | Rest], Line, Col, _Mark, Acc) ->
    Next = Col + 1,
    scan_lines(Rest, Line, Next, Next, Acc).
188
%% Returns the column of the next tab stop after `Col'; tab stops are
%% placed at every multiple of 8.
tab(Col) ->
    ((Col div 8) + 1) * 8.
191
%% Collects the text of a comment up to the end of the line (or the end
%% of the input) and hands it over to seen_comment/6. `Cs1' holds the
%% characters collected so far in reverse order, and the text passed on
%% is reversed as well.
scan_comment(Cs, Cs1, L, Col, M, Ack) ->
    IsText = fun(C) -> C =/= $\n andalso C =/= $\r end,
    {Body, Tail} = lists:splitwith(IsText, Cs),
    seen_comment(drop_line_end(Tail), lists:reverse(Body, Cs1),
                 L, Col, M, Ack).

%% Removes one line terminator (CR LF, LF or a lone CR) from the front
%% of the remaining input, if there is one.
drop_line_end([$\r, $\n | Rest]) -> Rest;
drop_line_end([$\n | Rest]) -> Rest;
drop_line_end([$\r | Rest]) -> Rest;
drop_line_end([]) -> [].
202
%% Pushes a finished comment line onto the accumulator and resumes
%% normal scanning at the start of the next line. Columns are counted
%% from 0 internally, whereas the column stored in a comment descriptor
%% counts from 1.

seen_comment(Rest, RevText, Line, Col, Mark, Acc) ->
    %% The text arrives reversed, so trimming its leading whitespace
    %% strips the trailing whitespace of the comment.
    Text = lists:reverse(string:trim(RevText, leading)),
    %% Indentation is the distance from the last code character on the
    %% line (or the left margin) to the comment.
    Entry = {Line, Col + 1, Col - Mark, Text},
    scan_lines(Rest, Line + 1, 0, 0, [Entry | Acc]).
214
%% Skips the rest of a quoted string or atom whose opening `Quote' has
%% already been consumed, keeping the line and column counters up to
%% date. Normal scanning resumes after the closing quote; if the input
%% ends inside the literal, the comments collected so far are returned.
%%
%% A backslash escapes the following character, so an escaped quote does
%% not end the literal. An escaped tab or line break still moves the
%% position like an unescaped one: the line counter must advance, or
%% every later comment would get a line number that is one too low.
scan_string([Quote | Cs], Quote, L, Col, Ack) ->
    N = Col + 1,
    scan_lines(Cs, L, N, N, Ack);
scan_string([$\t | Cs], Quote, L, Col, Ack) ->
    scan_string(Cs, Quote, L, tab(Col), Ack);
scan_string([$\n | Cs], Quote, L, _Col, Ack) ->
    %% Newlines should really not occur in strings/atoms, but we
    %% want to be well behaved even if the input is not.
    scan_string(Cs, Quote, L + 1, 0, Ack);
scan_string([$\r, $\n | Cs], Quote, L, _Col, Ack) ->
    scan_string(Cs, Quote, L + 1, 0, Ack);
scan_string([$\r | Cs], Quote, L, _Col, Ack) ->
    scan_string(Cs, Quote, L + 1, 0, Ack);
scan_string([$\\ | [C | _] = Cs], Quote, L, Col, Ack)
  when C =:= $\n; C =:= $\r; C =:= $\t ->
    %% Consume only the backslash; the clauses above then account for
    %% the tab or line break itself.
    scan_string(Cs, Quote, L, Col + 1, Ack);
scan_string([$\\, _C | Cs], Quote, L, Col, Ack) ->
    scan_string(Cs, Quote, L, Col + 2, Ack);  % ignore character C
scan_string([_C | Cs], Quote, L, Col, Ack) ->
    scan_string(Cs, Quote, L, Col + 1, Ack);
scan_string([], _Quote, _L, _Col, Ack) ->
    %% Finish quietly.
    Ack.
235
%% Skips the character that follows a `$' (already consumed), so that a
%% literal such as `$%' or `$"' is not mistaken for the start of a
%% comment or string, and then resumes normal scanning. The literal
%% counts as code, so the indentation mark is moved past it.
%%
%% A backslash escapes the following character. An escaped tab or line
%% break still moves the position like an unescaped one: the line
%% counter must advance, or every later comment would get a line number
%% that is one too low.
scan_char([$\t | Cs], L, Col, Ack) ->
    N = tab(Col),
    scan_lines(Cs, L, N, N, Ack);    % this is not just any whitespace
scan_char([$\n | Cs], L, _Col, Ack) ->
    scan_lines(Cs, L + 1, 0, 0, Ack);    % handle this, just in case
scan_char([$\r, $\n | Cs], L, _Col, Ack) ->
    scan_lines(Cs, L + 1, 0, 0, Ack);
scan_char([$\r | Cs], L, _Col, Ack) ->
    scan_lines(Cs, L + 1, 0, 0, Ack);
scan_char([$\\ | [C | _] = Cs], L, Col, Ack)
  when C =:= $\n; C =:= $\r; C =:= $\t ->
    %% Consume only the backslash; the clauses above then account for
    %% the tab or line break itself.
    scan_char(Cs, L, Col + 1, Ack);
scan_char([$\\, _C | Cs], L, Col, Ack) ->
    N = Col + 2,    % character C must be ignored
    scan_lines(Cs, L, N, N, Ack);
scan_char([_C | Cs], L, Col, Ack) ->
    N = Col + 1,    % character C must be ignored
    scan_lines(Cs, L, N, N, Ack);
scan_char([], _L, _Col, Ack) ->
    %% Finish quietly.
    Ack.
254
255
256%% =====================================================================
257%% @spec join_lines([CommentLine]) -> [Comment]
258%%
259%%	    CommentLine = {Line, Column, Indent, string()}
260%%	    Line = integer()
261%%	    Column = integer()
262%%	    Indent = integer()
263%%	    Comment = {Line, Column, Indent, Text}
264%%	    Text = [string()]
265%%
266%% @doc Joins individual comment lines into multi-line comments. The
267%% input is a list of entries representing individual comment lines,
268%% <em>in order of decreasing line-numbers</em>; see
269%% {@link scan_lines/1} for details. The result is a list of
270%% entries representing <em>multi-line</em> comments, <em>still listed
271%% in order of decreasing line-numbers</em>, but where for each entry,
272%% `Text' is a list of consecutive comment lines in order of
273%% <em>increasing</em> line-numbers (i.e., top-down).
274%%
275%% @see scan_lines/1
276
277-spec join_lines([commentLine()]) -> [comment()].
278
join_lines([]) ->
    [];
join_lines([{L, Col, Ind, Txt} | Lines]) ->
    %% The first entry (the one with the highest line number) opens the
    %% first comment.
    join_lines(Lines, [Txt], L, Col, Ind).

%% join_lines(Lines, Txt, L, Col, Ind): `Txt' is the non-empty list of
%% text lines gathered so far for the comment currently being built, and
%% `L', `Col' and `Ind' describe its topmost line seen so far. The input
%% is in order of decreasing line numbers, so each new entry lies above
%% the current comment.

join_lines([{PrevL, Col, PrevInd, PrevTxt} | Lines], Txt, L, Col, Ind)
  when PrevL =:= L - 1, Ind + 1 =:= Col ->
    %% The entry is on the line directly above, in the same column, and
    %% the current topmost line has nothing but the comment on it (its
    %% indentation reaches back to the left margin): extend the comment
    %% upwards. A comment that shares its line with code is therefore
    %% never joined with the line above it; this is not always what one
    %% wants, but works well in general.
    join_lines(Lines, [PrevTxt | Txt], PrevL, Col, PrevInd);
join_lines([{PrevL, PrevCol, PrevInd, PrevTxt} | Lines], Txt, L, Col, Ind) ->
    %% Otherwise the current comment is complete and the entry starts a
    %% new one.
    Finished = {L, Col, Ind, Txt},
    [Finished | join_lines(Lines, [PrevTxt], PrevL, PrevCol, PrevInd)];
join_lines([], Txt, L, Col, Ind) ->
    [{L, Col, Ind, Txt}].
302
303
304%% =====================================================================
305%% Utility functions for internal use
306
%% Checks that a file name is a proper list of positive integers and
%% returns it as a list. Anything else is reported through
%% report_error/2, after which the process exits with reason `error'.
filename(Name) ->
    check_filename(Name, []).

%% Walks the name one character at a time, collecting the accepted
%% characters in reverse. On failure, the part of the name that could
%% not be accepted is what gets reported.
check_filename([C | Rest], Seen) when is_integer(C), C > 0 ->
    check_filename(Rest, [C | Seen]);
check_filename([], Seen) ->
    lists:reverse(Seen);
check_filename(Bad, _Seen) ->
    report_error("bad filename: `~tP'.", [Bad, 25]),
    exit(error).
314
%% Reports that the file `Name' could not be read.
error_read_file(Name) ->
    Args = [Name],
    report_error("error reading file `~ts'.", Args).
317
%% Sends an error report to the error logger. The format string `S' is
%% prefixed with the module name and terminated with a newline; `Vs' are
%% the format arguments.
report_error(S, Vs) ->
    Format = lists:concat([?MODULE, ": ", S, "\n"]),
    error_logger:error_msg(Format, Vs).
320
321%% =====================================================================
322