1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2009-2015. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19
20-module(demo_html_tagger).
21
22%% You will notice that this program has very few type declarations
23%% That's because this program uses some pretty dodgy techniques to
24%% get at the data it requires.
25
26%% I use epp_dodger to parse the file and the new imporved erl_scan
27%% find the exact values of the tokens
28
29%% epp_dodger returns an objects of type erl_syntax which are pretty
30%% nasty buggers. We could write the types out but it would hardly
31%% help.
32
33%% to test run
34
35%%-compile(export_all).
36
37
38-export([erl2htmltext/1, erl2htmlfile/1]).
39
40erl2htmltext(File) ->
41    try
42	erl2html0(File)
43    catch
44	What:Why ->
45	    io:format("error in:~s ~p ~p~n",[File,What,Why])
46    end.
47
48erl2htmlfile(File) ->
49    try
50	Text = erl2html0(File),
51	Root = filename:basename(filename:rootname(File)),
52	Out = "./html/" ++ Root ++ ".html",
53	file:write_file(Out, [Text])
54    catch
55	What:Why ->
56	    io:format("error in:~s ~p ~p~n",[File,What,Why])
57    end.
58
59
60splitErlang(File) ->
61    {ok, Forms} = dodge_file(File),
62    {Anchors, Patches} = analyse(Forms),
63    Raw = read_raw_forms(File),
64    Raw1 = merge_anchors(Anchors, Raw),
65    Raw2 = merge_forms(Raw1, Patches, []),
66    Rtf = [final(I) || I <- Raw2],
67    {taggedBlocks, Rtf}.
68
69erl2html0(File) ->
70    Tb = splitErlang(File),
71    Html = to_html(Tb),
72    prelude(Html).
73
74merge_forms([{Tag,L1}|T], Patches, L) ->
75    {L2, Patches1} = apply_patches(L1, Patches),
76    merge_forms(T, Patches1, [{Tag,L2}|L]);
77merge_forms([], _, L) ->
78    lists:reverse(L).
79
80apply_patches(Toks, [])      ->
81    %% we've run out of patches but we must still simplify
82    %% every term
83    {[simplify(I) || I <- Toks], []};
84apply_patches(Toks, Patches) ->
85    apply_patches(Toks, Patches, []).
86
87apply_patches([{atom,Ln,Val}=A|T], Patches, L) ->
88    case do_patch(Ln, Patches)  of
89	{yes, New, Patches1} ->
90	    New1 = reformat(New, Val),
91	    apply_patches(T, Patches1, [New1|L]);
92	{no, Patches1} ->
93	    apply_patches(T, Patches1, [simplify(A)|L])
94    end;
95apply_patches([H|T], Patches, L) ->
96    apply_patches(T, Patches, [simplify(H)|L]);
97apply_patches([], Patches, L) ->
98    {lists:reverse(L), Patches}.
99
100
101simplify({atom,_,Str}) ->
102    case (catch list_to_existing_atom(Str)) of
103	{'EXIT', _} ->
104	    {atom, Str};
105	A ->
106	    case is_keyword(A) of
107		true -> {keyword, Str};
108		false ->
109		    {atom, Str}
110	    end
111    end;
112simplify({dot,_,Str}) ->
113    {terminal, Str};
114simplify({Tag,_,Str}) ->
115    case is_keyword(Tag) of
116	true ->
117	    {keyword, Str};
118	false ->
119	    case is_terminal(Tag) of
120		true ->
121		    {terminal, Str};
122		false ->
123		    {Tag, Str}
124	    end
125    end;
126simplify(X) ->
127    io:format("simplify wtfit:~p~n",[X]),
128    X.
129
130do_patch(Ln, [{Ln,Tag}|P])                  -> {yes, Tag, P};
131do_patch(Ln, [{Ln1,_}|_] = P) when Ln1 > Ln -> {no, P};
132do_patch(Ln, [_|T])                         -> do_patch(Ln, T);
133do_patch(_, [])                             -> {no, []}.
134
135reformat({local,{F,A}}, Str) -> {local,F,A,Str};
136reformat({remote,M,F,A}, Str) -> {remote,M,F,A,Str};
137reformat({remote,{M,F,A}}, Str) -> {remote,M,F,A,Str};
138reformat({bif,{F,A}}, Str) -> {bif,F,A,Str};
139reformat(Tag, Str) ->
140    io:format("reformat*:~p ~p~n",[Tag,Str]),
141    {Tag,Str}.
142
143to_html({taggedBlocks, L}) ->
144    [[anchor1(Anchor),to_html(Body)] || {Anchor,Body} <- L];
145to_html({taggedToks, L}) ->
146    [to_html1(I) || I <- L].
147
148anchor1({func, F, A}) ->
149    ["<a name='",linkname(F,A),"'></a>"];
150anchor1({specification, F, A}) ->
151    ["<a name='",linkname(F,A),"'></a>"];
152anchor1(_X) ->
153    "".
154
155linkname(F, A) when is_atom(F) ->
156    a2s(F) ++ "-" ++ integer_to_list(A);
157linkname(F, A) when is_list(F) ->
158    F ++ "-" ++ integer_to_list(A).
159
160a2s(A) ->
161    atom_to_list(A).
162
163font(C, S) ->
164    ["<font color=\"", C, "\">", htmlquote(S), "</font>"].
165
166htmlquote("<" ++ T) -> ["&lt;"|htmlquote(T)];
167htmlquote([H|T]) -> [H|htmlquote(T)];
168htmlquote([]) -> [].
169
170to_html1({white_space,V}) -> V;
171to_html1({comment, V})    -> font("#B22222", V);
172to_html1({var,V})         -> font("orange", V);
173to_html1({string,V})      -> font("#FA8072", V);
174to_html1({integer,V})     -> font("#1111AA", V);
175to_html1({bif,_F,_A,Str}) -> font("#FF00FF", Str);
176to_html1({keyword, V})    -> font("#FF00FF", V);
177to_html1({atom, V})       -> V;
178to_html1({terminal,V})    -> V;
179to_html1({char,V})        -> V;
180to_html1({float,V})       -> V;
181to_html1({anchor,F,A}) ->
182    ["<a name='",linkname(F,A),"'></a>"];
183to_html1({local,F,A,Str}) ->
184    ["<a href='#",linkname(F,A),"'>",
185     htmlquote(Str),"</a>"];
186to_html1({remote,_M,_F,_A,Str}) ->
187    %%["<a href='",htmlname(M), "#",linkname(F,A),"'>",htmlquote(Str),"</a>"],
188    Str.
189
190%% merge the anchors
191%% there should be one block per anchor
192%% we check the containing form (for safety)
193
194%% merge_anchors([{_,{file,_}}|A], B) ->
195%%     merge_anchors(A, B);
196merge_anchors([{Tag,Val}=H|A], [B|T])  ->
197    case contains(Tag, B) of
198	true ->
199	    [{Val,B}|merge_anchors(A, T)];
200	false ->
201	    io:format("Logic error: H=~p B=~p~n",[H,B]),
202	    exit(1)
203    end;
204merge_anchors([], []) -> [];
205merge_anchors([], [X]) ->
206    %% this is the last block -
207    %% trailing white space and comments have no tag
208    %% because eos is not a tag ...
209    [{eof, X}];
210merge_anchors(X, Y) ->
211    io:format("ops:~p~n",[{X,Y}]),
212    [].
213
214contains(Loc, [{_,Loc,_}|_]) -> true;
215contains(Loc, [_|T]) ->  contains(Loc, T);
216contains(_, []) -> false.
217
218
219dodge_file(File) ->
220    case file:open(File, [read]) of
221	{ok, Handle} ->
222	    {ok, F} = epp_dodger:parse(Handle, {1,1}),
223	    file:close(Handle),
224	    L = [revert_forms(I) || I <- F],
225	    {ok, L};
226	Error ->
227	    Error
228    end.
229
230revert_forms(F) ->
231    case erl_syntax:is_form(F) of
232	true ->
233	    %% revert fails on ifdef ... etc
234	    case (catch erl_syntax:revert(F)) of
235		{'EXIT', _Why} ->
236		    io:format("error reverting:~p=~p~n",[F,_Why]),
237		    F;
238		Other ->
239		    Other
240	    end;
241	false ->
242	    io:format("uugh:~p~n",[F])
243    end.
244
245%% read up to dot
246%% read_raw_forms(File) -> [form()]
247%% form() = [tok()]
248%% tok() = {Type,{Line::int,Col::int},string}
249%% Type = atom | int | var | string ...
250
251read_raw_forms(File) ->
252    {ok, Bin} = file:read_file(File),
253    Str = binary_to_list(Bin),
254    loop(erl_scan:tokens([], Str, {1,1}, [return,text]), []).
255
256loop({done, {eof,_}, eof}, L) ->
257    lists:reverse(L);
258loop({done, {ok, Toks, _}, eof}, L) ->
259    lists:reverse([normalize_toks(Toks)|L]);
260loop({done, {ok, Toks, Ln}, Str1}, L) ->
261    loop(erl_scan:tokens([], Str1, Ln, [return,text]),
262	 [normalize_toks(Toks)|L]);
263loop({more, X}, L) ->
264    loop(erl_scan:tokens(X, eof, {1,1}, [return,text]), L).
265
266normalize_toks(Toks) ->
267    [normalize_tok(I) || I <- Toks].
268
269normalize_tok(Tok) ->
270    %% this is the portable way ...
271    Type = erl_scan:category(Tok),
272    Line = erl_scan:line(Tok),
273    Col = erl_scan:column(Tok),
274    Txt = erl_scan:text(Tok),
275    Val  = {Type,{Line,Col},Txt},
276    %% io:format("here:X=~p ~p~n",[Tok,Val]),
277    Val.
278
279
280%% analse the result of dodge_file
281
282analyse(Forms) ->
283    Calls = calls(Forms),
284    Anchors = compute_anchors(Forms),
285    Imports = [{{F,A},Mod} ||
286		  {attribute,_,import,{Mod,L}} <- Forms, {F,A} <- L],
287    D = dict:from_list(Imports),
288    Patches = [{Loc, resolve(X, D)} || {Loc, X} <- Calls],
289    {Anchors, Patches}.
290
291%% An anchor is something that is placed at the start of every form
292%% The anchor is named after the first item in the form
293%% compute_anchors(Forms) -> [{{Line,Col}, anchor()}]
294%%   {Line,Col} is the line and column of where the
295%%   form starts - this is not the same as the first token in
296%%   the form since we might have skipped comments and white space
297%%   at the start of the form.
298%%   anchor() is a term decscribing the anchor
299%%   anchor(() = {func,Name,Aritry} (for functions)
300%%             |
301%%             | {Type,{Line,Col}} anythis else
302
303compute_anchors(Forms) ->
304    A1 = [anchor0(I) || I <- Forms],
305    merge_specs(A1).
306
307%% If a specification is immediately followed by
308%% a function when we promote the function anchor to point
309%% at the specification.
310%% We change the second tag to func2 - because we still want a
311%% tag for every block
312
313merge_specs([{_Ln1,{specification,F,A}}=H,{Ln2, {func,F,A}}|T]) ->
314    [H,{Ln2,{func1,F,A}}|merge_specs(T)];
315merge_specs([H|T]) ->
316    [H|merge_specs(T)];
317merge_specs([]) ->
318    [].
319
320anchor0(I) ->
321    case  anchor(I) of
322	{{Line,Col,_,_}, Val} ->
323	    {{Line,Col}, Val};
324	{{_,_}, _} = X ->
325	    X
326    end.
327
328anchor({function, Ln, F, A, _}) -> {Ln, {func, F, A}};
329anchor({attribute,Ln,'spec', {{F,A},_}}) ->
330    {Ln, {specification,F,A}};
331anchor({attribute,Ln,module, M}) ->
332    {Ln, {module,M}};
333anchor({attribute,Ln,Type,_}) -> {Ln, {Type, Ln}};
334anchor({eof,Ln}) -> {Ln, eof};
335anchor({error,{Ln,_,_}}) ->
336    %% Ln is in a different format in errors (sigh)
337    {Line, Col} = Ln,
338    Ln1 = {Line,Col,0,""},
339    {Ln1, {error, Ln}};
340anchor({tree,attribute,{attr,{_,_,_,Type}=Ln,_,_},_}) ->
341    {Ln, {attribute,Type,Ln}};
342anchor({tree,attribute,_,
343	{attribute, {atom,Ln,Type}, _}}) ->
344    {Ln, {attribute,Type,Ln}};
345anchor({tree,attribute,
346	{attr,Ln,[],none},
347	_}=X) ->
348    io:format("FIX ME this is a bug????:~p~n",[X]),
349    {Ln, {other, Ln}};
350anchor(X) ->
351    %% this is some syntactic form that I don't know
352    %% about yet ...
353    io:format("FIX ME this is a bug????:~p~n",[X]),
354    exit(1).
355
356resolve({F,A}=Tup, D) ->
357    case dict:find({F,A}, D) of
358	{ok, Mod} ->
359	    {remote,Mod,F,A};
360	error ->
361	    case erlang:is_builtin(erlang, F, A) of
362		true  -> {bif, {F,A}};
363		false -> {local,Tup}
364	    end
365    end;
366resolve({erlang,F,A}, _) ->
367    {bif,{F,A}};
368resolve({anchor,_,_}=A, _) ->
369    A;
370resolve(X, _D) ->
371    {remote, X}.
372
373calls(X) -> lists:reverse(calls(X, [])).
374
375calls({call,_,{atom,Ln,Func},Args}, L) ->
376    calls(Args, [{normalise(Ln),{Func,length(Args)}}|L]);
377calls({call,_,{remote,_,{atom,Ln1,Mod},{atom,_Ln2,Func}}, Args}, L) ->
378    calls(Args, [{normalise(Ln1),{Mod,Func,length(Args)}}|L]);
379calls(T, L) when is_tuple(T) ->
380    calls(tuple_to_list(T), L);
381calls([], L) ->
382    L;
383calls(T, L) when is_list(T) ->
384    lists:foldl(fun calls/2, L, T);
385calls(_, L) ->
386    L.
387
388normalise({_Line,_Col}=X) ->
389    X;
390normalise({Line,Col,_Len,_Text}) ->
391    {Line, Col}.
392
393
394prelude(L) ->
395    ["<html>\n"
396     "<head>\n"
397     "</head>\n"
398     "<body>\n"
399     "<ul><pre>\n",L,"\n</pre></ul></body>"].
400
401
402final({Tag, Toks}) ->
403    {Tag, {taggedToks, final1(Tag, Toks)}}.
404
405final1({Tag,_,_}, Toks) when Tag =:= func; Tag =:= func1 ->
406    %% io:format("fix_remote:~p~n",[Toks]),
407    fix_remote(Toks);
408final1({export,_}, Toks) ->
409    fix_exports(Toks);
410final1({import,_}, Toks) ->
411    fix_imports(Toks);
412final1(_, Toks) ->
413    %% io:format("final:~p~n",[X]),
414    Toks.
415
416
417fix_imports(Toks) ->
418    %% io:format("fix imports:~p~n",[Toks]),
419    Mod = find_imported_module(Toks),
420    %% io:format("Mod =~p~n",[Mod]),
421    fix_imports(Toks, Mod).
422
423fix_imports([{atom,A},{terminal,"/"},{integer,N}|T], Mod) ->
424    [{remote, Mod,A,list_to_integer(N),A++"/"++N}|
425     fix_imports(T, Mod)];
426fix_imports([H|T], Mod) ->
427    [H|fix_imports(T, Mod)];
428fix_imports([], _) ->
429    [].
430
431%% skip to the atom import, then take the first atom after import
432find_imported_module([{atom,"import"}|T]) -> find_imported_module1(T);
433find_imported_module([_|T])               -> find_imported_module(T).
434
435find_imported_module1([{atom,M}|_]) -> list_to_atom(M);
436find_imported_module1([_|T])        -> find_imported_module1(T).
437
438%% won't work if there is white space between the symbols
439%% fix later
440
441fix_exports([{atom,A},{terminal,"/"},{integer,N}|T]) ->
442    [{local,A,list_to_integer(N),A++"/"++N}|fix_exports(T)];
443fix_exports([H|T]) ->
444    [H|fix_exports(T)];
445fix_exports([]) ->
446    [].
447
448%% fix_remote merges Mod : Func into a single string
449%%  the problem is that
450%%  we only tag the first atom in a remote call mod:func(...)
451%%  mod is tagged as remote - but we want to
452%%  extend the tagging to include the entire mod:func
453%%  call ...
454
455fix_remote([{remote,M,F,A,Str},{terminal,":"},{atom,Str1}|T]) ->
456    [{remote,M,F,A,Str ++ ":" ++ Str1}|fix_remote(T)];
457fix_remote([{remote,M,F,A,Str},{white_space,S1},{terminal,":"},{atom,Str1}|T]) ->
458    [{remote,M,F,A,Str ++ S1 ++ ":" ++ Str1}|fix_remote(T)];
459fix_remote([{remote,M,F,A,Str},{white_space,S1},{terminal,":"},{white_space,S2},{atom,Str1}|T]) ->
460    [{remote,M,F,A,Str ++ S1 ++ ":" ++ S2 ++ Str1}|fix_remote(T)];
461fix_remote([{remote,M,F,A,Str},{terminal,":"},{white_space,S2},{atom,Str1}|T]) ->
462    [{remote,M,F,A,Str ++ ":" ++ S2 ++ Str1}|fix_remote(T)];
463fix_remote([H|T]) ->
464    [H|fix_remote(T)];
465fix_remote([]) ->
466    [].
467
468-spec is_keyword(atom()) -> boolean().
469
470is_keyword('after' ) -> true;
471is_keyword('and') -> true;
472is_keyword('andalso' ) -> true;
473is_keyword('band' ) -> true;
474is_keyword('begin' ) -> true;
475is_keyword('bnot' ) -> true;
476is_keyword('bor' ) -> true;
477is_keyword('bsl' ) -> true;
478is_keyword('bsr' ) -> true;
479is_keyword('bxor' ) -> true;
480is_keyword('case' ) -> true;
481is_keyword('catch' ) -> true;
482is_keyword('cond') -> true;
483is_keyword('div' ) -> true;
484is_keyword('end' ) -> true;
485is_keyword('fun' ) -> true;
486is_keyword('if' ) -> true;
487is_keyword('not') -> true;
488is_keyword('of' ) -> true;
489is_keyword('or' ) -> true;
490is_keyword('orelse' ) -> true;
491is_keyword('receive' ) -> true;
492is_keyword('rem' ) -> true;
493is_keyword('spec') -> true;
494is_keyword('try' ) -> true;
495is_keyword('when') -> true;
496is_keyword('xor') -> true;
497is_keyword(_) -> false.
498
499is_terminal('!')  -> true;
500is_terminal('#')  -> true;
501is_terminal('(')  -> true;
502is_terminal(')')  -> true;
503is_terminal('*')  -> true;
504is_terminal('+')  -> true;
505is_terminal('++') -> true;
506is_terminal(',')  -> true;
507is_terminal('-')  -> true;
508is_terminal('--') -> true;
509is_terminal('->') -> true;
510is_terminal('.')  -> true;
511is_terminal('/')  -> true;
512is_terminal('/=') -> true;
513is_terminal(':')  -> true;
514is_terminal(':-') -> true;
515is_terminal('::') -> true;
516is_terminal(';')  -> true;
517is_terminal('<')  -> true;
518is_terminal('<-') -> true;
519is_terminal('<<') -> true;
520is_terminal('<=') -> true;
521is_terminal('=')  -> true;
522is_terminal('=/=') -> true;
523is_terminal('=:=') -> true;
524is_terminal('=<') -> true;
525is_terminal('==') -> true;
526is_terminal('>')  -> true;
527is_terminal('>=') -> true;
528is_terminal('>>') -> true;
529is_terminal('?')  -> true;
530is_terminal('[')  -> true;
531is_terminal(']')  -> true;
532is_terminal('{')  -> true;
533is_terminal('|')  -> true;
534is_terminal('||') -> true;
535is_terminal('}')  -> true;
536is_terminal(_) -> false.
537