1%    -*- Erlang -*-
2%    File:        mail_html.erl
3%    Author:        Johan Bevemyr
4%    Created:        Sat Jun 19 15:13:49 2004
5%    Purpose:   Transform HTML to text
6
7-module('mail_html').
8-author('jb@mor.bevemyr.com').
9
10-export([html_to_text/1]).
11
%% Convert HTML to plain text.
%%
%% Input is a (possibly deep) list of characters; it is flattened before
%% tokenizing. The token stream is parsed into a tag tree and the text of
%% that tree is collected.
%%
%% Returns a list whose elements are either single characters (the line
%% breaks inserted for some tags) or strings (one per text node), in
%% document order.
html_to_text(Input) ->
    Chars = lists:flatten(Input),
    Tree = parse(tokenize(Chars, [], [], 1)),
    %% ehtml_to_text/2 accumulates in reverse, so flip the top level once.
    lists:reverse(ehtml_to_text(Tree, [])).
17
%% Collect the text of a parsed element list.
%%
%% Elements are {Tag, Opts} (tag without body), {Tag, Opts, Body} (tag with
%% a parsed body) or a string (text node). Acc holds the output collected so
%% far in reverse order; the extended, still reversed, accumulator is
%% returned.
%%
%% The bodies of script and style elements are dropped entirely: the
%% tokenizer reads both as raw text, and neither JavaScript nor CSS belongs
%% in the text rendering.
ehtml_to_text([], Acc) ->
    Acc;
ehtml_to_text([{Tag, _Opts}|Rest], Acc) ->
    ehtml_to_text(Rest, add_tag_space(Tag, Acc));
ehtml_to_text([{Tag, _Opts, _Body}|Rest], Acc)
  when Tag =:= script; Tag =:= style ->
    ehtml_to_text(Rest, Acc);
ehtml_to_text([{Tag, _Opts, Body}|Rest], Acc) ->
    %% Emit the tag's own spacing first, then everything nested inside it.
    Acc1 = add_tag_space(Tag, Acc),
    Acc2 = ehtml_to_text(Body, Acc1),
    ehtml_to_text(Rest, Acc2);
ehtml_to_text([Text|Rest], Acc) ->
    %% Anything that is not a tag tuple is a text node.
    ehtml_to_text(Rest, [text_reformat(Text, [])|Acc]).
32
%% Push a line break onto the reversed output accumulator for the tags that
%% start a new line (p, br and hr); every other tag leaves Acc untouched.
%%
%% The accumulator is reversed, so pushing $\n followed by $\r comes out as
%% "\r\n" once the result is flipped.
add_tag_space(Tag, Acc) when Tag =:= p; Tag =:= br; Tag =:= hr ->
    [$\n, $\r | Acc];
add_tag_space(_Other, Acc) ->
    Acc.
41
%% Normalise the line breaks inside one text node.
%%
%% Text is the character list of the node and Acc the reversed output built
%% so far (callers start with []). Returns the reformatted characters in
%% their original order:
%%   - "\n" becomes a single space, unless the output already ends in a
%%     space, in which case the newline is simply dropped;
%%   - "\r" is dropped;
%%   - every other character is copied unchanged.
text_reformat([], Acc) ->
    lists:reverse(Acc);
text_reformat([$\n|R], [$\s|_] = Acc) ->
    %% Keep the space that is already there. Removing it would glue the
    %% neighbouring words together ("a\n\nb" must not become "ab").
    text_reformat(R, Acc);
text_reformat([$\n|R], Acc) ->
    text_reformat(R, [$\s|Acc]);
text_reformat([$\r|R], Acc) ->
    text_reformat(R, Acc);
text_reformat([C|R], Acc) ->
    text_reformat(R, [C|Acc]).
52
53%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
54%%
55%% Alternative parser, recursive as hell
56%%
57
%% Turn the flat token list produced by tokenize/4 into a list of elements:
%%   {Tag, Attrs}        for a tag without a body,
%%   {Tag, Attrs, Body}  for a tag whose matching end tag was found,
%%   Text                for a data token.
parse(Tokens) ->
    parse(Tokens, []).

%% Worker for parse/1; Acc collects the elements in reverse order.
parse([], Acc) ->
    lists:reverse(Acc);
parse([{data, Text, _Line}|More], Acc) ->
    parse(More, [Text|Acc]);
parse([{end_tag, _Name, _Attrs, _Line}|More], Acc) ->
    %% An end tag that no open element claimed is erroneous; ignore it.
    parse(More, Acc);
parse([{begin_tag, Name, Attrs, _Line}|More], Acc) ->
    {Element, Remaining} = parse_element(tag_type(Name), Name, Attrs, More),
    parse(Remaining, [Element|Acc]).

%% Build the element for one begin tag and return it with the tokens that
%% follow it. A node whose end tag cannot be found is treated like a leaf.
parse_element(leaf, Name, Attrs, More) ->
    {{Name, Attrs}, More};
parse_element(node, Name, Attrs, More) ->
    case find_body(Name, More, []) of
        {error, _Reason} ->
            {{Name, Attrs}, More};
        {Body, AfterBody} ->
            {{Name, Attrs, parse(Body)}, AfterBody}
    end.
83
%% Split a token list at the end tag that closes Tag.
%%
%% Returns {Body, Rest}, where Body holds the tokens up to (not including)
%% the matching end tag and Rest the tokens after it, or {error, Reason}
%% (Reason is a string) when the tokens run out first. Acc is the reversed
%% body collected so far.
%%
%% A nested begin tag of the same name is matched with its own end tag
%% first; the nested element is then put back into the body, closed by a
%% synthetic end tag carrying line number -1.
find_body(Tag, [], _Acc) ->
    {error, "Missing end tag for " ++ atom_to_list(Tag)};
find_body(Tag, [{end_tag, Tag, _, _}|After], Acc) ->
    {lists:reverse(Acc), After};
find_body(Tag, [{begin_tag, Tag, _, _} = Open|After], Acc) ->
    case find_body(Tag, After, []) of
        {error, _} = Failure ->
            %% the nested element never closes, so neither can this one
            Failure;
        {Inner, AfterInner} ->
            Close = {end_tag, Tag, [], -1},
            %% Acc is reversed: Open goes in first, then Inner, then Close.
            find_body(Tag, AfterInner,
                      [Close|lists:reverse(Inner, [Open|Acc])])
    end;
find_body(Tag, [Token|After], Acc) ->
    find_body(Tag, After, [Token|Acc]).
100
101%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
102
103
%% Classify a tag by delegating to yaws_html:tag_type/1, which is defined
%% outside this module. The parser in this file distinguishes the results
%% `leaf` (tag without body) and `node` (tag that may have a body).
tag_type(Tag) ->
    yaws_html:tag_type(Tag).
106
107% tokenize(Input, DataAcc, TokenAcc, LineNr)
108
%% Split the input characters into a list of tokens:
%%   {data, Text, Line}
%%   {begin_tag, Name, Args, Line}
%%   {end_tag, Name, Args, Line}
%%
%% Pending is the reversed text read since the last tag, Tokens the
%% reversed tokens produced so far and Line the running line counter
%% (incremented for every "\n" and every "\r").
%% The contents of "<!-- ... -->" are skipped without ending the pending
%% text run.
tokenize([], Pending, Tokens, Line) ->
    lists:reverse(flush_data(Pending, Line, Tokens));
tokenize([$<,$!,$-,$-|AfterOpen], Pending, Tokens, Line) ->
    {AfterComment, NextLine} = skip_comment(AfterOpen, Line),
    tokenize(AfterComment, Pending, Tokens, NextLine);
tokenize([$<|AfterLt], Pending, Tokens, Line) ->
    {Tag, AfterTag, NextLine} = scan_tag(AfterLt, Line),
    %% the text before the tag becomes a data token stamped with the line
    %% on which the tag starts
    next_token(Tag, AfterTag, [Tag|flush_data(Pending, Line, Tokens)], NextLine);
tokenize([Ch|More], Pending, Tokens, Line) when Ch =:= $\n; Ch =:= $\r ->
    tokenize(More, [Ch|Pending], Tokens, Line + 1);
tokenize([Ch|More], Pending, Tokens, Line) ->
    tokenize(More, [Ch|Pending], Tokens, Line).

%% Push the pending text, if there is any, onto the reversed token list as
%% a data token.
flush_data([], _Line, Tokens) ->
    Tokens;
flush_data(Pending, Line, Tokens) ->
    [{data, lists:reverse(Pending), Line}|Tokens].
131
132%
133
%% Continue tokenizing after a tag has been pushed onto Tokens.
%%
%% After a script or style begin tag the input is read as raw text up to
%% the corresponding end tag (or the end of the input) and emitted as one
%% data token, so markup-like characters inside it are not tokenized.
%% After any other tag, tokenizing simply resumes with an empty text run.
next_token({begin_tag, Name, _, _}, R, Tokens, L)
  when Name =:= script; Name =:= style ->
    {Raw, AfterRaw, NextLine} = scan_endtag(R, atom_to_list(Name), L),
    tokenize(AfterRaw, [], [{data, Raw, L}|Tokens], NextLine);
next_token(_Tag, R, Tokens, L) ->
    tokenize(R, [], Tokens, L).
142
143%% '<' <id> <sp>+ [<id><sp>*['='<val>]]* ['/'] '>'
144
%% Scan one tag; the input starts right after the opening "<".
%%
%% A leading "/" makes it an end tag, anything else a begin tag.
%% Returns {{Kind, Name, Args, Line}, Rest, NextLine} where Kind is
%% begin_tag or end_tag, Name is the lower-cased tag name as an atom, Args
%% is the attribute list from scan_tag_args/2, Line is the line on which
%% the name starts and Rest/NextLine describe the input after the tag.
scan_tag([$/|I], L) ->
    scan_tag(end_tag, I, L);
scan_tag(I, L) ->
    scan_tag(begin_tag, I, L).

%% Shared worker for both kinds of tag.
scan_tag(Kind, I, L) ->
    %% The name must be read from the input *after* the leading whitespace;
    %% reading it from I again would yield an empty name for "</ p>".
    {R0,L0} = skip_space(I, L),
    {RawName,R1,L1} = scan_tag_name(R0, L0),
    {R2,L2} = skip_space(R1, L1),
    {Args,R3,L3} = scan_tag_args(R2, L2),
    %% NOTE(review): list_to_atom/1 on names taken from the document creates
    %% atoms from untrusted input, and atoms are never garbage collected.
    Name = list_to_atom(lowercase(strip_trailing_slash(RawName))),
    {{Kind,Name,Args,L0}, R3, L3}.

%% "<br/>" scans as the name "br/"; drop the self-closing slash so the tag
%% is recognised as br.
strip_trailing_slash(Name) ->
    case lists:reverse(Name) of
        [$/|RevRest] -> lists:reverse(RevRest);
        _            -> Name
    end.
157
158%
159
%% Read a tag name starting at I by delegating to scan_token/3 with an
%% empty accumulator. Returns {Name, Rest, Line} as produced by
%% scan_token/3.
scan_tag_name(I, L) ->
    scan_token(I, [], L).
162
163%
164
%% Scan the attributes of a tag up to and including the closing ">".
%%
%% Returns {Args, Rest, Line}. Args lists the attributes in source order:
%% {Name, Value} with Name a lower-cased atom for "name=value", and the
%% bare string for an attribute without a value. Scanning also stops at the
%% end of the input and in front of a "<" (bad HTML), which is left in Rest.
scan_tag_args(I, L) ->
    scan_tag_args(I, [], L).

%% Worker for scan_tag_args/2; Acc holds the attributes in reverse order.
scan_tag_args([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_tag_args([$>|Rest], Acc, L) ->
    {lists:reverse(Acc), Rest, L};
scan_tag_args([$<|_] = Rest, Acc, L) ->
    {lists:reverse(Acc), Rest, L};
scan_tag_args(Input, Acc, L) ->
    {Name, AfterName, L1} = scan_value(Input, L),
    case skip_space(AfterName, L1) of
        {[$=|AfterEq], L2} ->
            {Value, AfterValue, L3} = scan_attr_value(AfterEq, L2),
            Key = list_to_atom(lowercase(Name)),
            scan_tag_args(AfterValue, [{Key, Value}|Acc], L3);
        {AfterSpace, L2} ->
            scan_tag_args(AfterSpace, [Name|Acc], L2)
    end.

%% Read the value following "=", skipping the whitespace on both sides.
scan_attr_value(AfterEq, L) ->
    {ValueStart, L1} = skip_space(AfterEq, L),
    {Value, AfterValue, L2} = scan_value(ValueStart, L1),
    {Rest, L3} = skip_space(AfterValue, L2),
    {Value, Rest, L3}.
187
188%
189
%% Scan one attribute name or value.
%%
%% Input starting with a double or single quote is read up to the matching
%% quote by scan_quote/4; anything else is read as a bare token by
%% scan_token/3. Returns {Chars, Rest, Line}.
scan_value([Quote|R], L) when Quote =:= $"; Quote =:= $' ->
    scan_quote(R, [], Quote, L);
scan_value(R, L) ->
    scan_token(R, [], L).
196
197%
198
%% Scan an unquoted token.
%%
%% Characters are collected (Acc is the reversed token so far) until the
%% input ends or a delimiter is met. The delimiters ">", "<" and "=" are
%% left in Rest; a delimiting blank or line break is consumed, and a line
%% break also increments the line counter.
%% Returns {Token, Rest, Line}.
scan_token([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_token([C|_] = R, Acc, L) when C =:= $>; C =:= $<; C =:= $= ->
    {lists:reverse(Acc), R, L};
scan_token([C|R], Acc, L0) ->
    token_step(char_class(C), C, R, Acc, L0).

%% Act on the class of the character just read: whitespace ends the token,
%% everything else becomes part of it.
token_step(space, _C, R, Acc, L) ->
    {lists:reverse(Acc), R, L};
token_step(nl, _C, R, Acc, L) ->
    {lists:reverse(Acc), R, L+1};
token_step(_Class, C, R, Acc, L) ->
    scan_token(R, [C|Acc], L).
216
217%
218
%% Read a quoted value; the input starts right after the opening quote.
%%
%% Q is the quote character that ends the value and Acc the reversed value
%% read so far. Returns {Value, Rest, Line}, where Rest starts after the
%% closing quote. If the input ends first, everything read is returned as
%% the value. Each "\n" and each "\r" inside the value is kept and
%% increments the line counter.
scan_quote([], Acc, _Q, L) ->
    {lists:reverse(Acc), [], L};
scan_quote([Q|R], Acc, Q, L) ->
    {lists:reverse(Acc), R, L};
scan_quote([C|R], Acc, Q, L) when C =:= $\n; C =:= $\r ->
    scan_quote(R, [C|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) ->
    scan_quote(R, [C|Acc], Q, L).
229
230%
231
%% Read raw text up to the end tag of Tag.
%%
%% Tag is the lower-case tag name as a string ("script" or "style" in this
%% module). Returns {Data, Rest, Line}: Data is the text before the end
%% tag and Rest starts at the "</" of the end tag, so the end tag itself is
%% still tokenized by the caller. If no end tag is found, all remaining
%% input becomes Data.
scan_endtag(R, Tag, L) ->
    scan_endtag(R, Tag, [], L).

%% Worker for scan_endtag/3; Acc is the reversed data read so far.
scan_endtag([], _Tag, Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_endtag(R=[$<,$/|R0], Tag, Acc, L0) ->
    case is_endtag(Tag, R0, L0) of
        true ->
            {lists:reverse(Acc), R, L0};
        false ->
            %% Not our end tag: the "</" just read is ordinary data and
            %% has to be kept, otherwise the raw text silently loses it.
            scan_endtag(R0, Tag, [$/,$<|Acc], L0)
    end;
scan_endtag([C=$\n|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L+1);
scan_endtag([C=$\r|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L).

%% True when R0 (the input after "</") is Tag, compared case-insensitively,
%% followed by optional whitespace and ">".
is_endtag(Tag, R0, L0) ->
    case casecmp(Tag, R0) of
        {true, R1} ->
            case skip_space(R1, L0) of
                {[$>|_], _} -> true;
                _           -> false
            end;
        false ->
            false
    end.
255
256%
257
%% Case-insensitive prefix test.
%%
%% The first argument is the expected prefix and must already be lower
%% case; the second is the input, which is lower-cased character by
%% character before comparing. Returns {true, Rest} with the input left
%% after the prefix, or false when the input does not start with it.
casecmp([], R) -> {true, R};
casecmp([_|_], []) ->
    %% The input ended before the whole prefix was seen (e.g. a document
    %% truncated in the middle of "</script"); that is a mismatch, not a
    %% crash.
    false;
casecmp([C1|T1], [C2|T2]) ->
    case lowercase_ch(C2) == C1 of
        true  -> casecmp(T1, T2);
        false -> false
    end.
264
265%
266
%% Classify a character as nl (line feed or carriage return), space (blank
%% or tab), alpha (ASCII letter), digit (ASCII digit) or other.
char_class(C) when C =:= $\n; C =:= $\r -> nl;
char_class(C) when C =:= $\s; C =:= $\t -> space;
char_class(C) when C >= $a, C =< $z; C >= $A, C =< $Z -> alpha;
char_class(C) when C >= $0, C =< $9 -> digit;
char_class(_Other) -> other.
275
276%
277
%% Skip leading whitespace.
%%
%% Blanks and tabs are dropped; each "\n" and each "\r" is dropped and
%% increments the line counter. Returns {Rest, Line}, where Rest starts at
%% the first non-whitespace character (or is [] when the input runs out).
skip_space([], Line) ->
    {[], Line};
skip_space([C|Tail], Line) when C =:= $\n; C =:= $\r ->
    skip_space(Tail, Line + 1);
skip_space([C|Tail], Line) when C =:= $\s; C =:= $\t ->
    skip_space(Tail, Line);
skip_space([_|_] = Rest, Line) ->
    {Rest, Line}.
289
290%
291
%% Skip the rest of a comment; the input starts after the opening "<!--".
%%
%% Returns {Rest, Line}, where Rest is the input following the closing
%% "-->", or [] when the comment is never closed. Each "\n" and each "\r"
%% inside the comment increments the line counter.
skip_comment([], L) ->
    {[], L};
skip_comment([$-,$-,$>|R], L) ->
    {R, L};
skip_comment([C|R], L) when C =:= $\n; C =:= $\r ->
    skip_comment(R, L+1);
skip_comment([_|R], L) ->
    skip_comment(R, L).
297
298%
299
%% Lower-case the ASCII letters of a string; every other character is
%% returned unchanged.
lowercase(Str) ->
    lists:map(fun lowercase_ch/1, Str).

%% Lower-case a single character: A-Z is shifted to a-z, anything else is
%% returned as is.
lowercase_ch(C) when C >= $A, C =< $Z ->
    C + ($a - $A);
lowercase_ch(C) ->
    C.
305
306
307%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
308
309
310