% -*- Erlang -*-
% File:     mail_html.erl
% Author:   Johan Bevemyr
% Created:  Sat Jun 19 15:13:49 2004
% Purpose:  Transform HTML to text

-module('mail_html').
-author('jb@mor.bevemyr.com').

-export([html_to_text/1]).

%% Convert an HTML document to plain text.
%%
%% Input is flattened, tokenized, parsed into a tree of {Tag, Opts},
%% {Tag, Opts, Body} and text nodes, and then rendered. The result is a
%% deep list: every text run is a nested string, and line breaks appear as
%% the individual characters $\r and $\n.
-spec html_to_text(Input :: list()) -> list().
html_to_text(Input) ->
    Tokens = tokenize(lists:flatten(Input), [], [], 1),
    Ehtml = parse(Tokens),
    RevText = ehtml_to_text(Ehtml, []),
    lists:reverse(RevText).

%% Render a list of parsed nodes onto Acc, which is kept in reverse order
%% (the caller reverses it once at the end).
ehtml_to_text([], Acc) ->
    Acc;
ehtml_to_text([{Tag, _Opts}|Rest], Acc) ->
    %% Tag without a body: only its line-break effect, if any, is kept.
    ehtml_to_text(Rest, add_tag_space(Tag, Acc));
ehtml_to_text([{script, _Opts, _Body}|Rest], Acc) ->
    %% Script source is not readable text.
    ehtml_to_text(Rest, Acc);
ehtml_to_text([{style, _Opts, _Body}|Rest], Acc) ->
    %% Style sheets are tokenized as raw text just like scripts; drop them
    %% too so CSS does not leak into the rendered text.
    ehtml_to_text(Rest, Acc);
ehtml_to_text([{Tag, _Opts, Body}|Rest], Acc) ->
    Acc1 = add_tag_space(Tag, Acc),
    Acc2 = ehtml_to_text(Body, Acc1),
    ehtml_to_text(Rest, Acc2);
ehtml_to_text([Text|Rest], Acc) ->
    ehtml_to_text(Rest, [text_reformat(Text, [])|Acc]).

%% Tags that force a line break. Acc is reversed, so pushing $\n then $\r
%% yields "\r\n" in the final output.
add_tag_space(Tag, Acc) when Tag =:= p; Tag =:= br; Tag =:= hr ->
    [$\n,$\r|Acc];
add_tag_space(_, Acc) ->
    Acc.

%% Normalise the whitespace of one text run: carriage returns are removed
%% and every newline becomes a space, without ever producing two spaces in
%% a row at that point. Acc holds the output so far, in reverse.
text_reformat([], Acc) ->
    lists:reverse(Acc);
text_reformat([$\n|R], [$ |_] = Acc) ->
    %% A space was just emitted: keep that single space. (Dropping it, as
    %% the code used to do, glued the surrounding words together.)
    text_reformat(R, Acc);
text_reformat([$\n|R], Acc) ->
    text_reformat(R, [$ |Acc]);
text_reformat([$\r|R], Acc) ->
    text_reformat(R, Acc);
text_reformat([C|R], Acc) ->
    text_reformat(R, [C|Acc]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Alternative parser, recursive as hell
%%

%% Parse a token list into a list of nodes.
parse(Tokens) ->
    parse(Tokens, []).

%% parse(Tokens, Nodes): turn the token stream into a list of nodes.
%% Nodes is built in reverse and flipped when the tokens run out.
parse([], Nodes) ->
    lists:reverse(Nodes);
parse([{begin_tag, Name, Attrs, _Line}|Tokens], Nodes) ->
    case tag_type(Name) of
        leaf ->
            parse(Tokens, [{Name, Attrs}|Nodes]);
        node ->
            {Node, Remaining} = parse_node(Name, Attrs, Tokens),
            parse(Remaining, [Node|Nodes])
    end;
parse([{end_tag, _Name, _Attrs, _Line}|Tokens], Nodes) ->
    %% stray end tag with no matching begin tag: ignore it
    parse(Tokens, Nodes);
parse([{data, Data, _Line}|Tokens], Nodes) ->
    parse(Tokens, [Data|Nodes]).

%% Build the node for a container tag. When no matching end tag exists the
%% tag is treated as a leaf and no tokens are consumed.
parse_node(Name, Attrs, Tokens) ->
    case find_body(Name, Tokens, []) of
        {error, _Reason} ->
            {{Name, Attrs}, Tokens};
        {Body, Remaining} ->
            {{Name, Attrs, parse(Body)}, Remaining}
    end.

%% find_body(Tag, Tokens, Seen): collect the tokens up to the end tag that
%% closes Tag. Returns {Body, TokensAfterEndTag}, or {error, Reason} when
%% the tokens run out first. Seen holds the body so far, in reverse.
find_body(Tag, [], _Seen) ->
    {error, "Missing end tag for " ++ atom_to_list(Tag)};
find_body(Tag, [{end_tag, Tag, _, _}|After], Seen) ->
    {lists:reverse(Seen), After};
find_body(Tag, [{begin_tag, Tag, _, _} = Open|Tokens], Seen) ->
    %% A nested tag of the same name: locate its own end tag first so it
    %% is not mistaken for ours, then put the whole nested element (with a
    %% synthetic end tag on line -1) back into the body.
    case find_body(Tag, Tokens, []) of
        {error, _} = Error ->
            Error;
        {Inner, After} ->
            Close = {end_tag, Tag, [], -1},
            find_body(Tag, After, [Close|lists:reverse(Inner, [Open|Seen])])
    end;
find_body(Tag, [Token|Tokens], Seen) ->
    find_body(Tag, Tokens, [Token|Seen]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%% Classify a tag by delegating to yaws_html; parse/2 expects the answer
%% to be either leaf or node.
tag_type(Name) ->
    yaws_html:tag_type(Name).

%% tokenize(Input, DataAcc, TokenAcc, LineNr)
%%
%% Split a flat character list into {begin_tag, Name, Args, Line},
%% {end_tag, Name, Args, Line} and {data, Text, Line} tokens.
%% DataAcc is the pending text in reverse; TokenAcc is the token list in
%% reverse. Both $\n and $\r bump the line counter, so a CRLF pair
%% advances it by two.
tokenize([], [], Tokens, _Line) ->
    lists:reverse(Tokens);
tokenize([], Acc, Tokens, Line) ->
    lists:reverse(flush_data(Acc, Line, Tokens));
tokenize([$<,$!,$-,$-|R0], Acc, Tokens, L0) ->
    %% comments vanish; the pending text continues across them
    {R1, L1} = skip_comment(R0, L0),
    tokenize(R1, Acc, Tokens, L1);
tokenize([$<|R0], Acc, Tokens, L0) ->
    {Tag, R1, L1} = scan_tag(R0, L0),
    next_token(Tag, R1, [Tag|flush_data(Acc, L0, Tokens)], L1);
tokenize([C|R0], Acc, Tokens, L) when C =:= $\n; C =:= $\r ->
    tokenize(R0, [C|Acc], Tokens, L+1);
tokenize([C|R0], Acc, Tokens, L) ->
    tokenize(R0, [C|Acc], Tokens, L).

%% Push the pending text, if any, as a data token.
flush_data([], _Line, Tokens) ->
    Tokens;
flush_data(Acc, Line, Tokens) ->
    [{data, lists:reverse(Acc), Line}|Tokens].

%% Continue after a tag. The contents of script and style elements are
%% taken verbatim as a single data token up to the matching end tag.
next_token({begin_tag, script, _, _}, R, Tokens, L) ->
    raw_text("script", R, Tokens, L);
next_token({begin_tag, style, _, _}, R, Tokens, L) ->
    raw_text("style", R, Tokens, L);
next_token(_Tag, R, Tokens, L) ->
    tokenize(R, [], Tokens, L).

raw_text(Name, R, Tokens, L) ->
    {Data, R1, L1} = scan_endtag(R, Name, L),
    tokenize(R1, [], [{data, Data, L}|Tokens], L1).

%% '<' <id> <sp>+ [<id><sp>*['='<val>]]* ['/'] '>'
%%
%% Scan one tag; the input starts just after the '<'.
%% Returns {Token, RestOfInput, Line}.
scan_tag([$/|I], L) ->
    scan_tag(end_tag, I, L);
scan_tag(I, L) ->
    scan_tag(begin_tag, I, L).

scan_tag(Kind, I, L) ->
    {R0, L0} = skip_space(I, L),
    %% Scan the name from the space-skipped input. Scanning from I, as the
    %% code used to do, produced an empty tag name for "< p>" and counted
    %% a leading newline twice.
    {Name, R1, L1} = scan_tag_name(R0, L0),
    {R2, L2} = skip_space(R1, L1),
    {Args, R3, L3} = scan_tag_args(R2, L2),
    %% NOTE(review): list_to_atom/1 on a tag name taken from the document
    %% creates a new atom for every distinct name, and atoms are never
    %% garbage collected. Left as is because tag names are matched as
    %% atoms; confirm the input is size-limited upstream.
    {{Kind, list_to_atom(lowercase(Name)), Args, L0}, R3, L3}.

%% The tag name is a single token.
scan_tag_name(I, L) ->
    scan_token(I, [], L).

%% Scan the attribute list of a tag, up to and including the closing '>'.
scan_tag_args(I, L) ->
    scan_tag_args(I, [], L).

%% scan_tag_args(Input, Acc, Line) -> {Args, Rest, Line1}
%%
%% Collect attributes until '>' (consumed), '<' (left in place, bad html)
%% or end of input. A "name=value" pair becomes {Name, Value}; a bare
%% word is kept as a string.
scan_tag_args([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_tag_args([$>|R], Acc, L) ->
    {lists:reverse(Acc), R, L};
scan_tag_args(R=[$<|_], Acc, L) -> %% bad html
    {lists:reverse(Acc), R, L};
scan_tag_args(R0, Acc, L0) ->
    {Name, R1, L1} = scan_value(R0, L0),
    case skip_space(R1, L1) of
        {[$=|R3], L2} ->
            {R4, L4} = skip_space(R3, L2),
            {Value, R5, L5} = scan_value(R4, L4),
            {R6, L6} = skip_space(R5, L5),
            scan_tag_args(R6, [{attr_name(Name), Value}|Acc], L6);
        {R2, L2} ->
            scan_tag_args(R2, [Name|Acc], L2)
    end.

%% Lower-case an attribute name and return it as an atom when that atom
%% already exists, otherwise as the lower-cased string. Attribute names
%% come straight from the document, so creating a fresh atom for each one
%% would let a hostile message fill the atom table.
attr_name(Name) ->
    Lower = lowercase(Name),
    try
        list_to_existing_atom(Lower)
    catch
        error:badarg -> Lower
    end.

%% Scan one attribute name or value: a quoted string (either quote
%% character) or a plain token.
scan_value([$"|R], L) ->
    scan_quote(R, [], $", L);
scan_value([$'|R], L) ->
    scan_quote(R, [], $', L);
scan_value(R, L) ->
    scan_token(R, [], L).

%% scan_token(Input, Acc, Line) -> {Token, Rest, Line1}
%%
%% Read characters up to a delimiter. '>', '<' and '=' stop the token and
%% stay in the input; a space or newline stops it and is consumed (a
%% newline also bumps the line counter).
scan_token([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_token(R=[$>|_], Acc, L) ->
    {lists:reverse(Acc), R, L};
scan_token(R=[$<|_], Acc, L) -> %% bad html
    {lists:reverse(Acc), R, L};
scan_token(R=[$=|_], Acc, L) -> %% bad html
    {lists:reverse(Acc), R, L};
scan_token([C|R], Acc, L0) ->
    case char_class(C) of
        space ->
            {lists:reverse(Acc), R, L0};
        nl ->
            {lists:reverse(Acc), R, L0+1};
        _ ->
            scan_token(R, [C|Acc], L0)
    end.

%% scan_quote(Input, Acc, Quote, Line) -> {String, Rest, Line1}
%%
%% Read up to the closing Quote character, which is consumed. An
%% unterminated quote runs to the end of the input.
scan_quote([], Acc, _Q, L) ->
    {lists:reverse(Acc), [], L};
scan_quote([Q|R], Acc, Q, L) ->
    {lists:reverse(Acc), R, L};
scan_quote([C|R], Acc, Q, L) when C =:= $\n; C =:= $\r ->
    scan_quote(R, [C|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) ->
    scan_quote(R, [C|Acc], Q, L).

%% Collect raw text up to the end tag named Tag (a lower-case string).
scan_endtag(R, Tag, L) ->
    scan_endtag(R, Tag, [], L).

%% scan_endtag(Input, Tag, Acc, Line) -> {Data, Rest, Line1}
%%
%% Collect everything up to, but not including, the end tag "</Tag ... >"
%% (name compared case-insensitively, optional whitespace before '>').
%% Rest starts at that end tag, or is [] when the input ends first.
%% Both $\n and $\r bump the line counter.
scan_endtag([], _Tag, Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_endtag([$<,$/|R0] = R, Tag, Acc, L0) ->
    case is_end_tag(Tag, R0, L0) of
        true ->
            {lists:reverse(Acc), R, L0};
        false ->
            %% Some other end tag: it is part of the data, so keep its
            %% "</" instead of silently dropping those two characters.
            scan_endtag(R0, Tag, [$/,$<|Acc], L0)
    end;
scan_endtag([C|R], Tag, Acc, L) when C =:= $\n; C =:= $\r ->
    scan_endtag(R, Tag, [C|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L).

%% True when Input (the text just after "</") is Tag followed by optional
%% whitespace and '>'.
is_end_tag(Tag, Input, L) ->
    case casecmp(Tag, Input) of
        {true, AfterName} ->
            case skip_space(AfterName, L) of
                {[$>|_], _} -> true;
                _ -> false
            end;
        false ->
            false
    end.

%% casecmp(Lower, Input) -> {true, Rest} | false
%%
%% Check that Input starts with Lower, ignoring the case of Input.
%% Lower must already be lower-case. Input that ends before Lower does is
%% simply not a match (it used to crash with function_clause).
casecmp([], R) ->
    {true, R};
casecmp(_, []) ->
    false;
casecmp([C1|T1], [C2|T2]) ->
    case lowercase_ch(C2) =:= C1 of
        true -> casecmp(T1, T2);
        false -> false
    end.

%% Classify a character for the scanner.
char_class($\n) -> nl;
char_class($\r) -> nl;
char_class($ ) -> space;
char_class($\t) -> space;
char_class(C) when C >= $a, C =< $z -> alpha;
char_class(C) when C >= $A, C =< $Z -> alpha;
char_class(C) when C >= $0, C =< $9 -> digit;
char_class(_C) -> other.

%% skip_space(Input, Line) -> {Rest, Line1}
%%
%% Drop leading spaces, tabs and line breaks, counting the line breaks.
skip_space([], L) ->
    {[], L};
skip_space([C|Rest] = Input, L) ->
    case char_class(C) of
        nl    -> skip_space(Rest, L+1);
        space -> skip_space(Rest, L);
        _     -> {Input, L}
    end.

%% skip_comment(Input, Line) -> {Rest, Line1}
%%
%% Input starts just after "<!--"; skip to just after the closing "-->".
%% An unterminated comment swallows the rest of the input.
skip_comment([], L) -> {[], L};
skip_comment([$-,$-,$>|R], L) -> {R, L};
skip_comment([C|R], L) when C =:= $\n; C =:= $\r -> skip_comment(R, L+1);
skip_comment([_C|R], L) -> skip_comment(R, L).

%% Lower-case the letters A-Z of a string; other characters are untouched.
lowercase(Str) ->
    [lowercase_ch(S) || S <- Str].

lowercase_ch(C) when C >= $A, C =< $Z -> C + 32;
lowercase_ch(C) -> C.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%