1/* $Id$ 2 3 Part of SWI-Prolog SGML/XML parser 4 5 Author: Jan Wielemaker 6 E-mail: jan@swi.psy.uva.nl 7 WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ 8 Copying: LGPL-2. See the file COPYING or http://www.gnu.org 9 10 Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. 11*/ 12 13:- prolog_load_context(directory, CWD), 14 working_directory(_, CWD). 15 16:- asserta(file_search_path(foreign, '..')). 17:- asserta(file_search_path(library, '..')). 18 19:- use_module(library(sgml)). 20:- use_module(library(sgml_write)). 21 22test :- % default test 23 fp('.'). 24 25test(File) :- 26 file_name_extension(_, xml, File), !, 27 load_xml_file(File, Term), 28 xml_write(user_output, Term, []). 29test(File) :- 30 file_name_extension(_, sgml, File), !, 31 load_sgml_file(File, Term), 32 sgml_write(user_output, Term, []). 33test(File) :- 34 file_name_extension(_, html, File), !, 35 load_html_file(File, Term), 36 html_write(user_output, Term, []). 37 38test(File, Into, Encoding) :- 39 file_name_extension(_, xml, File), !, 40 load_xml_file(File, Term), 41 open(Into, write, Out, [encoding(Encoding)]), 42 xml_write(Out, Term, []), 43 close(Out). 44 45fp(Dir) :- 46 atom_concat(Dir, '/*', Pattern), 47 expand_file_name(Pattern, Files), 48 ( member(File, Files), 49 file_name_extension(_, Ext, File), 50 ml_file(Ext), 51 file_base_name(File, Base), 52 \+ blocked(Base), 53 format(user_error, '~w ... ', [Base]), 54 ( \+ utf8(Base) 55 -> format(user_error, ' (ISO Latin-1) ... ', []), 56 fixed_point(File, iso_latin_1) 57 ; true 58 ), 59 format(user_error, ' (UTF-8) ... ', []), 60 fixed_point(File, utf8), 61 format(user_error, ' done~n', []), 62 fail 63 ; true 64 ). 65 66ml_file(xml). 67ml_file(sgml). 68ml_file(html). 69 70%% blocked(+File) 71% 72% List of test-files that are blocked. These are either negative 73% tests or tests involving SDATA. 74 75blocked('bat.sgml'). 76blocked('i.sgml'). 77blocked('sdata.sgml'). 78blocked('cent-nul.xml'). 79blocked('defent.sgml'). 80blocked('comment.xml'). 81blocked('badxmlent.xml'). 82 83 84%% utf8(+File) 85% 86% File requires UTF-8. These are files that have UTF-8 characters 87% in element or attribute names. 88 89utf8('utf8-ru.xml'). 90 91 92%% fixed_point(+File, +Encoding) 93% 94% Perform write/read round-trip and validate the data has not 95% changed. 96 97fixed_point(File, Encoding) :- 98 file_name_extension(_, xml, File), !, 99 fp(File, Encoding, load_xml_file, xml_write). 100fixed_point(File, Encoding) :- 101 file_name_extension(_, sgml, File), !, 102 fp(File, Encoding, load_sgml_file, sgml_write). 103fixed_point(File, Encoding) :- 104 file_name_extension(_, html, File), !, 105 fp(File, Encoding, load_html_file, html_write). 106 107fp(File, Encoding, Load, Write) :- 108 put_char(user_error, r), 109 call(Load, File, Term), 110 tmp_file(xml, TmpFile), 111 open(TmpFile, write, TmpOut, [encoding(Encoding)]), 112 put_char(user_error, w), 113 call(Write, TmpOut, Term, []), 114 close(TmpOut), 115% cat(TmpFile, Encoding), 116 put_char(user_error, r), 117 call(Load, TmpFile, Term2), 118 delete_file(TmpFile), 119 ( eq(Term, Term2) 120 -> true 121 ; format(user_error, 'First file:~n', []), 122 %pp(Term), 123 save_in_file(f1, Term), 124 format(user_error, 'Second file:~n', []), 125 %pp(Term2), 126 save_in_file(f2, Term2), 127 fail 128 ). 129 130save_in_file(File, Term) :- 131 open(File, write, Out, [encoding(iso_latin_1)]), 132 current_output(C0), 133 set_output(Out), 134 pp(Term), 135 set_output(C0), 136 close(Out). 137 138 139cat(File, Encoding) :- 140 open(File, read, In, [encoding(Encoding)]), 141 copy_stream_data(In, current_output), 142 close(In). 143 144% eq(M1, M2) 145% 146% Test two terms for equivalence. The following mismatches are 147% allowed: 148% 149% * Order of attributes 150% * Layout in `element-only' content 151 152eq(X, X) :- !. 153eq([], []) :- !. 154eq([B|T], L) :- % delete blanks 155 blank_atom(B), !, 156 eq(T, L). 157eq(L, [B|T]) :- 158 blank_atom(B), !, 159 eq(T, L). 160eq([H1|T1], [H2|T2]) :- !, 161 eq(H1, H2), 162 eq(T1, T2). 163eq(element(Name, A1, C1), element(Name, A2, C2)) :- 164 att_eq(A1, A2), 165 ceq(C1, C2). 166eq(A1, A2) :- 167 atom(A1), 168 atom(A2), !, 169 normalise_blanks(A1, B1), 170 normalise_blanks(A2, B2), 171 ( B1 == B2 172 -> true 173 ; format(user_error, 174 'ERROR: CDATA differs:~n\ 175 \t~p~n\ 176 \t~p~n', 177 [B1, B2]) 178 ). 179eq(X, Y) :- 180 format(user_error, 181 'ERROR: Content differs:~n\ 182 \t~p~n\ 183 \t~p~n', 184 [X, Y]). 185 186att_eq(A1, A2) :- % ordering is unimportant 187 sort(A1, S), 188 sort(A2, S), !. 189att_eq(A1, A2) :- 190 format(user_error, 191 'ERROR: Attribute lists differ:~n\ 192 \t~p~n\ 193 \t~p~n', 194 [A1, A2]). 195 196ceq(C1, C2) :- 197 element_content(C1, E1), 198 element_content(C2, E2), !, 199 eq(E1, E2). 200ceq(C1, C2) :- 201 eq(C1, C2). 202 203element_content([], []). 204element_content([element(Name,Atts,C)|T0], [element(Name,Atts,C)|T]) :- !, 205 element_content(T0, T). 206element_content([Blank|T0], T) :- 207 blank_atom(Blank), 208 element_content(T0, T). 209 210blank_atom(Atom) :- 211 atom(Atom), 212 atom_codes(Atom, Codes), 213 all_blanks(Codes). 214 215all_blanks([]). 216all_blanks([H|T]) :- 217 code_type(H, space), 218 all_blanks(T). 219 220normalise_blanks(Atom, Normalised) :- 221 atom_codes(Atom, Codes), 222 eat_blanks(Codes, Codes1), 223 normalise_blanks2(Codes1, N), 224 atom_codes(Normalised, N). 225 226normalise_blanks2([], []). 227normalise_blanks2([H|T0], T) :- 228 code_type(H, space), !, 229 eat_blanks(T0, T1), 230 ( T1 == [] 231 -> T = [] 232 ; T = [32|T2], 233 normalise_blanks2(T1, T2) 234 ). 235normalise_blanks2([H|T0], [H|T]) :- 236 normalise_blanks2(T0, T). 237 238eat_blanks([H|T0], T) :- 239 code_type(H, space), !, 240 eat_blanks(T0, T). 241eat_blanks(L, L). 242