%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2003-2018. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

%% Description  : Single-pass XML scanner. See xmerl.hrl for data defs.

%% @doc This module is the interface to the XML parser, it handles XML 1.0.
%%     The XML parser is activated through
%%     <tt>xmerl_scan:string/[1,2]</tt> or
%%     <tt>xmerl_scan:file/[1,2]</tt>.
%%     It returns records of the type defined in xmerl.hrl.
%% See also <a href="xmerl_examples.html">tutorial</a> on customization
%% functions.
%% @type global_state(). <p>
%% The global state of the scanner, represented by the #xmerl_scanner{} record.
%% </p>
%% @type option_list(). <p>Options allow customization of the behaviour of the
%%     scanner.
%% See also <a href="xmerl_examples.html">tutorial</a> on customization
%% functions.
37%% </p> 38%% <p> 39%% Possible options are: 40%% </p> 41%% <dl> 42%% <dt><code>{acc_fun, Fun}</code></dt> 43%% <dd>Call back function to accumulate contents of entity.</dd> 44%% <dt><code>{continuation_fun, Fun} | 45%% {continuation_fun, Fun, ContinuationState}</code></dt> 46%% <dd>Call back function to decide what to do if the scanner runs into EOF 47%% before the document is complete.</dd> 48%% <dt><code>{event_fun, Fun} | 49%% {event_fun, Fun, EventState}</code></dt> 50%% <dd>Call back function to handle scanner events.</dd> 51%% <dt><code>{fetch_fun, Fun} | 52%% {fetch_fun, Fun, FetchState}</code></dt> 53%% <dd>Call back function to fetch an external resource.</dd> 54%% <dt><code>{hook_fun, Fun} | 55%% {hook_fun, Fun, HookState}</code></dt> 56%% <dd>Call back function to process the document entities once 57%% identified.</dd> 58%% <dt><code>{close_fun, Fun}</code></dt> 59%% <dd>Called when document has been completely parsed.</dd> 60%% <dt><code>{rules, ReadFun, WriteFun, RulesState} | 61%% {rules, Rules}</code></dt> 62%% <dd>Handles storing of scanner information when parsing.</dd> 63%% <dt><code>{user_state, UserState}</code></dt> 64%% <dd>Global state variable accessible from all customization functions</dd> 65%% 66%% <dt><code>{fetch_path, PathList}</code></dt> 67%% <dd>PathList is a list of 68%% directories to search when fetching files. 
%%    If the file in question
%%    is not in the fetch_path, the URI will be used as a file
%%    name.</dd>
%%  <dt><code>{space, Flag}</code></dt>
%%    <dd>'preserve' (default) to preserve spaces, 'normalize' to
%%    accumulate consecutive whitespace and replace it with one space.</dd>
%%  <dt><code>{line, Line}</code></dt>
%%    <dd>To specify starting line for scanning in document which contains
%%    fragments of XML.</dd>
%%  <dt><code>{namespace_conformant, Flag}</code></dt>
%%    <dd>Controls whether to behave as a namespace conformant XML parser,
%%    'false' (default) to not otherwise 'true'.</dd>
%%  <dt><code>{validation, Flag}</code></dt>
%%    <dd>Controls whether to process as a validating XML parser:
%%    'off' (default) no validation, or validation 'dtd' by DTD or 'schema'
%%    by XML Schema. 'false' and 'true' options are obsolete
%%    (i.e. they may be removed in a future release), if used 'false'
%%    equals 'off' and 'true' equals 'dtd'.</dd>
%%  <dt><code>{schemaLocation, [{Namespace,Link}|...]}</code></dt>
%%    <dd>Tells explicitly which XML Schema documents to use to validate
%%    the XML document. Used together with the
%%    <code>{validation,schema}</code> option.</dd>
%%  <dt><code>{quiet, Flag}</code></dt>
%%    <dd>Set to 'true' if xmerl should behave quietly and not output any
%%    information to standard output (default 'false').</dd>
%%  <dt><code>{doctype_DTD, DTD}</code></dt>
%%    <dd>Allows specifying a DTD name when it isn't available in the XML
%%    document. This option has effect only together with the
%%    <code>{validation,'dtd'}</code> option.</dd>
%%  <dt><code>{xmlbase, Dir}</code></dt>
%%    <dd>XML Base directory. If using string/1 default is current directory.
%%    If using file/1 default is directory of given file.</dd>
%%  <dt><code>{encoding, Enc}</code></dt>
%%    <dd>Set default character set used (default UTF-8).
%%    This character set is used only if not explicitly given by the XML
%%    declaration.
%%    </dd>
%%  <dt><code>{document, Flag}</code></dt>
%%    <dd>Set to 'true' if xmerl should return a complete XML document
%%    as an xmlDocument record (default 'false').</dd>
%%  <dt><code>{comments, Flag}</code></dt>
%%    <dd>Set to 'false' if xmerl should skip comments otherwise they will
%%    be returned as xmlComment records (default 'true').</dd>
%%  <dt><code>{default_attrs, Flag}</code></dt>
%%    <dd>Set to 'true' if xmerl should add to elements missing attributes
%%    with a defined default value (default 'false').</dd>
%% </dl>
%% @type xmlElement() = #xmlElement{}.
%% The record definition is found in xmerl.hrl.
%% @type xmlDocument() = #xmlDocument{}.
%% The record definition is found in xmerl.hrl.
%% @type document() = xmlElement() | xmlDocument(). <p>
%% The document returned by <tt>xmerl_scan:string/[1,2]</tt> and
%% <tt>xmerl_scan:file/[1,2]</tt>. The type of the returned record depends on
%% the value of the document option passed to the function.
%% </p>

-module(xmerl_scan).
-vsn('0.20').
-date('03-09-16').

%% main API
-export([string/1, string/2,
         file/1, file/2]).

%% access functions for various states
-export([user_state/1, user_state/2,
         event_state/1, event_state/2,
         hook_state/1, hook_state/2,
         rules_state/1, rules_state/2,
         fetch_state/1, fetch_state/2,
         cont_state/1, cont_state/2]).

%% helper functions. To xmerl_lib ??
-export([accumulate_whitespace/4]).

-export_type([xmlElement/0]).

%-define(debug, 1).
-include("xmerl.hrl").          % record def, macros
-include("xmerl_internal.hrl").
-include_lib("kernel/include/file.hrl").

-type xmlElement() :: #xmlElement{}.

%% ?fatal(Reason, S): report a fatal scanner error.
%% Unless the scanner state S has quiet set, the reason is first logged
%% through error_logger:error_msg/2 together with the source line of the
%% macro call (?LINE). In both cases control is then handed to fatal/2.
%% Note that the macro expands to a sequence of two expressions, so it must
%% be used where an expression sequence is allowed.
-define(fatal(Reason, S),
        if
            S#xmerl_scanner.quiet ->
                ok;
            true ->
                error_logger:error_msg("~p- fatal: ~p~n", [?LINE, Reason]),
                ok
        end,
        fatal(Reason, S)).


%% ?ustate(U, S): the scanner state S with its user_state field set to U.
-define(ustate(U, S), S#xmerl_scanner{user_state = U}).


%% Functions to read the various states out of the global scanner state

%%% @spec user_state(S::global_state()) -> UserState
%%% @doc Returns the user state stored in the global scanner state.
user_state(#xmerl_scanner{user_state = UserState}) ->
    UserState.

%%% @spec event_state(S::global_state()) -> EventState
%%% @doc Returns the state belonging to the event function.
event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = EventState}}) ->
    EventState.

%%% @spec hook_state(S::global_state()) -> HookState
%%% @doc Returns the state belonging to the hook function.
hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = HookState}}) ->
    HookState.

%%% @spec rules_state(S::global_state()) -> RulesState
%%% @doc Returns the state belonging to the rules functions.
rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = RulesState}}) ->
    RulesState.

%%% @spec fetch_state(S::global_state()) -> FetchState
%%% @doc Returns the state belonging to the fetch function.
fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = FetchState}}) ->
    FetchState.

%%% @spec cont_state(S::global_state()) -> ContinuationState
%%% @doc Returns the state belonging to the continuation function.
cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = ContState}}) ->
    ContState.


%%%% Functions to replace the various states in the global scanner state

%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc For controlling the UserState, to be used in a user function.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
user_state(UserState, Scanner) ->
    Scanner#xmerl_scanner{user_state = UserState}.

%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc For controlling the EventState, to be used in an event
%%% function, and called at the beginning and at the end of a parsed entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
event_state(EventState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{event = EventState}}.

%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc For controlling the HookState, to be used in a hook
%%% function, and called when the parser has parsed a complete entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
hook_state(HookState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{hook = HookState}}.

%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc For controlling the RulesState, to be used in a rules
%%% function, and called when the parser stores scanner information in a
%%% rules database.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
rules_state(RulesState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{rules = RulesState}}.

%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc For controlling the FetchState, to be used in a fetch
%%% function, and called when the parser fetches an external resource
%%% (e.g. a DTD).
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
fetch_state(FetchState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{fetch = FetchState}}.

%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc For controlling the ContinuationState, to be used in a continuation
%%% function, and called when the parser encounters the end of the byte stream.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
cont_state(ContState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{cont = ContState}}.


%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv file(Filename, [])
file(Filename) ->
    file(Filename, []).

%% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest}
%%   Rest = list()
%%% @doc Parse file containing an XML document.
%%% If the file cannot be read, the <code>{error, Reason}</code> tuple from
%%% the read attempt is returned instead.
file(Filename, Options) ->
    ExtCharset =
        case lists:keysearch(encoding, 1, Options) of
            false -> undefined;
            {value, {_, Enc}} -> Enc
        end,
    case int_file(Filename, Options, ExtCharset) of
        {error, Reason} ->
            {error, Reason};
        {Doc, Rest, #xmerl_scanner{close_fun = Close} = S} ->
            %% close_fun is run for its side effects only; the state it
            %% returns is discarded.
            Close(S),
            {Doc, Rest}
    end.

%% Reads the whole file and scans it as a document, using the directory of
%% the file as XML base. The third argument is not used.
int_file(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Bin} ->
            Str = binary_to_list(Bin),
            XMLBase = filename:dirname(Filename),
            int_string(Str, Options, XMLBase, Filename);
        Error ->
            Error
    end.

%% Same as int_file/3, but hands the contents to int_string_decl/4.
int_file_decl(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Bin} ->
            Str = binary_to_list(Bin),
            XMLBase = filename:dirname(Filename),
            int_string_decl(Str, Options, XMLBase, Filename);
        Error ->
            Error
    end.

%% @spec string(Text::list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv string(Text, [])
string(Text) ->
    string(Text, []).

%% @spec string(Text::list(),Options::option_list()) -> {document(),Rest}
%%   Rest = list()
%%% @doc Parse string containing an XML document
string(Text, Options) ->
    {Doc, Rest, #xmerl_scanner{close_fun = Close} = S} =
        int_string(Text, Options, file_name_unknown),
    %% close_fun is run for its side effects only; the state it returns
    %% is discarded.
    Close(S),
    {Doc, Rest}.

%% Scans a string using the current working directory as XML base.
int_string(Text, Options, FileName) ->
    {ok, Cwd} = file:get_cwd(),
    int_string(Text, Options, Cwd, FileName).

%% Builds the initial scanner state from Options, lets
%% xmerl_lib:detect_charset/2 inspect the input, and scans the (possibly
%% transformed) input as a complete document.
%%
%% When no encoding attribute is present in the document UTF-8 is the
%% default, but another character set may be detected from a Byte Order
%% Marker or from the encoding of the first 4 bytes.
int_string(Str, Options, XMLBase, FileName) ->
    S0 = initial_state0(Options, XMLBase),
    S = S0#xmerl_scanner{filename = FileName},
    {Str2, S1} =
        case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
            {Source, 'iso-10646-utf-1', Rest}
              when Source =:= auto; Source =:= external ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Rest} ->
                %% no auto detection
                {Rest, S};
            {external, ExtCharset, Rest} ->
                %% no auto detection, ExtCharset is an explicitly provided
                %% 7 bit, 8 bit or utf-8 encoding
                {Rest, S#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_document(Str2, S1).

%% Same preparation as int_string/4, but the input is handed to scan_decl/2
%% instead of scan_document/2.
int_string_decl(Str, Options, XMLBase, FileName) ->
    S0 = initial_state0(Options, XMLBase),
    S = S0#xmerl_scanner{filename = FileName},
    {Str2, S1} =
        case xmerl_lib:detect_charset(S#xmerl_scanner.encoding, Str) of
            {Source, 'iso-10646-utf-1', Rest}
              when Source =:= auto; Source =:= external ->
                {Rest, S#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Rest} ->
                {Rest, S};
            {external, ExtCharset, Rest} ->
                {Rest, S#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_decl(Str2, S1).



%% Builds the scanner state used at the start of a scan: the default
%% callback functions of this module, the given XML base and the shared
%% whitespace data, overridden by whatever Options specify.
initial_state0(Options, XMLBase) ->
    Defaults = #xmerl_scanner{
                  event_fun = fun event/2,
                  hook_fun = fun hook/2,
                  acc_fun = fun acc/3,
                  fetch_fun = fun fetch/2,
                  close_fun = fun close/1,
                  continuation_fun = fun cont/3,
                  rules_read_fun = fun rules_read/3,
                  rules_write_fun = fun rules_write/4,
                  rules_delete_fun = fun rules_delete/3,
                  xmlbase = XMLBase,
                  common_data = common_data()
                 },
    initial_state(Options, Defaults).

%% Applies the options to the scanner state, one by one and in list order.
%% If no rules table has been supplied once all options are processed, a
%% public ETS set is created to hold the rules.
initial_state(Options, S0) ->
    case lists:foldl(fun initial_state_option/2, S0, Options) of
        #xmerl_scanner{rules = undefined} = S ->
            S#xmerl_scanner{rules = ets:new(rules, [set, public])};
        S ->
            S
    end.

%% Applies one option to the scanner state. An option that is unknown, or
%% whose value fails the guard, matches no clause.
initial_state_option({event_fun, F}, S) ->
    S#xmerl_scanner{event_fun = F};
initial_state_option({event_fun, F, EventState}, S) ->
    event_state(EventState, S#xmerl_scanner{event_fun = F});
initial_state_option({acc_fun, F}, S) ->
    S#xmerl_scanner{acc_fun = F};
initial_state_option({hook_fun, F}, S) ->
    S#xmerl_scanner{hook_fun = F};
initial_state_option({hook_fun, F, HookState}, S) ->
    hook_state(HookState, S#xmerl_scanner{hook_fun = F});
initial_state_option({close_fun, F}, S) ->
    S#xmerl_scanner{close_fun = F};
initial_state_option({fetch_fun, F}, S) ->
    S#xmerl_scanner{fetch_fun = F};
initial_state_option({fetch_fun, F, FetchState}, S) ->
    fetch_state(FetchState, S#xmerl_scanner{fetch_fun = F});
initial_state_option({fetch_path, Path}, S) ->
    S#xmerl_scanner{fetch_path = Path};
initial_state_option({continuation_fun, F}, S) ->
    S#xmerl_scanner{continuation_fun = F};
initial_state_option({continuation_fun, F, ContState}, S) ->
    cont_state(ContState, S#xmerl_scanner{continuation_fun = F});
initial_state_option({rules, Rules}, S) ->
    S#xmerl_scanner{rules = Rules,
                    keep_rules = true};
initial_state_option({rules, Read, Write, RulesState}, S) ->
    rules_state(RulesState, S#xmerl_scanner{rules_read_fun = Read,
                                            rules_write_fun = Write,
                                            keep_rules = true});
initial_state_option({user_state, UserState}, S) ->
    S#xmerl_scanner{user_state = UserState};
initial_state_option({space, Space}, S) ->
    S#xmerl_scanner{space = Space};
initial_state_option({line, Line}, S) ->
    S#xmerl_scanner{line = Line};
initial_state_option({namespace_conformant, Flag}, S) when is_boolean(Flag) ->
    S#xmerl_scanner{namespace_conformant = Flag};
initial_state_option({validation, Mode}, S)
  when Mode =:= off; Mode =:= dtd; Mode =:= schema;
       Mode =:= true; Mode =:= false ->
    S#xmerl_scanner{validation = validation_value(Mode)};
initial_state_option({schemaLocation, Locations}, S) when is_list(Locations) ->
    S#xmerl_scanner{schemaLocation = Locations};
initial_state_option({quiet, Flag}, S) when is_boolean(Flag) ->
    S#xmerl_scanner{quiet = Flag};
initial_state_option({doctype_DTD, DTD}, S) ->
    S#xmerl_scanner{doctype_DTD = DTD};
initial_state_option({document, Flag}, S) when is_boolean(Flag) ->
    S#xmerl_scanner{document = Flag};
initial_state_option({comments, Flag}, S) when is_boolean(Flag) ->
    S#xmerl_scanner{comments = Flag};
initial_state_option({default_attrs, Flag}, S) when is_boolean(Flag) ->
    S#xmerl_scanner{default_attrs = Flag};
initial_state_option({text_decl, Bool}, S) ->
    S#xmerl_scanner{text_decl = Bool};
initial_state_option({environment, Env}, S) ->
    S#xmerl_scanner{environment = Env};
initial_state_option({xmlbase, Dir}, S) ->
    S#xmerl_scanner{xmlbase = Dir};
initial_state_option({encoding, Enc}, S) ->
    S#xmerl_scanner{encoding = Enc}.

%% Maps the obsolete boolean validation flags onto the current values;
%% every other value is passed through unchanged.
validation_value(true) -> dtd;
validation_value(false) -> off;
validation_value(Mode) -> Mode.

%% Used for compacting (some) indentations.
%% See also fast_accumulate_whitespace().
%% The result is {Spaces, Tabs, "\n"}, where element N of Spaces (1..60)
%% and of Tabs (1..15) is a newline followed by N spaces or N tabs.
common_data() ->
    Spaces = lists:duplicate(60, $\s),
    Tabs = lists:duplicate(15, $\t),
    {comdata(Spaces, []), comdata(Tabs, []), "\n"}.

%% Collects "\n" followed by every non-empty suffix of the given run of
%% characters, shortest first, into a tuple.
comdata([_ | Shorter] = Run, Collected) ->
    comdata(Shorter, [[$\n | Run] | Collected]);
comdata([], Collected) ->
    list_to_tuple(Collected).

%%% -----------------------------------------------------
%%% Default modifier functions

%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}

%% Default hook function: returns the parsed entity and the state untouched.
hook(Entity, State) ->
    {Entity, State}.

%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%%              line  : integer(),
%%%              col   : integer(),
%%%              data}
%%%
%%% Data                Events
%%% document            started, ended
%%% #xmlElement         started, ended
%%% #xmlAttribute       ended
%%% #xmlPI              ended
%%% #xmlComment         ended
%%% #xmlText            ended

%% Default event function: ignores the event.
event(_Event, State) ->
    State.

%% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%% Below is an example of an acceptable operation: a text node arriving
%% when the accumulator holds exactly one text node is appended to that
%% node; anything else is pushed onto the accumulator.
acc(#xmlText{value = NewText}, [#xmlText{value = OldText} = Prev], State) ->
    {[Prev#xmlText{value = OldText ++ NewText}], State};
acc(Object, Acc, State) ->
    {[Object | Acc], State}.

%% Default fetch function: both system and public identifiers are resolved
%% through their URI.
fetch({system, URI}, State) ->
    fetch_URI(URI, State);
fetch({public, _PublicID, URI}, State) ->
    fetch_URI(URI, State).

%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found.
%%% The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource)
fetch_URI(URI, S) ->
    %% assume URI is a filename
    Parts = filename:split(URI),
    Basename =
        case Parts of
            [] -> [];
            _ -> lists:last(Parts)
        end,
    Fullname =
        case Parts of %% how about Windows systems?
            ["file:" | Name] ->
                %% absolute path, see RFC2396 sect 3, e.g. file:/dtd_name
                filename:join(["/" | Name]);
            ["/" | Rest] when Rest /= [] ->
                %% absolute path name
                URI;
            ["http:" | _] ->
                {http, URI};
            [] ->
                %% empty systemliteral
                [];
            _ ->
                filename:join(S#xmerl_scanner.xmlbase, URI)
        end,
    Path = path_locate(S#xmerl_scanner.fetch_path, Basename, Fullname),
    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
    {ok, Path, S}.

%% Resolves where to read the resource from. An {http, _} tuple and an
%% empty name are returned as they are. Otherwise the first directory in
%% the search path holding a regular file with the given base name wins,
%% and the full name is the fallback once the search path is exhausted.
path_locate(_Dirs, _Basename, {http, _} = URI) ->
    URI;
path_locate(_Dirs, _Basename, []) ->
    [];
path_locate([Dir | Dirs], Basename, Fullname) ->
    Candidate = filename:join(Dir, Basename),
    case file:read_file_info(Candidate) of
        {ok, #file_info{type = regular}} ->
            {file, Candidate};
        _ ->
            path_locate(Dirs, Basename, Fullname)
    end;
path_locate([], _Basename, Fullname) ->
    {file, Fullname}.


%% Default continuation function: no more input is available, so the
%% exception fun is applied to the state.
cont(_Continue, Exception, State) ->
    Exception(State).

%% Default close function: returns the state untouched.
close(State) ->
    State.


%%% -----------------------------------------------------
%%% Scanner

%%% [1] document ::= prolog element Misc*
%% Scans a complete document and returns {Result, Tail, FinalState}.
%% Result is the root element, or an #xmlDocument{} holding the prolog
%% items, the root element and the trailing Misc items when the document
%% option is true. The result is validated according to the validation
%% setting of the scanner.
scan_document(Str0, S = #xmerl_scanner{event_fun = Event,
                                       line = L, col = C,
                                       environment = Env,
                                       encoding = Charset,
                                       document = Document,
                                       validation = ValidateResult}) ->
    Started = #xmerl_event{event = started,
                           line = L,
                           col = C,
                           data = document},
    S1 = Event(Started, S),

    %% Transform to given character set.
    %% Note that if another character set is given in the encoding
    %% attribute in a XML declaration that one will be used later
    Str =
        if
            Charset == "utf-8" ->
                Str0;
            Charset =/= undefined -> % Default character set is UTF-8
                xmerl_ucs:to_unicode(Str0, list_to_atom(Charset));
            true ->
                %% Charset is undefined if no external input is given,
                %% and no auto detection of character encoding was made.
                Str0
        end,
    {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, 1),
    T2 = scan_mandatory("<", T1, 1, S2, expected_element_start_tag),
    {Res, T3, S3} = scan_element(T2, S2, Pos),
    {Misc, _Pos1, Tail, S4} = scan_misc(T3, S3, Pos + 1),

    Ended = #xmerl_event{event = ended,
                         line = S4#xmerl_scanner.line,
                         col = S4#xmerl_scanner.col,
                         data = document},
    S5 = #xmerl_scanner{} = Event(Ended, S4),

    %% Every validation failure is reported the same way: release the
    %% rules table first, then raise the fatal error.
    FailValidation =
        fun(Why) ->
                Cleaned = cleanup(S5),
                ?fatal({failed_validation, Why}, Cleaned)
        end,

    {Res2, S6} =
        case validation_mode(ValidateResult) of
            off ->
                {Res, cleanup(S5)};
            dtd when Env == element; Env == prolog ->
                check_decl2(S5),
                case xmerl_validate:validate(S5, Res) of
                    {'EXIT', {error, Reason}} -> FailValidation(Reason);
                    {'EXIT', Reason} -> FailValidation(Reason);
                    {error, Reason} -> FailValidation(Reason);
                    {error, Reason, _Next} -> FailValidation(Reason);
                    _XML -> {Res, cleanup(S5)}
                end;
            schema ->
                case schemaLocations(Res, S5) of
                    {ok, Schemas} ->
                        _ = cleanup(S5),
                        XSDRes = xmerl_xsd:process_validate(Schemas, Res,
                                                            inherit_options(S5)),
                        handle_schema_result(XSDRes, S5);
                    _ ->
                        {Res, cleanup(S5)}
                end;
            _ ->
                {Res, cleanup(S5)}
        end,

    Res3 =
        case Document of
            false ->
                Res2;
            true ->
                Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]),
                #xmlDocument{content = Content}
        end,
    {Res3, Tail, S6}.


%% Scans only the prolog of Str and reports what follows it:
%% {{UserState, Rest}, [], S} when markup follows, {[], [], S} when the
%% input is exhausted, and otherwise {Rest, [], S} after Rest has been
%% scanned as content.
scan_decl(Str, S = #xmerl_scanner{event_fun = Event,
                                  line = L, col = C}) ->
    Started = #xmerl_event{event = started,
                           line = L,
                           col = C,
                           data = document},
    S1 = Event(Started, S),

    case scan_prolog(Str, S1, 1) of
        {_, _, "<" ++ _ = Rest, S2} ->
            {{S2#xmerl_scanner.user_state, Rest}, [], S2};
        {_, _, [], S2} ->
            {[], [], S2};
        {_, _, Rest, S2} ->
            {_, _, S3} = scan_content(Rest, S2, [], [], S2#xmerl_scanner.space,
                                      [], [], #xmlNamespace{}),
            {Rest, [], S3}
    end.


%%% [22] Prolog
%%% prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%% empty text declarations are handled by the first function clause.
scan_prolog(T, S, Pos) ->
    scan_prolog(T, S, Pos, []).
%% scan_prolog/4 carries the position counter Pos and the accumulator Acc
%% and returns {Acc, Pos, Rest, S}.
%% NOTE: ?bump_col(N) is used throughout with S0 as the state bound in the
%% clause head and S as the state used afterwards, so the names S0 and S
%% must be kept as they are.
%%
%% Clause 1: input exhausted. The continuation function gets a fun that
%% resumes this scan with more bytes, a fun that gives up and returns what
%% has been accumulated, and the current state.
scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
%% Clause 2: "<?xml" followed by whitespace. At line 1, column 1 this is
%% scanned as a text declaration (text_decl option true) or as an XML
%% declaration; anywhere else it is a fatal error.
scan_prolog("<?xml"++T,
            S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},
            Pos,Acc) when ?whitespace(hd(T)) ->
    {Charset, T3, S3} =
        if
            Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                {_,T1,S1} = mandatory_strip(T,S),
                {Decl,T2, S2}=scan_text_decl(T1,S1),
                Encoding=Decl#xmlDecl.encoding,
                {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
            Col==1,L==1 ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                {Decl,T2, S2}=scan_xml_decl(T, S),
                Encoding=Decl#xmlDecl.encoding,
                {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
            true ->
                ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
        end,
    %% Charset0 is either (1) 'iso-10646-utf-1' (transformation by
    %% auto detection), (2) undefined (no auto detection and no
    %% external encoding), (3) any other encoding format that must be
    %% conformant to the internal explicitly given encoding. The two
    %% former cases imply that the explicit internal encoding
    %% (Charset) may be different from Charset0.

    %% Now transform to declared character set.
    if
        Charset==Charset0 -> % Document already transformed to this charset!
            scan_prolog(T3, S3, Pos, Acc);
        Charset0=/=undefined ->
            %% For example may an external entity
            %% have the BOM for utf-16 and the internal
            %% explicit encoding='utf-16', then it will be auto
            %% detected and transformed, Charset0 will be
            %% 'iso-10646-utf-1', and Charset will be 'utf-16', all
            %% legal.
            %%
            scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos,Acc);
        Charset == "utf-8" ->
            scan_prolog(T3, S3, Pos, Acc);
        Charset=/=undefined -> % Document not previously transformed
            %% NOTE(review): Charset is the encoding name read from the
            %% declaration of the document being parsed, so list_to_atom/1
            %% creates a new atom for every distinct name seen in input.
            %% Confirm whether a bounded conversion is wanted here.
            T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)),
            scan_prolog(T4, S3, Pos, Acc);
        true -> % No encoding info given
            scan_prolog(T3, S3, Pos, Acc)
    end;
%% Clause 3: document type declaration, accepted only in the prolog
%% environment. Misc items after it are collected by scan_misc/4.
scan_prolog("<!DOCTYPE" ++ T,
            S0=#xmerl_scanner{environment=prolog,encoding=_Charset},
            Pos, Acc) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    %% If no known character set assume it is UTF-8
    %% (the transformation is disabled, so T1 is always T)
    T1=if
           %% Charset==undefined -> xmerl_ucs:to_unicode(T,'utf-8');
           true -> T
       end,
    {T2, S1} = scan_doctype(T1, S),
    scan_misc(T2, S1, Pos, Acc);
%% Clause 4: input starting with "%" in an external environment is scanned
%% as an external subset.
scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},
            Pos,Acc) ->
    {T, S1} = scan_ext_subset(Str,S),
    {Acc, Pos, T, S1};
%% Clause 5: anything else. Misc items are collected first, then
%% scan_prolog2/4 takes over.
scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},
            Pos,Acc) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Check for Comments, PI before possible DOCTYPE declaration
    %% NOTE(review): the column is bumped by 1 here although Str is passed
    %% on unconsumed; the matching bump in scan_prolog2/4 is commented
    %% out. Confirm this is intended.
    ?bump_col(1),
    %% If no known character set assume it is UTF-8
    %% (the transformation is disabled, so T is always Str)
    T=if
%%        Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8');
          true -> Str
      end,
    {Acc1, Pos1, T1, S1}=scan_misc(T, S, Pos, Acc),
    scan_prolog2(T1,S1,Pos1,Acc1).



%% scan_prolog2/4 handles what follows the leading Misc items of the
%% prolog and returns {Acc, Pos, Rest, S}.
%%
%% Clause 1: input exhausted, see scan_prolog/4.
scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
%% Clause 2: document type declaration in the prolog environment.
scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog},
            Pos, Acc) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    {T1, S1} = scan_doctype(T, S),
    scan_misc(T1, S1, Pos, Acc);
%% Clause 3: any other "<!" markup is scanned as an external subset.
scan_prolog2(Str = "<!" ++ _, S, Pos, Acc) ->
    ?dbg("prolog(\"<!\")~n", []),
    %% In e.g. a DTD, we jump directly to markup declarations
    {T, S1} = scan_ext_subset(Str, S),
    {Acc, Pos, T, S1};
%% Clause 4: no DOCTYPE in the document.
scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos,Acc) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Here we consider the DTD provided by doctype_DTD option,
    %% which is fetched only when validation is 'dtd' and the option
    %% value is a list.
    S1 =
        case S0 of
            #xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) ->
                S=fetch_DTD(undefined,S0),
                check_decl(S),
                S;
            _ -> S0
        end,
    %% Check for more Comments and PI after DOCTYPE declaration
%    ?bump_col(1),
    scan_misc(Str, S1, Pos, Acc).




%%% [27] Misc ::= Comment | PI | S
%% Note:
%% - Each PI, and each Comment when the comments option is true, is handed
%%   to the acc_fun together with the accumulator; whitespace is skipped.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
%% Returns {Acc, Pos, Rest, S}, where Rest starts at the first thing that
%% is not a Comment, a PI or whitespace.
scan_misc(T, S, Pos) ->
    scan_misc(T, S, Pos, []).
scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
scan_misc("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F, comments=CF}, Pos, Acc) -> % Comment
    ?bump_col(4),
    {C, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
    case CF of
        true ->
            %% The acc_fun decides the next position itself when it
            %% returns a 3-tuple; otherwise the position advances by one.
            {Acc2, Pos2, S3} =
                case F(C, Acc, S1) of
                    {Acc1, S2} ->
                        {Acc1, Pos + 1, S2};
                    {Acc1, Pos1, S2} ->
                        {Acc1, Pos1, S2}
                end,
            scan_misc(T1, S3, Pos2, Acc2);
        false ->
            %% Comments are scanned but dropped.
            scan_misc(T1, S1, Pos, Acc)
    end;
scan_misc("<?" ++ T, S0=#xmerl_scanner{acc_fun = F}, Pos, Acc) -> % PI
    ?dbg("prolog(\"<?\")~n", []),
    ?bump_col(2),
    {PI, T1, S1} = scan_pi(T, S, Pos, []),
    {Acc2, Pos2, S3} = case F(PI, Acc, S1) of
                           {Acc1, S2} ->
                               {Acc1, Pos + 1, S2};
                           {Acc1, Pos1, S2} ->
                               {Acc1, Pos1, S2}
                           end,
    scan_misc(T1,S3,Pos2,Acc2);
scan_misc(T=[H|_T], S, Pos, Acc) when ?whitespace(H) ->
    ?dbg("prolog(whitespace)~n", []),
    {_,T1,S1}=strip(T,S),
    scan_misc(T1,S1,Pos,Acc);
scan_misc(T,S,Pos,Acc) ->
    {Acc,Pos,T,S}.


%% Deletes the rules table and clears the reference to it, unless the
%% scanner was told to keep the rules (keep_rules is not false).
cleanup(S=#xmerl_scanner{keep_rules = false,
                         rules = Rules}) ->
    ets:delete(Rules),
    S#xmerl_scanner{rules = undefined};
cleanup(S) ->
    S.

%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%% Scans the mandatory version attribute and hands the rest of the
%% declaration to scan_xml_decl/3.
scan_xml_decl(T, S) ->
    %% VersionInfo [24] is mandatory
    {_,T1,S1} = mandatory_strip(T,S),
    {T2,S2} =
        case T1 of
            "version" ++ _T2 ->
                %% 7 is the length of "version"
                {_T2,S1#xmerl_scanner{col=S1#xmerl_scanner.col+7}};
            _ -> ?fatal(expected_version_attribute,S1)
        end,
    {T3, S3} = scan_eq(T2, S2),
    {Vsn, T4, S4} = scan_xml_vsn(T3, S3),
    Attr = #xmlAttribute{name = version,
                         parents = [{xml, _XMLPos = 1}],
                         value = Vsn},
    scan_xml_decl(T4, S4, #xmlDecl{vsn = Vsn,
                                   attributes = [Attr]}).

%% Continues an XML declaration after the version attribute: either the
%% declaration ends here, or whitespace introduces further attributes.
%% (?bump_col(N) relies on the names S0 and S.)
scan_xml_decl([], S = #xmerl_scanner{continuation_fun = Continue}, Decl) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
    GiveUp = fun(S1) -> {[], [], S1} end,
    Continue(Resume, GiveUp, S);
scan_xml_decl("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T, S, Decl);
scan_xml_decl(T = [H | _], S = #xmerl_scanner{}, Decl) when ?whitespace(H) ->
    {_, Rest, S1} = mandatory_strip(T, S),
    scan_xml_decl2(Rest, S1, Decl);
scan_xml_decl(_T, S = #xmerl_scanner{}, _Decl) ->
    ?fatal(preformat([expected,one,'of:'],['?>',whitespace_character],","),S).

%% After the version attribute and whitespace: end of the declaration, an
%% encoding declaration, or a standalone declaration.
scan_xml_decl2("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T, S, Decl);
scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [80] EncodingDecl
    ?bump_col(8),
    {T1, S1} = scan_eq(T, S),
    {RawName, T2, S2} = scan_enc_name(T1, S1),
    %% the encoding name is stored in lower case
    Encoding = xmerl_lib:to_lower(RawName),
    Attr = #xmlAttribute{name = encoding,
                         parents = [{xml, 1}],
                         value = Encoding},
    Decl = Decl0#xmlDecl{encoding = Encoding,
                         attributes = [Attr | Attrs]},
    AttrEnded = #xmerl_event{event = ended,
                             line = S0#xmerl_scanner.line,
                             col = S0#xmerl_scanner.col,
                             data = Attr},
    S3 = #xmerl_scanner{} = Event(AttrEnded, S2),
    %% Whitespace is mandatory unless the declaration ends right here.
    {T3, S4} =
        case T2 of
            "?>" ++ _ ->
                {T2, S3};
            _ ->
                {_, Stripped, SStripped} = mandatory_strip(T2, S3),
                {Stripped, SStripped}
        end,
    scan_xml_decl3(T3, S4, Decl);
scan_xml_decl2("standalone" ++ _ = T, S, Decl) ->
    scan_xml_decl3(T, S, Decl);
scan_xml_decl2(_BadString, S, _Decl) ->
    ?fatal(preformat([expected,one,'of:'],['?>',standalone,encoding],","),S).

%% scan_xml_decl3/3
%% Handles the optional standalone declaration and the closing "?>".
%% NOTE(review): there is no clause for any other input, so unexpected
%% text after the encoding part fails with function_clause -- confirm
%% whether a ?fatal report is wanted instead.
scan_xml_decl3("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [32] SDDecl
    ?bump_col(10),
    {T1, S1} = scan_eq(T, S),
    {StValue,T2,S2}=scan_standalone_value(T1,S1),
    Attr = #xmlAttribute{name = standalone,
                         parents = [{xml, _XMLPos = 1}],
                         value = StValue},
    Decl = Decl0#xmlDecl{standalone = StValue,
                         attributes = [Attr|Attrs]},
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S0#xmerl_scanner.line,
                                               col = S0#xmerl_scanner.col,
                                               data = Attr}, S2),
    {_,T3,S4} = strip(T2,S3),
    %% scan_mandatory/5 only returns the remaining input, so the column
    %% is advanced for the two characters of "?>" explicitly below.
    T4 = scan_mandatory("?>",T3,2,S4,expected_xml_decl_endtag),
%%    "?>" ++ T4 = T3,
    return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl).


%% return_xml_decl/3
%% Finishes the XML declaration: puts the attributes back in document
%% order (they were accumulated in reverse), emits an 'ended' event and
%% returns {Decl, RestOfInput, State}.
return_xml_decl(T,S=#xmerl_scanner{hook_fun = _Hook,
                                   event_fun = Event},
                Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S#xmerl_scanner.line,
                                               col = S#xmerl_scanner.col,
                                               data = Decl}, S1),
%%    {Ret, S3} = Hook(Decl, S2),
%%    {Ret, T1, S3}.
    {Decl, T1, S2}.


%% scan_standalone_value/2
%% Accepts exactly 'yes' or 'no' in single or double quotes. Only "yes"
%% is recorded in the scanner state (standalone=yes).
%% NOTE(review): any other value fails with function_clause.
scan_standalone_value("'yes'" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("\"yes\"" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("'no'" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S};
scan_standalone_value("\"no\"" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S}.

%%%
%%% Text declaration XML 1.0 section 4.3.1
%%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
%%
%% scan_text_decl/2
%% Scans an optional version followed by the mandatory encoding
%% declaration, emits an 'ended' event for the encoding attribute and
%% continues with scan_text_decl/3 to consume the closing "?>".
scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
    {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
    T2 =
        case T1 of
            "encoding" ++ _T2 -> _T2;
            _ ->
                ?fatal(expected_encoding_attribute,S1)
        end,
    %% 8 = length of the keyword "encoding"
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
    {T3, S3} = scan_eq(T2, S2),
    {EncName, T4, S4} = scan_enc_name(T3, S3),
    LowEncName=xmerl_lib:to_lower(EncName),
    ?strip5,
    Attr = #xmlAttribute{name = encoding,
                         parents = [{xml,1}],
                         value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
                         attributes = [Attr|Attrs]},
    S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                             line = S5#xmerl_scanner.line,
                                             col = S5#xmerl_scanner.col,
                                             data = Attr}, S5),
    scan_text_decl(T5,S6,Decl).

%% scan_text_decl/3
%% Consumes the closing "?>" of a text declaration and returns
%% {Decl, RestOfInput, State}; any other leading character is fatal.
scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = _Hook,
                                           event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?bump_col(2),
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S0#xmerl_scanner.line,
                                               col = S0#xmerl_scanner.col,
                                               data = Decl}, S1),
%%    {Ret, S3} = Hook(Decl, S2),
%%    {Ret, T1, S3};
    {Decl, T1, S2};
scan_text_decl([H|_T],S,_) ->
    ?fatal({unexpected_character_in_text_declaration,H},S).

%% scan_optional_version/2
%% If the input starts with "version", scans `version = "..."` plus the
%% mandatory whitespace after it and returns an #xmlDecl{} holding the
%% version attribute; otherwise returns an #xmlDecl{} with no attributes
%% and the input untouched.
scan_optional_version("version"++T,S0) ->
    ?bump_col(7),
    ?strip1,
    {T2, S2} = scan_eq(T1, S1),
    {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
    {#xmlDecl{attributes=[Attr]},T4,S4};
scan_optional_version(T,S) ->
    {#xmlDecl{attributes=[]},T,S}.



%%%%%%% [81] EncName
%% scan_enc_name/2
%% Expects the opening quote (" or ') of an encoding name and then scans
%% the name itself. Returns {EncName, RestOfInput, State}.
%%
%% The opening delimiter must be a quote character. The previous guard
%% (`H >= $"; H =< $'`) was an OR of two open ranges and therefore true
%% for every character, so any character was accepted as delimiter.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name([H|T], S0) when H =:= $"; H =:= $' ->
    ?bump_col(1),
    scan_enc_name(T, S, H, []);
scan_enc_name([H|_T], S) ->
    %% Anything but a quote is malformed input: report it the same way
    %% as an illegal first character of the name.
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).


%% scan_enc_name/4
%% Scans the first character of the name, which must be a Latin letter.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|_T],S,_Delim,_Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).

%% scan_enc_name2/4
%% Scans the remaining characters of the name (letters, digits, '.',
%% '_' and '-') up to the closing delimiter, which must be the same
%% character that opened the literal. Acc is built in reverse.
scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name2([H|T], S0, H, Acc) ->
    ?bump_col(1),
    {lists:reverse(Acc), T, S};
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H == $.; H == $_; H == $- ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|_T], S, _Delim, _Acc) ->
    %% Illegal character inside the name: report it instead of failing
    %% with function_clause.
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).


%%%%%%% [26] VersionNum
%%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
%% scan_xml_vsn/2
%% Expects an opening quote and scans the version number up to the
%% matching closing quote. Returns {Vsn, RestOfInput, State}.
scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_xml_vsn([H|T], S) when H==$"; H==$'->
    xml_vsn(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []).

%% xml_vsn/4
%% Collects version characters (in reverse) until the delimiter that
%% opened the literal is seen again; any character outside
%% [a-zA-Z0-9_.:-] is fatal.
xml_vsn([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
      fatal_fun(unexpected_end),
      S);
xml_vsn([H|T], S=#xmerl_scanner{col = C}, H, Acc) ->
    {lists:reverse(Acc), T, S#xmerl_scanner{col = C+1}};
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $a, H =< $z ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $A, H =< $Z ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $0, H =< $9 ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) ->
    case lists:member(H, "_.:-") of
        true ->
            xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
        false ->
            ?fatal({invalid_vsn_char, H}, S)
    end.

%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

%% scan_pi/4
%% Scans the target name of a processing instruction and hands over to
%% scan_pi/8 for the rest. Targets starting with "xml" (any case) are
%% only accepted when scan_wellknown_pi/4 recognises them.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos, Ps) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos, Ps) end,
      fatal_fun(unexpected_end),
      S);
scan_pi(Str = [H1,H2,H3 | T],S0=#xmerl_scanner{line = L, col = C}, Pos, Ps)
  when H1==$x;H1==$X ->
    %% names beginning with [xX][mM][lL] are reserved for future use.
    ?bump_col(3),
    if
        ((H2==$m) or (H2==$M)) and
        ((H3==$l) or (H3==$L)) ->
            scan_wellknown_pi(T,S,Pos,Ps);
        true ->
            %% NOTE(review): Str still contains the three characters
            %% inspected above while S was taken after ?bump_col(3); if
            %% scan_name/2 also advances col per character the column
            %% would be counted twice here -- confirm.
            {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
            scan_pi(T1, S1, Target, L, C, Pos, Ps, [])
    end;
scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos, Ps) ->
    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
    scan_pi(T1, S1, Target, L, C, Pos, Ps, []).


%%% More info on xml-stylesheet can be found at:
%%%   "Associating Style Sheets with XML documents", Version 1.0,
%%%   W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
%% scan_wellknown_pi/4
%% Called with the input that follows "xml"; only "xml-stylesheet" is
%% accepted, anything else is a fatal invalid_target_name error that
%% quotes at most the first 10 characters of the remaining input.
scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos,Ps) ->
    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
    %% NOTE(review): "-stylesheet" is 11 characters but the column is
    %% advanced by 16 -- confirm the intended column accounting.
    ?bump_col(16),
    scan_pi(T, S, "xml-stylesheet",L,C,Pos,Ps,[]);
scan_wellknown_pi(Str,S,_Pos,_Ps) ->
    ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).



%% scan_pi/8
%% Called right after the PI target: the instruction either ends
%% directly with "?>" or whitespace must follow before the PI content,
%% which is scanned by scan_pi2/8. L and C are the line/column of the
%% start of the target and are used for the 'ended' event.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,
        L, C, Pos, Ps, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target,
                                    L, C, Pos, Ps, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                       event_fun = Event},
        Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                parents = Ps,
                pos = Pos,
                value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = PI}, S),
    %% The hook fun decides what is actually returned for the PI.
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi([H|T], S, Target, L, C, Pos, Ps, Acc) when ?whitespace(H) ->
    ?strip1,
    scan_pi2(T1, S1, Target, L, C, Pos, Ps, Acc);
scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Ps, _Acc) ->
    ?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S).

%% scan_pi2/8
%% Collects the content of a processing instruction one character at a
%% time (in reverse, in Acc) until "?>" is found, then builds the
%% #xmlPI{}, emits an 'ended' event and returns whatever the hook fun
%% returns for it as {Ret, RestOfInput, State}.
scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,
         L, C, Pos, Ps, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target,
                                     L, C, Pos, Ps, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                        event_fun = Event},
         Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                parents = Ps,
                pos = Pos,
                value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi2(Str, S0, Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(1),
    %% wfc_legal_char/2 yields the next character and the remaining
    %% input; presumably it rejects characters that are not legal XML
    %% characters (defined outside this chunk).
    {Ch,T} = wfc_legal_char(Str,S),
    scan_pi2(T, S, Target, L, C, Pos, Ps, [Ch|Acc]).



%% [28] doctypedecl ::=
%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%% scan_doctype/2
%% Scans the mandatory whitespace and the document type name, stores the
%% name in the scanner state (doctype_name) and continues with
%% scan_doctype1/2. Returns what scan_doctype1/2 returns.
scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype(T, S) ->
    {_,T1,S1} = mandatory_strip(T,S),
    {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    ?strip3,
    scan_doctype1(T3, S3#xmerl_scanner{doctype_name = DTName}).


%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
%%                   | 'PUBLIC' S PubidLiteral S SystemLiteral
%% scan_doctype1/2
%% Scans the optional external identifier of a DOCTYPE declaration and
%% passes it on to scan_doctype2/3 as {public, PubId, SystemLiteral},
%% {system, SystemLiteral} or undefined when there is none.
scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    ?strip5,
    scan_doctype2(T5, S5, {public, PIDL, SL});
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    ?strip3,
    scan_doctype2(T3, S3, {system, SL});
scan_doctype1(T, S) ->
    scan_doctype2(T, S, undefined).


%% scan_doctype2/3
%% After the optional ExternalID either the internal subset starts
%% ("[") or the declaration ends (">"). When it ends here the external
%% DTD (if any) is fetched and the collected declarations are checked.
%% Returns {RestOfInput, State}.
scan_doctype2([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype2("[" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2(">" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    {T1, S2};
scan_doctype2(_T,S,_DTD) ->
    ?fatal(expected_end_of_DOCTYPE_declaration, S).

%% [28a] DeclSep    ::= PEReference | S
%% [28b] intSubset  ::= (markupdecl | DeclSep)*
%% scan_doctype3/3
%% Scans the internal subset of a DOCTYPE declaration: parameter entity
%% references, markup declarations and finally "]" followed by ">".
%% The external DTD given by DTD is fetched only after the internal
%% subset has been read. Returns {RestOfInput, State}.
scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ?strip2,
    case expand_pe_reference(PERefName, S2,as_PE) of
        %% External parameter entities are fetched and parsed as DTD
        %% text; the three tuple shapes are handled identically.
        {system, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        {public, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        {public, _, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
            %% Internal entity: its replacement text is put in front of
            %% the remaining input and scanned in place.
            {_,T3,S3} = strip(ExpRef++T2,S2),
            scan_doctype3(T3,S3,DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    T2 = scan_mandatory(">",T1,1,S2,expected_doctype_end_tag),
%%    ">" ++ T2 = T1,
    {T2, S2};
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).



%% fetch_DTD/2
%% Fetches and parses an external DTD. With no DTD specification in the
%% document, a URI supplied through the doctype_DTD field is used once
%% (the field is then set to option_provided); without either, the
%% state is returned unchanged.
fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when is_list(URI)->
    %% allow to specify DTD name when it isn't available in xml stream
    fetch_DTD({system,URI},S#xmerl_scanner{doctype_DTD=option_provided});
fetch_DTD(undefined, S) ->
    S;
% fetch_DTD(_,S=#xmerl_scanner{validation=false}) ->
%     S;
fetch_DTD(DTDSpec, S)->
    case fetch_and_parse(DTDSpec,S,[{text_decl,true},
                                    {environment,{external,subset}}]) of
        NewS when is_record(NewS,xmerl_scanner) ->
            NewS;
        {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules
            S
    end.

%% fetch_and_parse/3
%% Asks the fetch fun for the external resource ExtSpec and, when data
%% comes back, parses it with a scanner that inherits the callbacks and
%% settings of the state returned by the fetch fun (appended after
%% Options0). When the outcome is a scanner state, its text_decl flag is
%% cleared and the caller's environment restored; any other outcome is
%% returned untouched. An unexpected answer from the fetch fun is fatal.
fetch_and_parse(ExtSpec,
                #xmerl_scanner{fetch_fun = FetchFun,
                               rules = RulesTab,
                               xmlbase = Base} = S,
                Options0) ->
    Outcome =
        case FetchFun(ExtSpec, S) of
            {ok, LegacyS} ->
                %% For backward compatibility only. This will be removed later!!
                LegacyS;
            {ok, not_fetched, SkippedS} ->
                SkippedS;
            {ok, Fetched, #xmerl_scanner{} = FetchedS} ->
                EventState = event_state(FetchedS),
                HookState = hook_state(FetchedS),
                FetchState = fetch_state(FetchedS),
                ContState = cont_state(FetchedS),
                Enc = FetchedS#xmerl_scanner.encoding,
                Inherited =
                    [{fetch_path, FetchedS#xmerl_scanner.fetch_path},
                     {user_state, FetchedS#xmerl_scanner.user_state},
                     {rules, RulesTab},
                     {event_fun, FetchedS#xmerl_scanner.event_fun, EventState},
                     {hook_fun, FetchedS#xmerl_scanner.hook_fun, HookState},
                     {fetch_fun, FetchedS#xmerl_scanner.fetch_fun, FetchState},
                     {close_fun, FetchedS#xmerl_scanner.close_fun},
                     {continuation_fun,
                      FetchedS#xmerl_scanner.continuation_fun, ContState},
                     {rules,
                      FetchedS#xmerl_scanner.rules_read_fun,
                      FetchedS#xmerl_scanner.rules_write_fun, ""},
                     {acc_fun, FetchedS#xmerl_scanner.acc_fun},
                     {validation, FetchedS#xmerl_scanner.validation},
                     {quiet, FetchedS#xmerl_scanner.quiet},
                     {encoding, Enc}],
                AllOptions = Options0 ++ Inherited,
                case Fetched of
                    {file, Path} ->
                        int_file_decl(Path, AllOptions, Enc);
                    {string, Text} ->
                        int_string_decl(Text, AllOptions, Base,
                                        file_name_unknown);
                    _ ->
                        %% other scheme
                        {Fetched, [], FetchedS}
                end;
            Unexpected ->
                ?fatal({error_fetching_DTD, {ExtSpec, Unexpected}}, S)
        end,
    if
        is_record(Outcome, xmerl_scanner) ->
            Outcome#xmerl_scanner{text_decl = false,
                                  environment = S#xmerl_scanner.environment};
        true ->
            Outcome
    end.


%% fetch_not_parse/2
%% Fetches the external resource ExtSpec without parsing it. Returns
%% {Content, State} where State is the state handed back by the fetch
%% fun with its filename field set to the location of the data.
%% A resource that was not fetched, or any unexpected answer from the
%% fetch fun, is fatal.
fetch_not_parse(ExtSpec, #xmerl_scanner{fetch_fun = FetchFun} = S) ->
    case FetchFun(ExtSpec, S) of
        {ok, not_fetched, _Ignored} ->
            ?fatal({error_fetching_external_source,ExtSpec},S);
        {ok, {file, Path}, FetchedS} ->
            {get_file(Path, S), FetchedS#xmerl_scanner{filename = Path}};
        {ok, {string, Bin}, FetchedS} ->
            {binary_to_list(Bin),
             FetchedS#xmerl_scanner{filename = file_name_unknown}};
        {ok, {http, URI}, FetchedS} ->
            {{http, URI}, FetchedS#xmerl_scanner{filename = URI}};
        {ok, Other, FetchedS} ->
            %% Any other answer must already be a {Content, Location} pair.
            {Content, Location} = Other,
            {Content, FetchedS#xmerl_scanner{filename = Location}};
        _ ->
            ?fatal({error_fetching_external_resource,ExtSpec},S)
    end.

%% get_file/2
%% Reads the whole file F and returns its bytes as a list; a read
%% failure is fatal.
get_file(F,S) ->
%     ?dbg("get_file F=~p~n",[F]),
    ReadResult = file:read_file(F),
    case ReadResult of
        {ok, Contents} ->
            binary_to_list(Contents);
        _ ->
            ?fatal({error_reading_file,F,ReadResult},S)
    end.
%% check_decl/1
%% Now it is necessary to check that all referenced types are declared,
%% since it is legal to reference some xml types before they are
%% declared. The checks only run when validation is set to dtd.
check_decl(#xmerl_scanner{validation = dtd, rules = Tab} = S) ->
    check_notations(Tab,S),
    check_elements(Tab,S), %% check also attribute defs for element
    check_entities(Tab,S);
check_decl(#xmerl_scanner{}) ->
    ok.

%% check_notations/2
%% Fatal if the rules table still holds a notation that was referenced
%% but never declared.
check_notations(Tab,S) ->
    Undeclared = ets:match(Tab,{{notation,'$1'},undeclared}),
    case Undeclared of
        [] ->
            ok;
        [[]] ->
            ok;
        [[First|_]] ->
            ?fatal({error_missing_declaration_in_DTD,First},S);
        _ ->
            ?fatal({error_missing_declaration_in_DTD,Undeclared},S)
    end.

%% check_elements/2
%% Walks all element definitions in the rules table, 10 at a time, and
%% checks the attribute definitions of each one. Returns ok.
%%
%% ets:match/3 is wrapped in try ... of so that only the table lookup is
%% protected. With the old-style `catch`, a failure produced
%% {'EXIT',Reason}, which is a 2-tuple and was therefore mistaken for a
%% {Matches,Continuation} result instead of being reported.
check_elements(Tab,S) ->
    try ets:match(Tab,{{elem_def,'_'},'$2'},10) of
        Result ->
            check_elements_chunks(Result,S)
    catch
        error:Reason ->
            ?fatal({error_missing_declaration_in_DTD,Reason},S)
    end.

%% Checks one chunk of matches and follows the continuation until the
%% table is exhausted.
check_elements_chunks('$end_of_table',_S) ->
    ok;
check_elements_chunks({Match,Cont},S) ->
    lists:foreach(fun(X) -> check_elements2(X,S) end, Match),
    case Cont of
        '$end_of_table' ->
            ok;
        _ ->
            check_elements_chunks(ets:match(Cont),S)
    end.

% it is not an error to declare attributes for an element that is not
% declared.
check_elements2([#xmlElement{attributes=Attrs}],S) ->
    check_attributes(Attrs,S);
check_elements2(_,_) ->
    ok.

%% check_attributes/2
%% Runs the validity checks that depend on the attribute type
%% (second element of each attribute definition tuple):
%%   'ID'               - at most one ID attribute per element, plus
%%                        vc_ID_Attribute_Default/2
%%   {enumeration,_}    - vc_Enumeration/2
%%   'ENTITY'/'ENTITIES'- vc_Entity_Name/2
%% All other definitions are skipped.
check_attributes([{N1,'ID',_,_,_}=Attr|Rest],S) ->
    case lists:keyfind('ID',2,Rest) of
        false ->
            ok;
        Att2 ->
            ?fatal({error_more_than_one_ID_def,N1,element(1,Att2)},S)
    end,
    vc_ID_Attribute_Default(Attr,S),
    check_attributes(Rest,S);
check_attributes([{_,{enumeration,_},_,_,_}=Attr|T],S) ->
    vc_Enumeration(Attr,S),
    check_attributes(T,S);
check_attributes([{_,Ent,_,_,_}=Attr|T],S)
  when Ent=='ENTITY';Ent=='ENTITIES' ->
    vc_Entity_Name(Attr,S),
    check_attributes(T,S);
check_attributes([_|T],S) ->
    check_attributes(T,S);
check_attributes([],_S) ->
    ok.

%% check_entities/2
%% When validating against a DTD, fatal if the rules table still holds
%% an entity that was referenced but never declared.
check_entities(Tab,S=#xmerl_scanner{validation=dtd}) ->
    case ets:match(Tab,{{entity,'$1'},undeclared}) of
        [[]] -> ok;
        []   -> ok;
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end;
check_entities(_,_) ->
    ok.


%% check_decl2/1: checks that all referenced ID attributes are declared
check_decl2(S=#xmerl_scanner{rules=Tab}) ->
    check_referenced_ids(Tab,S).


%% check_referenced_ids/2
%% Fatal if the rules table still holds an ID that was referenced but
%% never declared.
check_referenced_ids(Tab,S) ->
    case ets:match(Tab,{{id,'$1'},undeclared}) of
        [[]] -> ok;
        []   -> ok;
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl

%% scan_ext_subset/2
%% Scans an external subset: parameter entity references, conditional
%% sections, whitespace and markup declarations, until the input is
%% exhausted. When the continuation fun has no more data the result is
%% {[], State}.
scan_ext_subset([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
      fun(S1) -> {[], S1} end,
      S);
scan_ext_subset("%" ++ T, S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {T1,S1} = scan_decl_sep(T,S),
    scan_ext_subset(T1, S1);
scan_ext_subset("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    scan_ext_subset(T2,S2);
scan_ext_subset(T, S) when ?whitespace(hd(T)) ->
    {_,T1,S1} = strip(T,S),
    scan_ext_subset(T1, S1);
scan_ext_subset(T, S) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_ext_subset(T1, S1).


%%%%%%% [28a] DeclSep ::= PEReference | S
%% scan_decl_sep/2
%% Scans a parameter entity reference, expands it (fetching the text
%% when the entity is external) and scans the replacement text as an
%% external subset of its own. Returns {RestOfInput, State} where
%% RestOfInput is the input following the reference.
scan_decl_sep(T,S) ->
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    {ExpandedRef,S2} =
        case expand_pe_reference(PERefName,S1,as_PE) of
            Tuple when is_tuple(Tuple) ->
                %% {system,URI} or {public,URI}
                %% Only the fetched text is used; the state returned by
                %% fetch_not_parse/2 is dropped.
                {ExpRef,_Sx}=fetch_not_parse(Tuple,S1),
                {ExpRef,S1};
            ExpRef ->
                {ExpRef,S1}
        end,
    {_,TRef,S3} = strip(ExpandedRef,S2),
    {_,S4}=scan_ext_subset(TRef,S3),
    {T1,S4}.
% scan_decl_sep(T,S=#xmerl_scanner{rules_read_fun=Read,
% 				 rules_write_fun=Write,
% 				 rules_delete_fun=Delete}) ->
%     {PERefName, T1, S1} = scan_pe_reference(T, S),
%     {ExpandedRef,S2} =
% 	case expand_pe_reference(PERefName,S1,as_PE) of
% 	    Tuple when tuple(Tuple) ->
% 		%% {system,URI} or {public,URI}
% 		{ExpRef,Sx}=fetch_not_parse(Tuple,S1),
% 		{EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim,
% 						 PERefName,parameter),
% 		%% should do an update Write(parameter_entity) so next
% 		%% expand_pe_reference is faster
% 		Delete(parameter_entity,PERefName,_S2),
% 		_S3 = Write(parameter_entity,PERefName,EntV,_S2),
% 		EntV2 = Read(parameter_entity,PERefName,_S3),
% 		{" " ++ EntV2 ++ " ",_S3};
% 	    ExpRef ->
% 		{ExpRef,S1}
% 	end,
%     {_, T3, S3} = strip(ExpandedRef,S2),
%     {_T4,S4} = scan_ext_subset(T3,S3),
%     strip(T1,S4).

%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect

%% scan_conditional_sect/2
%% Called after "<![" and optional whitespace. Dispatches on the
%% IGNORE/INCLUDE keyword; a parameter entity reference in keyword
%% position is expanded and the result scanned again.
%% NOTE(review): scan_mandatory/5 is given S (the state before
%% whitespace was stripped) and the column is not advanced for the "["
%% it consumes -- confirm whether S1 and a column bump were intended.
%% NOTE(review): any other keyword fails with function_clause.
scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    ?strip1,
    T2 = scan_mandatory("[",T1,1,S,expected_IGNORE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_ignore(T3,S3);
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    T2 = scan_mandatory("[",T1,1,S,expected_INCLUDE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_include(T3, S3);
scan_conditional_sect("%"++T,S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_conditional_sect(T2,S2).


%%%% [63] ignoreSect	::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%% scan_ignore/2
%% Skips the contents of an IGNORE section up to its matching "]]>".
%% Returns {[], RestOfInput, State}.
scan_ignore(Str,S) ->
    scan_ignore(Str,S,0).

%% scan_ignore/3
%% Level counts nested "<![" openings so that only the "]]>" closing the
%% outermost section (Level 0) ends the scan.
scan_ignore([], S=#xmerl_scanner{continuation_fun = F},Level) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end,
      fatal_fun(unexpected_end),
      S);
scan_ignore("<![" ++ T, S0,Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(T, S,Level+1);
scan_ignore("]]>" ++ T, S0,0) ->
    ?bump_col(3),
    {[], T, S};
scan_ignore("]]>" ++ T, S0,Level) ->
    ?bump_col(3),
    scan_ignore(T, S,Level-1);
scan_ignore([_H|T],S0,Level) ->
    ?bump_col(1),
    scan_ignore(T,S,Level).


%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%% scan_include/2
%% Scans the contents of an INCLUDE section (parameter entity
%% references, nested conditional sections and markup declarations) up
%% to the closing "]]>". Returns {[], RestOfInput, State}.
scan_include([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_include("]]>" ++ T, S0) ->
    ?bump_col(3),
    {[], T, S};
scan_include("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_include(T2, S2);
scan_include("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3,S3);
scan_include(T, S) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_include(T1, S1).


%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
%%%%%%%                     NotationDecl | PI |Comment
%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'

%% Validity constraint: Unique Type Declaration: No element type may be
%% declared more than once.
%%
%% scan_markup_decl/2
%% Scans one markup declaration (comment, PI, ELEMENT, ENTITY, NOTATION
%% or ATTLIST) and records it through the rules write fun. Anything
%% else is a fatal expected_markup error.
scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_markup_decl("<!--" ++ T, S0) ->
    ?bump_col(4),
    scan_comment(T, S);
scan_markup_decl("<?" ++ T, S0) ->
    ?bump_col(2),
    {_PI, T1, S1} = scan_pi(T, S,_Pos=markup,[]),
    strip(T1, S1);
scan_markup_decl("<!ELEMENT" ++ T,
                 #xmerl_scanner{rules_read_fun = Read,
                                rules_write_fun = Write,
                                rules_delete_fun = Delete} = S0) ->
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    Element =
        case Read(elem_def, Ename, S2) of
            El = #xmlElement{elementdef=Decl} when Decl =/= undeclared ->
                %% Already declared: only an error when validating
                %% against a DTD, otherwise the old entry is replaced.
                case S2#xmerl_scanner.validation of
                    dtd ->
                        ?fatal({already_defined, Ename}, S2);
                    _ ->
                        Delete(elem_def,Ename,S2),
                        El
                end;
            El = #xmlElement{} ->
                Delete(elem_def,Ename,S2),
                El;
            undefined ->
                #xmlElement{}
        end,
    {_,T3,S3} = mandatory_strip(T2,S2),
    {Edef, T4, S4} = scan_contentspec(T3, S3),
    ?strip5,
    {">" ++ T6,S6} = scan_element_completion(T5,S5),
    %% The column is advanced by one for the closing ">".
    S7 = Write(elem_def, Ename,
               Element#xmlElement{name = Ename,
                                  content = Edef,
                                  elementdef=S6#xmerl_scanner.environment},
               S6#xmerl_scanner{col=S6#xmerl_scanner.col+1}),
    strip(T6,S7);
scan_markup_decl("<!ENTITY" ++ T, S0) ->
    %% <!ENTITY [%] entity.name NDATA notation.name>
    %% <!ENTITY [%] entity.name "replacement text">
    %% <!ENTITY [%] entity.name SYSTEM "system.identifier">
    %% <!ENTITY [%] entity.name PUBLIC public.identifier "system.identifier">
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_entity(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!NOTATION" ++ T, S0) ->
    %% <!NOTATION notation.name "public.identifier" "helper.application">
    ?bump_col(10),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_notation_decl(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!ATTLIST" ++ T,
                 #xmerl_scanner{rules_read_fun = Read,
                                rules_write_fun = Write,
                                rules_delete_fun= Delete} = S0) ->
    %% <!ATTLIST Ename ( AttrName Type Value )*>
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
%    ?strip3,
    {Attributes, T4, S4} = scan_attdef(T2, S2),
    {EDEF,MergedAttrs} =
        case Read(elem_def, Ename, S4) of
            undefined -> %% this may happen when the ELEMENT is declared in
                         %% the external DTD but the ATTLIST in the
                         %% internal DTD.
                {#xmlElement{},update_attributes(Attributes,[])};
            Edef = #xmlElement{attributes = OldAttrs} ->
                Delete(elem_def,Ename,S4),
                %% the slot in rules table must be empty so that the
                %% later write has the assumed effect. Read maybe
                %% should empty the table slot.
                {Edef,update_attributes(Attributes, OldAttrs)}
        end,
    NewEdef = EDEF#xmlElement{name=Ename,attributes = MergedAttrs},
    S5 = Write(elem_def, Ename, NewEdef, S4),
    T5 = T4,
    strip(T5,S5);
scan_markup_decl(_Str,S) ->
    ?fatal(expected_markup,S).

%% scan_element_completion/2
%% Thin wrapper around scan_markup_completion_gt/2.
scan_element_completion(T,S) ->
    scan_markup_completion_gt(T,S).

%% update_attributes/2
%% Merges newly scanned attribute definitions into the existing ones.
%% Existing definitions keep their order; a new definition whose name
%% is already present is dropped, the others are appended in the order
%% they were given.
update_attributes(NewAttrs, OldAttrs) ->
    update_attributes1(NewAttrs,lists:reverse(OldAttrs)).

%% The second argument is kept in reverse order while merging and is
%% reversed back at the end.
update_attributes1([A = {Name,_Type,_DefaultV,_DefaultD,_Env}|Attrs],
                   OldAttrs) ->
    case lists:keymember(Name, 1, OldAttrs) of
        true ->
            update_attributes1(Attrs, OldAttrs);
        false ->
            update_attributes1(Attrs, [A|OldAttrs])
    end;
update_attributes1([],Acc) ->
    lists:reverse(Acc).


%%%%%%% [53] AttDef

%% scan_attdef/2
%% Scans the attribute definitions of an ATTLIST declaration up to and
%% including the closing ">". Returns {Attributes, RestOfInput, State}
%% with the attributes in declaration order.
scan_attdef([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_attdef(T, S) ->
    scan_attdef(T, S, _AttrAcc = []).


%% scan_attdef/3
%% Entry state before the first definition: the list may end right
%% away, start with a parameter entity reference (not allowed when the
%% environment is prolog), or whitespace must precede the first
%% definition, which is then scanned by scan_attdef2/3.
scan_attdef([], S=#xmerl_scanner{continuation_fun = F}, Attrs) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1, Attrs) end,
      fatal_fun(unexpected_end),
      S);
scan_attdef(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef(T2, S2, Attrs);
scan_attdef(T,S,Attrs) ->
    {_,T1,S1} = mandatory_strip(T,S),
    scan_attdef2(T1,S1,Attrs).

%% scan_attdef2/3
%% Scans attribute definitions one after another. Each definition is
%% stored as {Name, Type, DefaultValue, DefaultDecl, Environment},
%% where Environment is taken from the state as it was when the
%% definition started. Ends at ">".
scan_attdef2(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef2("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef2("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef2(T2, S2, Attrs);
scan_attdef2(T, S, Attrs) ->
    {AttName, _NamespaceInfo, T1, S1} = scan_name(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {AttType, T3, S3} = scan_att_type(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    {{DefaultDecl,DefaultValue}, T5, S5} = scan_default_decl(T4, S4, AttType),
    ?strip6,
    Attr = {AttName, AttType,DefaultValue,DefaultDecl,
            S#xmerl_scanner.environment},
    scan_attdef2(T6, S6, [Attr|Attrs]).


%%% [54] StringType
%% scan_att_type/2
%% Scans the type of an attribute definition and returns
%% {Type, RestOfInput, State}. Type is an atom for the string and
%% tokenized types, {notation, Names} or {enumeration, NmTokens} for the
%% enumerated types. Longer keywords are matched before their prefixes
%% (IDREFS, IDREF, ID; NMTOKENS, NMTOKEN).
%% NOTE(review): for a parameter entity reference the expanded text
%% itself is returned in the Type position -- confirm this is intended.
scan_att_type([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_att_type(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_att_type("CDATA" ++ T, S0) ->
    ?bump_col(5),
    {'CDATA', T, S};
%%% [55] TokenizedType
scan_att_type("IDREFS" ++ T, S0) ->
    ?bump_col(6),
    {'IDREFS', T, S};
scan_att_type("IDREF" ++ T, S0) ->
    ?bump_col(5),
    {'IDREF', T, S};
scan_att_type("ID" ++ T, S0) ->
    ?bump_col(2),
    {'ID', T, S};
scan_att_type("ENTITY" ++ T, S0) ->
    ?bump_col(6),
    {'ENTITY', T, S};
scan_att_type("ENTITIES" ++ T, S0) ->
    ?bump_col(8),
    {'ENTITIES', T, S};
scan_att_type("NMTOKENS" ++ T, S0) ->
    ?bump_col(8),
    {'NMTOKENS', T, S};
scan_att_type("NMTOKEN" ++ T, S0) ->
    ?bump_col(7),
    {'NMTOKEN', T, S};
%%% [57] EnumeratedType
scan_att_type("NOTATION" ++ T, S0) ->
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    T2 = scan_mandatory("(",T1,1,S1,expected_parenthesis_after_NOTATION),
%    "(" ++ T2 = T1,
    %% S2 is bound so that the ?strip3 macro below finds the names it
    %% expects (T2/S2); presumably ?strip3 binds T3 and S3.
    S2 = S1,
    ?strip3,
    {Name, _NamespaceInfo, T4, S4} = scan_name(T3, S3),
    notation_exists(Name, S4),
    ?strip5,
    scan_notation_type(T5, S5, [Name]);
scan_att_type("(" ++ T, S0) ->
    ?bump_col(1),
    ?strip1,
    {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [NmToken]);
scan_att_type("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_type("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,in_literal),
    {ExpRef,T1,S1}.

%%% [58] NotationType

%% scan_notation_type/3
%% Scans the remaining "| Name" alternatives of a NOTATION type up to
%% the closing ")". Returns {{notation, Names}, RestOfInput, State}
%% with the names in declaration order.
scan_notation_type([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_notation_type(MoreBytes, S1, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_notation_type(")" ++ T, S0, Acc) ->
    ?bump_col(1),
    {{notation, lists:reverse(Acc)}, T, S};
scan_notation_type("|" ++ T, S0, Acc) ->
    ?bump_col(1),
    ?strip1,
    {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    notation_exists(Name, S2),
    ?strip3,
    scan_notation_type(T3, S3, [Name | Acc]).

%%% Validity constraint for NotationType:
%%% The used notation names must be declared in the DTD, but they may
%%% be declared later.
%% notation_exists/2
%% Records a notation that is referenced but not yet known as
%% 'undeclared' through the rules write fun. Callers in this chunk
%% ignore the return value.
notation_exists(Name, #xmerl_scanner{rules_read_fun = Read,
                                     rules_write_fun = Write } = S) ->
    case Read(notation, Name, S) of
        undefined ->
            %% this is legal, since the referenced NOTATION
            %% may be declared later in internal or external
            %% subset.
            Write(notation,Name,undeclared,S);
        _Value ->
            ok
    end.

%%% [59] Enumeration
%%%
%%% Collects the remaining '|'-separated name tokens up to the closing
%%% parenthesis and returns {{enumeration, Tokens}, Rest, State}.
scan_enumeration([], S = #xmerl_scanner{continuation_fun = Cont}, Acc) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_enumeration(More, SNext, Acc) end,
         fatal_fun(unexpected_end),
         S);
scan_enumeration(")" ++ T, S0, Acc) ->
    ?bump_col(1),
    {{enumeration, lists:reverse(Acc)}, T, S};
scan_enumeration("|" ++ T, S0, Acc) ->
    ?bump_col(1),
    ?strip1,
    {Token, _NsInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [Token | Acc]).


%%%%%%% [60] DefaultDecl
%%
%% Returns {{Decl, Value}, Rest, State} where Decl is '#REQUIRED',
%% '#IMPLIED', '#FIXED' or no_decl, and Value is no_value when the
%% declaration carries no default value.
scan_default_decl([], S = #xmerl_scanner{continuation_fun = Cont}, Type) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_default_decl(More, SNext, Type) end,
         fatal_fun(unexpected_end),
         S);
scan_default_decl("#REQUIRED" ++ T, S0, _Type) ->
    ?bump_col(9),
    {{'#REQUIRED', no_value}, T, S};
scan_default_decl("#IMPLIED" ++ T, S0, _Type) ->
    ?bump_col(8),
    {{'#IMPLIED', no_value}, T, S};
scan_default_decl("#FIXED" ++ T, S0, Type) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {FixedValue, T2, S2, _} = default_value(T1, S1, Type),
    {{'#FIXED', FixedValue}, T2, S2};
scan_default_decl(Str, S, Type) ->
    {DefaultValue, T1, S1, _} = default_value(Str, S, Type),
    {{no_decl, DefaultValue}, T1, S1}.


%% Scans a default value as an ordinary attribute value.
%% There is room here to validate against Type, but we don't do it at
%% the moment.  The match only asserts that a 4-tuple came back.
default_value(T, S, Type) ->
    {_Value, _Rest, _State, _IsNorm} = scan_att_value(T, S, Type).


%%%%%%% [71] EntityDef
%%
%% Scans the part of an entity declaration that follows "<!ENTITY" and
%% stores the result through the rules write fun.  Returns {Rest, State}
%% with Rest starting after the closing ">".
scan_entity([], S = #xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_entity(More, SNext) end,
         fatal_fun(unexpected_end),
         S);
scan_entity("%" ++ T, S0 = #xmerl_scanner{rules_write_fun = Write}) ->
    %% Parameter entity declaration.
    ?bump_col(1),
    {_, T1, S1} = mandatory_strip(T, S),
    {PEName, _NsInfo, T2, S2} = scan_name_no_colons(T1, S1),
    {_, T3, S3} = mandatory_strip(T2, S2),
    {PEDef, T4, S4} = scan_pe_def(T3, S3, PEName),
    ?strip5,
    {">" ++ T6, S6} = scan_entity_completion(T5, S5),
    S7 = Write(parameter_entity, PEName, PEDef, S6),
    {T6, S7};
scan_entity(T, S = #xmerl_scanner{rules_write_fun = Write,
                                  rules_read_fun = Read,
                                  rules_delete_fun = Delete}) ->
    %% General entity declaration.
    {EName, _NsInfo, T1, S1} = scan_name_no_colons(T, S),
    {_, T2, S2} = mandatory_strip(T1, S1),
    {EDef, EntType, T3, S3} = scan_entity_def(T2, S2, EName),
    check_entity_recursion(EName, S3),
    ?strip4,
    {">" ++ T5, S5} = scan_entity_completion(T4, S4),
    %% An 'undeclared' placeholder entry is dropped before the real
    %% definition is written.
    case Read(entity, EName, S5) of
        undeclared ->
            Delete(entity, EName, S5);
        _ ->
            ok
    end,
    Definition = {S5#xmerl_scanner.environment, EntType, EDef},
    S6 = Write(entity, EName, Definition, S5),
    {T5, S6}.

%% Delegates to scan_markup_completion_gt/2 to reach the closing ">".
scan_entity_completion(T, S) ->
    scan_markup_completion_gt(T, S).

%%%%%%% [73] EntityDef
%%
%% Returns {Definition, internal | external, Rest, State}.  A quoted
%% literal gives an internal entity; anything else is scanned as an
%% external ID, optionally followed by an NDATA declaration.
scan_entity_def([], S = #xmerl_scanner{continuation_fun = Cont}, EName) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_entity_def(More, SNext, EName) end,
         fatal_fun(unexpected_end),
         S);
scan_entity_def("'" ++ T, S0, EName) ->
    ?bump_col(1),
    {Value, T1, S1} = scan_entity_value(T, S, $', EName, general),
    {Value, internal, T1, S1};
scan_entity_def("\"" ++ T, S0, EName) ->
    ?bump_col(1),
    {Value, T1, S1} = scan_entity_value(T, S, $", EName, general),
    {Value, internal, T1, S1};
%% External general entity, parsed or unparsed.
scan_entity_def(Str, S, EName) ->
    {ExtID, T1, S1} = scan_external_id(Str, S),
    {NData, T2, S2} = scan_ndata_decl(T1, S1),
    case NData of
        {ndata, _} ->
            %% An NDATA declaration makes this an unparsed entity.
            {{ExtID, NData}, external, T2, S2};
        _ ->
            FetchOpts = [{text_decl, true},
                         {environment, {external, {entity, EName}}}],
            case fetch_and_parse(ExtID, S2, FetchOpts) of
                {{_USret, Entity}, _Tail, _Sx} ->
                    {Entity, external, T2, S2};
                {Entity, _Tail, Sx} ->
                    %% Carry over the entity references recorded while
                    %% parsing the external entity.
                    Refs = S2#xmerl_scanner.entity_references
                        ++ Sx#xmerl_scanner.entity_references,
                    {Entity, external, T2,
                     S2#xmerl_scanner{entity_references = Refs}};
                {error, enoent} ->
                    %% This bad entity is declared, but it may never be
                    %% referenced, in which case it is not an error.
                    {{error, enoent}, external, T2, S2}
            end
    end.


%% Scans an optional NDATA declaration.  Returns {[], Rest, State} when
%% the declaration ends immediately with ">".
scan_ndata_decl([], S = #xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_ndata_decl(More, SNext) end,
         fatal_fun(unexpected_end),
         S);
scan_ndata_decl(">" ++ _ = Str, S) ->
    {[], Str, S};
scan_ndata_decl(T, S) ->
    {_, T1, S1} = mandatory_strip(T, S),
    scan_ndata_decl2(T1, S1).
%% Second half of the NDATA scan, entered after mandatory whitespace.
%% Returns {[], Rest, State} or {{ndata, NotationName}, Rest, State}.
scan_ndata_decl2(">" ++ _ = Str, S) ->
    {[], Str, S};
scan_ndata_decl2("NDATA" ++ T, S0 = #xmerl_scanner{rules_read_fun = Read,
                                                   rules_write_fun = Write}) ->
    ?bump_col(5),
    {_, T1, S1} = mandatory_strip(T, S),
    {Notation, _NsInfo, T2, S2} = scan_name(T1, S1),
    %% An unknown notation is legal here, since the referenced NOTATION
    %% may be declared later in the internal or external subset; it is
    %% recorded as 'undeclared'.  The state returned by Write is not used.
    case Read(notation, Notation, S2) of
        undefined ->
            Write(notation, Notation, undeclared, S2);
        _Declared ->
            ok
    end,
    {{ndata, Notation}, T2, S2}.

%%%%%%% [39] element
%%
%% Entry point for scanning an element whose "<" has been consumed.
%% Starts with the scanner's default space handling, no language, no
%% parents and an empty namespace record.
scan_element(T, S, Pos) ->
    scan_element(T, S, Pos, S#xmerl_scanner.space,
                 _Lang = [], _Parents = [], #xmlNamespace{}).

%% Scans the element name, remembering the start line/column, then hands
%% over to scan_element/12 for the attributes and the content.
scan_element(T, S = #xmerl_scanner{line = StartLine, col = StartCol},
             Pos, SpaceDefault, Lang, Parents, NS) ->
    {Name, NamespaceInfo, T1, S1} = scan_name(T, S),
    vc_Element_valid(Name, NamespaceInfo, S),
    ?strip2,
    scan_element(T2, S2, Pos, Name, StartLine, StartCol, _Attrs = [],
                 Lang, Parents, NamespaceInfo, NS,
                 SpaceDefault).


%% scan_element/12: scans the attributes of a start tag and then either
%% finishes an empty element ("/>") or scans the content (">").
%% Attrs is accumulated in reverse order.  Returns {Result, Rest, State}
%% where Result is whatever the hook fun returned for the element.
%%
%% A lone "/" or ">" at the end of the buffer asks the continuation fun
%% for more input before deciding which clause applies.
scan_element("/", S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    ?dbg("trailing / detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element("/" ++ MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element([], S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_element(MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                            event_fun = Event,
                                            line = L, col = C,
                                            xmlbase_cache=XMLBase}, Pos,
             Name, _StartL, _StartC, Attrs0, Lang, Parents, NSI,
             Namespace, _SpaceDefault) ->
    ?bump_col(2),
    Attrs = lists:reverse(Attrs0),
    E=processed_whole_element(S, Pos, Name, Attrs, Lang, Parents,NSI,Namespace),

    #xmlElement{attributes = Attrs1} = E,
    wfc_unique_att_spec(Attrs1,S),
    %% Pass the column-bumped state S (not S0) on, so that the two
    %% characters of "/>" are not lost from the column count.
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = E}, S),
    {Ret, S2} = Hook(E, S1),
    %% Restore the xml:base that was in effect before this element.
    S2b=S2#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T, S2b};
scan_element(">", S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    ?dbg("trailing > detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element(">" ++ MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element(">" ++ T, S0 = #xmerl_scanner{event_fun = Event,
                                           hook_fun = Hook,
                                           line = L, col = C,
                                           xmlbase_cache=XMLBase,
                                           space = SpaceOption},
             Pos, Name, StartL, StartC, Attrs0, Lang, Parents,
             NSI, Namespace, SpaceDefault) ->
    ?bump_col(1),
    Attrs = lists:reverse(Attrs0),
    %% The element is built once.  An earlier version rebuilt it from its
    %% own attribute list purely as a match assertion; that doubled the
    %% work and could raise a spurious badmatch when a defaulted xml:lang
    %% attribute had been appended by processed_whole_element/8.
    E0=processed_whole_element(S,Pos,Name,Attrs,Lang,Parents,NSI,Namespace),

    #xmlElement{attributes = Attrs1} = E0,
    wfc_unique_att_spec(Attrs1,S),
    %% xml:space on the element overrides the inherited space handling.
    XMLSpace = case lists:keysearch('xml:space', #xmlAttribute.name, Attrs1) of
                   false -> SpaceDefault;
                   {value, #xmlAttribute{value="default"}} -> SpaceOption;
                   {value, #xmlAttribute{value="preserve"}} -> preserve;
                   _ -> SpaceDefault
               end,

    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = StartL,
                                               col = StartC,
                                               data = E0}, S),

    {Content, T1, S2} = scan_content(T, S1, Name, Attrs1, XMLSpace,
                                     E0#xmlElement.language,
                                     [{Name, Pos}|Parents], Namespace),

    Element=E0#xmlElement{content=Content,
                          xmlbase=E0#xmlElement.xmlbase},
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = Element}, S2),
    {Ret, S4} = Hook(Element, S3),
    %% Restore the xml:base that was in effect before this element.
    S4b=S4#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T1, S4b};
scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    %% Anything else is an attribute: Name Eq AttValue.
    {AttName, NamespaceInfo, T1, S1} = scan_name(T, S),
    {T2, S2} = scan_eq(T1, S1),
    {AttType,_DefaultDecl} = get_att_type(S2,AttName,Name),
    {AttValue, T3a, S3a,IsNorm} = scan_att_value(T2, S2, AttType),
    NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS),
    {T3,S3} = wfc_whitespace_betw_attrs(T3a,S3a),
    ?strip4,
    %% Attrs is in reverse order, so its head is the previous attribute.
    AttrPos = case Attrs of
                  [] ->
                      1;
                  [#xmlAttribute{pos = P}|_] ->
                      P+1
              end,
    Attr = #xmlAttribute{name = AttName,
                         parents = [{Name, Pos}|Parents],
                         pos = AttrPos,
                         language = Lang,
                         nsinfo = NamespaceInfo,
                         value = AttValue,
                         normalized = IsNorm},
    XMLBase=if
                AttName=='xml:base' ->
                    resolve_relative_uri(AttValue,S4#xmerl_scanner.xmlbase);
                true ->
                    S4#xmerl_scanner.xmlbase
            end,

    #xmerl_scanner{event_fun = Event,
                   line = Line,
                   col = Col} = S4,
    %% The xml:base seen on entry is kept in xmlbase_cache so the closing
    %% clauses above can restore it.
    S5 = Event(#xmerl_event{event = ended,
                            line = Line,
                            col = Col,
                            data = Attr},
               S4#xmerl_scanner{xmlbase=XMLBase,
                                xmlbase_cache=S#xmerl_scanner.xmlbase}),
    scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr|Attrs],
                 Lang, Parents, NSI, NewNS, SpaceDefault).

%% Returns [{AttName, DefaultValue}] for the attributes of ElemName that
%% have a default value in the stored element definition, or [] when no
%% element definition is stored.
get_default_attrs(S = #xmerl_scanner{rules_read_fun = Read}, ElemName) ->
    case Read(elem_def, ElemName, S) of
        #xmlElement{attributes = Attrs} ->
            [ {AttName, AttValue} ||
                {AttName, _, AttValue, _, _} <- Attrs, AttValue =/= no_value ];
        _ -> []
    end.

%% Returns {AttType, DefaultDecl} for attribute AttName of element
%% ElemName.  Undefined attributes are treated as CDATA.
get_att_type(S=#xmerl_scanner{rules_read_fun=Read},AttName,ElemName) ->
    case Read(elem_def,ElemName,S) of
        #xmlElement{attributes = Attrs} ->
            case lists:keysearch(AttName,1,Attrs) of
                {value,{_,AttType,_,DefaultDecl,_}} ->
                    {AttType,DefaultDecl};
                _ -> {'CDATA',no_value} %% undefined attribute shall be treated as CDATA
            end;
        _ -> {'CDATA',no_value}
    end.

%% Resolves an xml:base value against the current base.
%% An absolute-path reference ("/...") keeps scheme, host and port of
%% the current base; when the current base cannot be parsed the new
%% value is used as is.  Any other value is joined onto the current base
%% with filename:join/2.
resolve_relative_uri(NewBase="/"++_,CurrentBase) ->
    case xmerl_uri:parse(CurrentBase) of
        {error,_Reason} ->
            NewBase;
        {Scheme,Host,Port,_Path,_Query} ->
            %% "://" separates the scheme from the authority; without it
            %% the result was e.g. "httphost:80/path".
            atom_to_list(Scheme) ++ "://" ++ Host ++ ":" ++
                integer_to_list(Port) ++ NewBase
    end;
resolve_relative_uri(NewBase,CurrentBase) ->
    filename:join(CurrentBase,NewBase).


%% Builds the #xmlElement{} record for a start tag once all of its
%% attributes have been scanned.  The content field is left for the
%% caller to fill in.
%%
%% Default attributes from the stored element definition are appended
%% when the default_attrs option is true, and names are expanded when
%% namespace_conformant is true.
processed_whole_element(S = #xmerl_scanner{xmlbase = XMLBase},
                        Pos, Name, Attrs, Lang, Parents, NSI, Namespace) ->
    Language = check_language(Attrs, Lang),

    AllAttrs =
        case S#xmerl_scanner.default_attrs of
            true ->
                %% Only defaults that were not given explicitly are added.
                Defaults =
                    [#xmlAttribute{name = DefName,
                                   parents = [{Name, Pos} | Parents],
                                   language = Lang,
                                   nsinfo = NSI,
                                   namespace = Namespace,
                                   value = DefValue,
                                   normalized = true}
                     || {DefName, DefValue} <- get_default_attrs(S, Name),
                        DefValue =/= no_value,
                        not lists:keymember(DefName, #xmlAttribute.name, Attrs)],
                lists:append(Attrs, Defaults);
            false ->
                Attrs
        end,

    {ExpName, ExpAttrs} =
        case S#xmerl_scanner.namespace_conformant of
            true ->
                %% Expand attribute names.  This has to happen after all
                %% attributes of the element have been scanned, since (as
                %% far as I can tell) XML Names only specifies that
                %% namespace attributes are valid within the whole scope
                %% of the element in which they are declared.  That should
                %% also mean that a namespace declared after some other
                %% attributes applies to those attributes as well.
                %% Note that the default URI does not apply to attribute
                %% names.
                AttrNamespace = Namespace#xmlNamespace{default = []},
                Expanded =
                    [Attr#xmlAttribute{
                       namespace = Namespace,
                       expanded_name = expanded_name(Attr#xmlAttribute.name,
                                                     Attr#xmlAttribute.nsinfo,
                                                     AttrNamespace, S)}
                     || Attr <- AllAttrs],
                {expanded_name(Name, NSI, Namespace, S), Expanded};
            false ->
                {Name, AllAttrs}
        end,

    #xmlElement{name = Name,
                xmlbase = XMLBase,
                pos = Pos,
                parents = Parents,
                attributes = ExpAttrs,
                language = Language,
                expanded_name = ExpName,
                nsinfo = NSI,
                namespace = Namespace}.


%% Returns the value of the first xml:lang attribute in the list, or the
%% inherited language when there is none.
check_language([#xmlAttribute{name = 'xml:lang', value = Value} | _], _Inherited) ->
    Value;
check_language([_Other | Rest], Inherited) ->
    check_language(Rest, Inherited);
check_language([], Inherited) ->
    Inherited.


%% Updates the namespace record for an "xmlns" or "xmlns:Prefix"
%% attribute; any other attribute leaves it untouched.
%% NOTE(review): the namespace URI comes from the document and is turned
%% into an atom with list_to_atom/1; atoms are never garbage collected,
%% so this deserves attention when parsing untrusted input.
check_namespace(xmlns, _NsInfo, Value, NS) ->
    NS#xmlNamespace{default = list_to_atom(Value)};
check_namespace(_AttName, {"xmlns", Prefix}, Value,
                NS = #xmlNamespace{nodes = Nodes}) ->
    Binding = {Prefix, list_to_atom(Value)},
    NS#xmlNamespace{nodes = keyreplaceadd(Prefix, 1, Nodes, Binding)};
check_namespace(_AttName, _NsInfo, _Value, NS) ->
    NS.


%% Computes the expanded name of an element or attribute from its
%% namespace info ([] or {Prefix, Local}) and the namespace record.
%% Reserved prefixes and namespace names are rejected with a fatal error.
expanded_name(Name, [], #xmlNamespace{default = []}, _S) ->
    Name;
expanded_name(Name, [], #xmlNamespace{default = URI}, S) ->
    case URI of
        'http://www.w3.org/XML/1998/namespace' ->
            ?fatal(cannot_bind_default_namespace_to_xml_namespace_name, S);
        'http://www.w3.org/2000/xmlns/' ->
            ?fatal(cannot_bind_default_namespace_to_xmlns_namespace_name, S);
        _ ->
            {URI, Name}
    end;
expanded_name(Name, NsInfo = {"xmlns", Local}, #xmlNamespace{nodes = Nodes}, S) ->
    {_, Value} = lists:keyfind(Local, 1, Nodes),
    case Name of
        'xmlns:xml' when Value =:= 'http://www.w3.org/XML/1998/namespace' ->
            NsInfo;
        'xmlns:xml' when Value =/= 'http://www.w3.org/XML/1998/namespace' ->
            ?fatal({xml_prefix_cannot_be_redeclared, Value}, S);
        'xmlns:xmlns' ->
            ?fatal({xmlns_prefix_cannot_be_declared, Value}, S);
        _ ->
            case Value of
                'http://www.w3.org/XML/1998/namespace' ->
                    ?fatal({cannot_bind_prefix_to_xml_namespace, Local}, S);
                'http://www.w3.org/2000/xmlns/' ->
                    ?fatal({cannot_bind_prefix_to_xmlns_namespace, Local}, S);
                _ ->
                    NsInfo
            end
    end;
expanded_name(_Name, {"xml", Local}, _NS, _S) ->
    {'http://www.w3.org/XML/1998/namespace', list_to_atom(Local)};
expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Nodes}, S) ->
    case lists:keysearch(Prefix, 1, Nodes) of
        {value, {_, URI}} ->
            {URI, list_to_atom(Local)};
        false ->
            %% A namespace constraint of XML Names is that the prefix
            %% must be declared
            ?fatal({namespace_prefix_not_declared, Prefix}, S)
    end.

%% Replaces the first tuple whose Pos:th element compares equal to K with
%% Obj, or appends Obj when there is no such tuple.
keyreplaceadd(K, Pos, List, Obj) ->
    lists:keystore(K, Pos, List, Obj).

%%%%%%% [10] AttValue
%% normalize the attribute value according to XML 1.0 section 3.3.3
%%
%% Returns {Value, Rest, State, IsNormalized}.
scan_att_value([], S = #xmerl_scanner{continuation_fun = Cont}, AT) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) -> scan_att_value(More, SNext, AT) end,
         fatal_fun(unexpected_end),
         S);
scan_att_value("%" ++ _T, S = #xmerl_scanner{environment = prolog}, _AttType) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_value("%" ++ T, S0 = #xmerl_scanner{rules_read_fun = Read,
                                             rules_write_fun = Write,
                                             rules_delete_fun = Delete}, AttType) ->
    ?bump_col(1),
    {Name, T1, S1} = scan_pe_reference(T, S),
    {ExpandedRef, S2} =
        case expand_pe_reference(Name, S1, in_literal) of
            Tuple when is_tuple(Tuple) ->
                %% {system,URI} or {public,URI}
                %% Included in literal, just get external file.
                {Fetched, SFetched} = fetch_not_parse(Tuple, S1),
                {EntValue, _, SScanned} =
                    scan_entity_value(Fetched, SFetched, no_delim,
                                      Name, parameter),
                %% should do an update Write(parameter_entity) so next
                %% expand_pe_reference is faster
                Delete(parameter_entity, Name, SScanned),
                SStored = Write(parameter_entity, Name, EntValue, SScanned),
                Stored = Read(parameter_entity, Name, SStored),
                {Stored, SStored};
            Replacement ->
                {Replacement, S1}
        end,
    {_, T2, S3} = strip(ExpandedRef ++ T1, S2),
    scan_att_value(T2, S3, AttType);
%% Opening quote of a CDATA attribute: no leading whitespace is dropped.
scan_att_value([Quote | T], S0, 'CDATA' = AT) when Quote == $"; Quote == $' ->
    ?bump_col(1),
    scan_att_chars(T, S, Quote, [], [], AT, false);
%% Opening quote of any other attribute type.
scan_att_value([Quote | T], S0, AttType) when Quote == $"; Quote == $' ->
    ?bump_col(1),
    {T1, S1, IsNorm} = normalize(T, S, false),
    scan_att_chars(T1, S1, Quote, [], [], AttType, IsNorm).

%% Scans the characters of an attribute value up to the closing quote.
%% Acc holds the value in reverse; TmpAcc holds the current token in
%% reverse and is handed to check_att_default_val/4.
scan_att_chars([], S = #xmerl_scanner{continuation_fun = Cont},
               Quote, Acc, TmpAcc, AT, IsNorm) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) ->
                 scan_att_chars(More, SNext, Quote, Acc, TmpAcc, AT, IsNorm)
         end,
         fatal_fun(unexpected_end),
         S);
scan_att_chars([Quote | T], S0, Quote, Acc, TmpAcc, AttType, IsNorm) ->
    %% End quote
    ?bump_col(1),
    check_att_default_val(S#xmerl_scanner.validation, TmpAcc, AttType, S),
    {Acc2, S2, IsNorm2} =
        case AttType of
            'CDATA' -> {Acc, S, IsNorm};
            _ -> normalize(Acc, S, IsNorm)
        end,
    {lists:flatten(lists:reverse(Acc2)), T, S2, IsNorm2};
scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc, AT, IsNorm) ->
    %% Reference
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
        true ->
            scan_att_chars(T1, S1, Delim, [ExpRef | Acc], [ExpRef | TmpAcc],
                           AT, IsNorm);
        _ ->
            case T of
                "#" ++ _ ->
                    %% normalization rules (sec 3.3.3) require that for
                    %% character references, the referenced character be
                    %% added directly to the normalized value
                    {T2, S2, IsNorm2} =
                        if
                            ?whitespace(hd(ExpRef)) ->
                                normalize(T1, S1, IsNorm);
                            true ->
                                {T1, S1, IsNorm}
                        end,
                    scan_att_chars(T2, S2, Delim, ExpRef ++ Acc, TmpAcc,
                                   AT, IsNorm2);
                _ ->
                    %% The replacement text of an entity reference is put
                    %% back in front of the input and scanned again.
                    Ch = string_to_char_set(S#xmerl_scanner.encoding, ExpRef),
                    scan_att_chars(Ch ++ T1, S1, Delim, Acc, TmpAcc, AT, IsNorm)
            end
    end;
scan_att_chars("<" ++ _T, S0, _Delim, _Acc, _, _, _) ->
    %% Tags not allowed here
    ?fatal(unexpected_char, S0);
scan_att_chars([H | T], S0, Delim, Acc, _TmpAcc, 'CDATA', IsNorm)
  when ?whitespace(H) ->
    ?bump_col(1),
    scan_att_chars(T, S, Delim, [$\s | Acc], [], 'CDATA', IsNorm);
scan_att_chars([H | T], S0, Delim, Acc, TmpAcc, AT, IsNorm)
  when ?whitespace(H) ->
    ?bump_col(1),
    {T1, S1, IsNorm2} = normalize(T, S, IsNorm),
    check_att_default_val(S#xmerl_scanner.validation, TmpAcc, AT, S1),
    scan_att_chars(T1, S1, Delim, [$\s | Acc], [], AT, IsNorm2);
scan_att_chars(Str, S0, Delim, Acc, TmpAcc, AT, IsNorm) ->
    ?bump_col(1),
    {Ch, T} = to_ucs(S#xmerl_scanner.encoding, Str),
    valid_Char(S#xmerl_scanner.validation, AT, Ch, S),
    scan_att_chars(T, S, Delim, [Ch | Acc], [Ch | TmpAcc], AT, IsNorm).

%% True for the one-character strings that are markup delimiters.
markup_delimeter(Str) ->
    lists:member(Str, ["&", "\"", "\'", "<", ">", "%"]).

%% Checks one token of an attribute value, but only when validating
%% against a DTD and only when the token is non-empty.  RevName is the
%% token in reverse order.
check_att_default_val(dtd, [], _Ent, _S) ->
    ok;
check_att_default_val(dtd, RevName, Ent, S) ->
    check_att_default_val(lists:reverse(RevName), Ent, S);
check_att_default_val(_, _, _, _) ->
    ok.

%% Checks one attribute-value token against its declared type and records
%% it through the rules funs:
%%   ENTITY/ENTITIES and IDREF/IDREFS tokens must start with a letter and
%%   are written as 'undeclared';
%%   ID tokens must be Names and must be unique.
%% Any other type is accepted as is.
%% NOTE(review): tokens come from the document and are converted with
%% list_to_atom/1; atoms are never garbage collected, so this deserves
%% attention when parsing untrusted input.
check_att_default_val(Name, Ent, S = #xmerl_scanner{rules_write_fun = Write})
  when Ent == 'ENTITY'; Ent == 'ENTITIES' ->
    case xmerl_lib:is_letter(hd(Name)) of
        true ->
            ok;
        _ ->
            ?fatal({illegal_first_character,Ent,Name},S)
    end,
    EntityName = list_to_atom(Name),
    Write(entity, EntityName, undeclared, S);
check_att_default_val(Name, IDR, S = #xmerl_scanner{rules_write_fun = Write})
  when IDR == 'IDREF'; IDR == 'IDREFS' ->
    case xmerl_lib:is_letter(hd(Name)) of
        true ->
            ok;
        _ ->
            ?fatal({illegal_first_character,IDR,Name},S)
    end,
    RefName = list_to_atom(Name),
    Write(id, RefName, undeclared, S);
check_att_default_val(Name, 'ID', S = #xmerl_scanner{rules_write_fun = Write,
                                                    rules_read_fun = Read,
                                                    rules_delete_fun = Delete}) ->
    case xmerl_lib:is_name(Name) of
        false ->
            ?fatal({'ID_names_must_be_Name_production',Name},S);
        _ ->
            ok
    end,
    SName = if
                is_list(Name) -> list_to_atom(Name);
                true -> Name
            end,
    case Read(id, SName, S) of
        undeclared ->
            %% was referenced in IDREF/IDREFS before defined
            Delete(id, SName, S);
        SName ->
            ?fatal({values_must_be_unique,'ID',SName},S);
        undefined ->
            ok
    end,
    Write(id, SName, SName, S);
check_att_default_val(_, _, _) ->
    ok.

%% Checks one character of an attribute value.  NMTOKEN(S) values are
%% checked by vc_Valid_Char/3 when validating against a DTD; everything
%% else only has to satisfy xmerl_lib:is_char/1.  The character may
%% arrive wrapped in a one-element list.
valid_Char(dtd, AT, C, S) when AT == 'NMTOKEN'; AT == 'NMTOKENS' ->
    vc_Valid_Char(AT, C, S);
valid_Char(_Validation, _AT, [C], S) ->
    case xmerl_lib:is_char(C) of
        false ->
            ?fatal({unexpected_char,C}, S);
        true ->
            ok
    end;
valid_Char(_Validation, _AT, C, S) ->
    case xmerl_lib:is_char(C) of
        false ->
            ?fatal({unexpected_char,C}, S);
        true ->
            ok
    end.



%%%%%%% [43] content
%%
%% Entry point: scans the content of element Name starting at position 1
%% with an empty accumulator and no pending markup delimiter.
scan_content(T, S, Name, Attrs, Space, Lang, Parents, NS) ->
    scan_content(T, S, _Pos = 1, Name, Attrs, Space,
                 Lang, Parents, NS, _Acc = [], _MarkupDel = []).

%% scan_content/11: scans element content up to the matching end tag and
%% returns {Content, Rest, State}.  Acc is the content in reverse order.
%% The last argument is the markup delimiter (if any) that the previous
%% step produced by expanding a predefined entity; it makes the next
%% "&" or "<" count as character data instead of markup.
%% The clause order is significant.
scan_content("<", S = #xmerl_scanner{continuation_fun = Cont},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
    ?dbg("trailing < detected~n", []),
    Cont(fun(More, SNext) ->
                 scan_content("<" ++ More, SNext,
                              Pos, Name, Attrs,
                              Space, Lang, Parents, NS, Acc, [])
         end,
         fatal_fun(unexpected_end),
         S);
%% Inside an entity the end of input simply ends the content.
scan_content([], S = #xmerl_scanner{environment = {external, {entity, _}}},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc, _) ->
    {lists:reverse(Acc), [], S};
scan_content([], S = #xmerl_scanner{environment = internal_parsed_entity},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc, _) ->
    {lists:reverse(Acc), [], S};
scan_content([], S = #xmerl_scanner{continuation_fun = Cont},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) ->
                 scan_content(More, SNext,
                              Pos, Name, Attrs,
                              Space, Lang, Parents, NS, Acc, [])
         end,
         fatal_fun(unexpected_end),
         S);
%% End tag: it must name the element being scanned.
scan_content("</" ++ T, S0, _Pos, Name, _Attrs, _Space, _Lang,
             _Parents, _NS, Acc, []) ->
    ?bump_col(2),
    {ETagName, _NsInfo, T1, S1} = scan_name(T, S),
    if
        ETagName == Name ->
            ok;
        true ->
            ?fatal({endtag_does_not_match, {was,ETagName,should_have_been, Name}}, S)
    end,
    ?strip2,
    case T2 of
        ">" ++ T3 ->
            {lists:reverse(Acc), T3, S2};
        _ ->
            ?fatal({error,{unexpected_end_of_STag}},S)
    end;
scan_content([$& | _] = Str,
             S0 = #xmerl_scanner{environment = {external, {entity, EName}}},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
    {_EntValue, T1, S1} = scan_entity_value(Str, S0, [], EName, general),
    %% This is a problem. All referenced entities in the external entity
    %% must be checked for recursion, thus parse the content but skip the
    %% result.
    scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
scan_content("&" ++ T,
             S = #xmerl_scanner{environment = internal_parsed_entity},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
    {_, T1, S1} = scan_reference(T, S),
    scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
        true ->
            %% Put the delimiter back and remember it, so that it is
            %% taken as character data.
            scan_content(ExpRef ++ T1, S1, Pos, Name, Attrs, Space, Lang,
                         Parents, NS, Acc, ExpRef);
        _ ->
            Replacement = string_to_char_set(S1#xmerl_scanner.encoding, ExpRef),
            scan_content(Replacement ++ T1, S1, Pos, Name, Attrs, Space, Lang,
                         Parents, NS, Acc, [])
    end;
scan_content("<!--" ++ T, S0 = #xmerl_scanner{acc_fun = AccFun, comments = Keep},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
    ?bump_col(4),
    {Comment, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
    case Keep of
        true ->
            %% The accumulator fun may or may not return a new position.
            {Acc2, Pos2, S3} =
                case AccFun(Comment, Acc, S1) of
                    {Acc1, S2} ->
                        {Acc1, Pos + 1, S2};
                    {Acc1, Pos1, S2} ->
                        {Acc1, Pos1, S2}
                end,
            scan_content(T1, S3, Pos2, Name, Attrs, Space, Lang, Parents, NS,
                         Acc2, []);
        false ->
            scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS,
                         Acc, [])
    end;
scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []) ->
    ?bump_col(1),
    {Markup, T1, S1} =
        scan_content_markup(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS),
    AccFun = S1#xmerl_scanner.acc_fun,
    {NewAcc, NewPos, NewS} =
        case AccFun(Markup, Acc, S1) of
            {Acc2, S2} ->
                {Acc2, Pos + 1, S2};
            {Acc2, Pos2, S2} ->
                {Acc2, Pos2, S2}
        end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
                 Parents, NS, NewAcc, []);
scan_content([_Skipped | T], S = #xmerl_scanner{environment = {external, {entity, _}}},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, _) ->
    %% Guess we have to scan the content to find any internal entity
    %% references.
    scan_content(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, []);
%% Character data: wrapped in an #xmlText{} and passed through the hook,
%% event and accumulator funs.
scan_content(T, S = #xmerl_scanner{acc_fun = AccFun,
                                   event_fun = Event,
                                   hook_fun = Hook,
                                   line = _L},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc, MarkupDel) ->
    Text0 = #xmlText{pos = Pos,
                     parents = Parents},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = S#xmerl_scanner.line,
                                               data = Text0}, S),
    {Data, T1, S2} = scan_char_data(T, S1, Space, MarkupDel),
    Text = Text0#xmlText{value = Data},
    {Ret, S2b} = Hook(Text, S2),
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S2b#xmerl_scanner.line,
                                               data = Ret}, S2b),
    {NewAcc, NewPos, NewS} =
        case AccFun(Ret, Acc, S3) of
            {Acc4, S4} ->
                {Acc4, Pos + 1, S4};
            {Acc4, Pos4, S4} ->
                {Acc4, Pos4, S4}
        end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
                 Parents, NS, NewAcc, []).


%% Scans the markup that follows a "<" in content: a CDATA section, a
%% processing instruction or a child element.
scan_content_markup([], S = #xmerl_scanner{continuation_fun = Cont},
                    Pos, Name, Attrs, Space, Lang, Parents, NS) ->
    ?dbg("cont()...~n", []),
    Cont(fun(More, SNext) ->
                 scan_content_markup(More, SNext, Pos, Name,
                                     Attrs, Space, Lang, Parents, NS)
         end,
         fatal_fun(unexpected_end),
         S);
scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs,
                    _Space, _Lang, Parents, _NS) ->
    ?bump_col(8),
    scan_cdata(T, S, Pos, Parents);
scan_content_markup("?" ++ T, S0, Pos, _Name, _Attrs, _Space, _Lang, Parents, _NS) ->
    ?bump_col(1),
    scan_pi(T, S, Pos, Parents);
scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) ->
    scan_element(T, S, Pos, Space, Lang, Parents, NS).

%% Entry point for character data; MUD is the pending markup delimiter.
scan_char_data(T, S, Space, MUD) ->
    scan_char_data(T, S, Space, MUD, _Acc = []).

%%%%%%% [14] CharData
%%
%% Accumulates character data until the next "&" or "<" that counts as
%% markup and returns {Data, Rest, State}.  Acc is the data in reverse.
%% MUD is the pending markup delimiter ("&" or "<") that resulted from
%% expanding a predefined entity; when set, a leading delimiter of that
%% kind is taken as data.
scan_char_data([], S=#xmerl_scanner{environment={external,{entity,_}}},
               _Space,_MUD, Acc) ->
    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{environment=internal_parsed_entity},
               _Space, _MUD,Acc) ->
    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{continuation_fun = F}, Space, MUD, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_data(MoreBytes,S1,Space,MUD,Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_data([$&|T], S,Space,"&",Acc) ->
    scan_char_data(T, S, Space,[], [$&|Acc]);
scan_char_data(T=[$&|_], S,_Space,_MUD,Acc) ->
    {lists:reverse(Acc), T, S};
scan_char_data("]]>" ++ _T, S, _Space,_MUD, _Acc) ->
    %% See Section 2.4: Especially:
    %% "The right angle bracket (>) MAY be represented using the string
    %% "&gt;", and MUST, for compatibility, be escaped using either "&gt;"
    %% or a character reference when it appears in the string "]]>" in
    %% content, when that string is not marking the end of a CDATA
    %% section.
    %% (A second, identical "]]>" clause further down could never match
    %% and has been removed.)
    ?fatal(unexpected_cdata_end, S);
scan_char_data([$<|T],S,Space,"<", Acc) ->
    scan_char_data(T, S, Space,[], [$<|Acc]);
scan_char_data(T = [$<|_], S, _Space,_MUD,Acc) ->
    {lists:reverse(Acc), T, S};
scan_char_data(T = [H|R], S, Space,MUD, Acc) when ?whitespace(H) ->
    if
        MUD =:= [], Acc =:= [], H =:= $\n, Space =:= preserve ->
            %% Leading newline in preserve mode: try the fast path.
            case fast_accumulate_whitespace(R, S, T) of
                {done, Reply} ->
                    Reply;
                {NewAcc, T1, S1} ->
                    scan_char_data(T1, S1, Space, MUD, NewAcc)
            end;
        true ->
            {NewAcc, T1, S1} = accumulate_whitespace(T, S, Space, Acc),
            scan_char_data(T1, S1, Space,MUD,NewAcc)
    end;
scan_char_data([H1,H2|_T],S,_Space,_MUD,_Acc) when ?non_character(H1,H2) ->
    ?fatal({error,{not_allowed_to_use_Unicode_noncharacters}},S);
scan_char_data(Str,S0,Space,MUD,Acc) ->
    ?bump_col(1),
    {Ch,T} = wfc_legal_char(Str,S),
    scan_char_data(T,S,Space,MUD,[Ch|Acc]).



%%%%%%% [18]-[21] CDATA
%%
%% Scans a CDATA section whose "<![CDATA[" has been consumed and returns
%% {#xmlText{type = cdata}, Rest, State}.
scan_cdata(Str, S, Pos, Parents) ->
    scan_cdata(Str, S, Pos, Parents, _Acc = []).


scan_cdata([], S=#xmerl_scanner{continuation_fun = F}, Pos, Parents, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_cdata(MoreBytes, S1, Pos, Parents, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_cdata("]]>" ++ T, S0, Pos, Parents, Acc) ->
    ?bump_col(3),
    {#xmlText{pos = Pos,
              parents = Parents,
              value = lists:reverse(Acc),
              type = cdata}, T, S};
scan_cdata(Str, S0, Pos, Parents, Acc) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_char(Ch) of
        true ->
            ?bump_col(1),
            scan_cdata(T, S, Pos, Parents, [Ch|Acc]);
        false ->
            ?fatal({unexpected_char,Ch}, S0)
    end.


%%%%%%% [67] Reference
%% returns a three tuple {Result,RestBuf,State}
%%
%% Called with the text that follows "&".  A character reference ("#..."
%% or "#x...") is handed to the decimal/hexadecimal scanner; anything
%% else is scanned as an entity reference.  An empty character reference
%% ("&#;" or "&#x;") is a fatal error.

scan_reference([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_reference(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_reference(Prefix, S=#xmerl_scanner{continuation_fun = F})
  when Prefix =:= "#"; Prefix =:= "#x" ->
    %% The buffer ends inside the "#" / "#x" prefix: ask for more input
    %% and start over, instead of reporting an invalid reference.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_reference(Prefix ++ MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_reference("#x;" ++ _T, S0) ->
    %% [66] CharRef without any hexadecimal digit
    ?bump_col(2),
    ?fatal(invalid_char_ref, S);
scan_reference("#x" ++ T, S0) ->
    %% [66] CharRef; "#x" is two characters wide
    ?bump_col(2),
    scan_char_ref_hex(T, S, 0);
scan_reference("#;" ++ _T, S0) ->
    %% [66] CharRef without any decimal digit
    ?bump_col(1),
    ?fatal(invalid_char_ref, S);
scan_reference("#" ++ T, S0) ->
    %% [66] CharRef
    ?bump_col(1),
    scan_char_ref_dec(T, S, []);
scan_reference(T, S) ->
    %% Any error or exit while scanning the entity reference is reported
    %% as error_scanning_entity_ref.
    case catch scan_entity_ref(T, S) of
        {'EXIT', _} ->
            ?fatal(error_scanning_entity_ref,S);
        Other ->
            Other
    end.


%% Chapter 4.4.2: ... the replacement text of entities used to escape
%% markup delimiters (the entities amp, lt, gt, apos, quot) is always treated
%% as data. (The string "AT&amp;T;" expands to "AT&T;" and the remaining
%% ampersand is not recognized as an entity-reference delimiter.)"
%%
%% How to achieve this? My current approach is to insert the *strings* "&",
%% "<", ">", "'", and "\"" instead of the characters. The processor will
%% ignore them when performing multiple expansions. This means, for now, that
%% the character data output by the processor is (1-2 levels) deep.
%% At some suitable point, we should flatten these, so that application-level
%% processors should not have to be aware of this detail.

%% scan_entity_ref/2
%% Scans an entity reference; the input is the text following the "&".
%% Returns {Replacement, Rest, State}.
%% NOTE(review): ?bump_col(N) is defined outside this view; it presumably
%% binds S from S0 with the column advanced by N - confirm.
scan_entity_ref([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_entity_ref(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
%% The five predefined entities are answered directly with one-character
%% strings (see the comment on section 4.4.2 above).
scan_entity_ref("amp;" ++ T, S0) ->
    ?bump_col(4),
    {"&", T, S};
scan_entity_ref("lt;" ++ T, S0) ->
    ?bump_col(3),
    {"<", T, S};
scan_entity_ref("gt;" ++ T, S0) ->
    ?bump_col(3),
    {">", T, S};
scan_entity_ref("apos;" ++ T, S0) ->
    ?bump_col(5),
    {"'", T, S};
scan_entity_ref("quot;" ++ T, S0) ->
    ?bump_col(5),
    {"\"", T, S};
scan_entity_ref(T, S) ->
    %% General case: scan the name, require the terminating ";" and expand
    %% the reference through expand_reference/2.
    {Name, _NamespaceInfo, T1, S1} = scan_name(T, S),
    T2 = scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon),
%   ";" ++ T2 = T1,
    S2 = S1,
    Entity = expand_reference(Name, S2),
    {Entity, T2, S2}.


%%%%%%% [69] PEReference

%% scan_pe_reference/2
%% Scans the name and terminating ";" of a parameter-entity reference (the
%% "%" has already been consumed by the caller). Returns {Name, Rest, State}
%% with the column advanced by one.
scan_pe_reference(T, S) ->
    {Name, _NamespaceInfo, T1, S1} = scan_name(T, S),
    T2 = scan_mandatory(";",T1,1,S1,expected_parsed_entity_reference_semicolon),
%    ";" ++ T2 = T1,
    {Name, T2, S1#xmerl_scanner{col = S1#xmerl_scanner.col+1}}.

%% expand_pe_reference/3
%% Looks the parameter entity up through the scanner's rules_read_fun.
%%  - undefined or {error,_} is fatal,
%%  - any other tuple is returned unchanged,
%%  - anything else is returned as is when WS == in_literal, otherwise
%%    wrapped in one leading and one trailing space.
expand_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S,WS) ->
    case Read(parameter_entity, Name, S) of
        undefined ->
            ?fatal({unknown_parameter_entity, Name}, S); % WFC or VC failure
        Err={error,_Reason} ->
            ?fatal(Err,S);
        Tuple when is_tuple(Tuple) ->
            Tuple;
        Result ->
            if
                WS == in_literal -> Result;
                true -> " "++Result++" "
            end
    end.

% Currently unused
%
% expand_external_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
%     case Read(parameter_entity, Name, S) of
%       undefined ->
%           ?fatal({unknown_parameter_entity, Name}, S);
%       Result ->
%           fetch_DTD(Result,S)
%     end.


%%%%%%% [68] EntityReference

%% expand_reference/2
%% Returns the replacement text for the general entity Name.
%%
%% While scanning an external entity or an internal parsed entity the
%% reference is not expanded; the entity name is returned as a string.
%% Otherwise the entity is looked up through rules_read_fun, the
%% well-formedness constraints "Entity Declared" and (for internal
%% entities) "content production" are checked, and the stored value is
%% returned.
expand_reference(Name, #xmerl_scanner{environment={external,{entity,_}}}) ->
    reference_name_to_list(Name);
expand_reference(Name, #xmerl_scanner{environment=internal_parsed_entity}) ->
    reference_name_to_list(Name);
expand_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
    case Read(entity, Name, S) of
        undefined ->
            ?fatal({unknown_entity_ref, Name}, S);
        {_,external,{error,enoent}} ->
            ?fatal({error,{entity_target_not_found,{error,enoent},Name}},S);
        {DefEnv,EntType,Value} ->
            wfc_Entity_Declared(DefEnv,S,Name),
            Value2 = string_to_char_set(S#xmerl_scanner.encoding,Value),
            wfc_Internal_parsed_entity(EntType,Value2,S),
            Value
    end.

%% scan_name/2 yields an atom for names made of Latin-1 characters only,
%% but a plain string once a name contains a code point above 255 (see the
%% last clause of scan_nmtoken/7). atom_to_list/1 alone would fail with
%% badarg on the latter, so both shapes are accepted here.
reference_name_to_list(Name) when is_atom(Name) ->
    atom_to_list(Name);
reference_name_to_list(Name) when is_list(Name) ->
    Name.


%%%%%%% [66] CharRef

%% scan_char_ref_dec/3
%% Scans the digits of a decimal character reference up to and including
%% the terminating ";". Acc holds the digits read so far, in reverse order.
%% Returns {[Char], Rest, State}.
scan_char_ref_dec([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_ref_dec(MoreBytes, S1, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_ref_dec([H|T], S0, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    scan_char_ref_dec(T, S, [H|Acc]);
scan_char_ref_dec(";" ++ T, S0, Acc = [_|_]) ->
    ?bump_col(1),
    Ref = list_to_integer(lists:reverse(Acc)),
    %% wfc_legal_char/2 is fatal when Ref is not a legal XML character
    {Ch,_} = wfc_legal_char(Ref,S),
    {[Ch], T, S};
scan_char_ref_dec(_Str, S, _Acc) ->
    %% Either a non-digit inside the reference (e.g. "&#12a;") or a ";"
    %% with no digits before it. Report it as a scanner error instead of
    %% failing with function_clause / badarg.
    ?fatal(invalid_char_ref, S).
%% changed return value from [[Ref]]


%% scan_char_ref_hex/3
%% Scans the digits of a hexadecimal character reference up to and
%% including the terminating ";". Acc is the numeric value accumulated so
%% far: each digit shifts it left by four bits and ors in the new nibble.
%% Returns {[Char], Rest, State}.
scan_char_ref_hex([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_ref_hex(MoreBytes, S1, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_ref_hex([H|T], S0, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    Dec = H - $0,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex([H|T], S0, Acc) when H >= $a, H =< $f ->
    ?bump_col(1),
    Dec = (H - $a) + 10,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex([H|T], S0, Acc) when H >= $A, H =< $F ->
    ?bump_col(1),
    Dec = (H - $A) + 10,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex(";" ++ T, S0, Acc) ->
    ?bump_col(1),
    %% wfc_legal_char/2 is fatal when Acc is not a legal XML character
    {Ch,_} = wfc_legal_char(Acc,S),
    {[Ch], T, S}; %% changed return value from [[Acc]]
scan_char_ref_hex(_Str, S, _Acc) ->
    %% A character that is neither a hex digit nor ";" (e.g. "&#x1g;").
    %% Report it as a scanner error instead of failing with function_clause.
    ?fatal(invalid_char_ref, S).



%%%%%%% [25] Eq
%%% Eq ::= S? '=' S?
%% Returns {Rest, State} positioned after the "=" and any surrounding
%% whitespace; anything other than "=" is fatal.
%% NOTE(review): ?strip1 and ?strip3 are defined outside this view; from
%% their use here they presumably bind {T1,S1} from {T,S} and {T3,S3} from
%% {T2,S2} after stripping whitespace - confirm.
scan_eq(T, S) ->
    ?strip1,
    case T1 of
        [$=|T2] ->
            S2 = S1#xmerl_scanner{col=S1#xmerl_scanner.col+1},
            ?strip3,
            {T3, S3};
        _ ->
            ?fatal(assignment_expected,S)
    end.


%% scan_name/2
%%
%% We perform some checks here to make sure that the names conform to
%% the "Namespaces in XML" specification. This is an option.
%%
%% Qualified Name:
%% [6]      QName ::= (Prefix ':')? LocalPart
%% [7]      Prefix ::= NCName
%% [8]      LocalPart ::= NCName
%% [4]      NCName ::= (Letter | '_') (NCNameChar)*
%% [5]      NCNameChar ::= Letter | Digit | '.' | '-' | '_'
%%                         | CombiningChar | Extender


%% The effect of XML Names (namespace) conformance is that:
%% - All element types and attribute names contain either zero or one colon
%% - No entity names, PI targets, or notation names contain any colons.
%%
%% scan_name_no_colons/2 will ensure that the name contains no colons iff
%% the scanner has been told to be namespace conformant.
%% Otherwise, it will
%% behave exactly like scan_name/2.
%%
%% While scanning, namespace_conformant is temporarily set to no_colons;
%% the original value is restored in the returned state.
scan_name_no_colons(Str, S) ->
    NSC = S#xmerl_scanner.namespace_conformant,
    case NSC of
        true ->
            {Target, NSI, T1, S1} =
                scan_name(Str,S#xmerl_scanner{namespace_conformant=no_colons}),
            {Target,NSI,T1,S1#xmerl_scanner{namespace_conformant=NSC}};
        false ->
            scan_name(Str, S)
    end.



%% [5] Name ::= (Letter | '_' | ':') (NameChar)*
%%
%% Returns {Name, NamespaceInfo, Rest, State}. The first character is
%% checked here; the remaining characters are handled by scan_nmtoken/4.
scan_name([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_name(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_name(Str = [$:|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    %% A leading ":" is only accepted when namespace conformance is off.
    if NSC == false ->
            ?bump_col(1),
            scan_nmtoken(T, S, [$:], NSC);
       NSC == no_colons ->
            ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0);
       true ->
            %% In order to conform with the "Namespaces in XML" spec,
            %% we cannot allow names to begin with ":"
            ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0)
    end;
scan_name([$_|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    ?bump_col(1),
    scan_nmtoken(T, S, [$_], NSC);
scan_name("%"++_T,S=#xmerl_scanner{environment=prolog}) ->
    %% Parameter-entity references are rejected in the prolog environment.
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_name("%"++T,S0=#xmerl_scanner{environment={external,_}}) ->
    %% parameter entity that expands to a name
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_name(T2,S2);
scan_name(Str, S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    %% Decode the first character with the scanner's encoding; it must be
    %% a letter according to xmerl_lib:is_letter/1.
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_letter(Ch) of
        true ->
            ?bump_col(1),
            scan_nmtoken(T, S, [Ch], NSC);
        false ->
            ?fatal({invalid_name, lists:sublist(Str, 1, 6)}, S0)
    end;
%% NOTE(review): the clause above already matches every #xmerl_scanner{}
%% state, so this clause looks reachable only with a non-record second
%% argument - confirm whether it is needed.
scan_name(Str, S) ->
    ?fatal({invalid_name, Str}, S).






%% scan_nmtoken/4
%% Continues a name whose first character(s) are already in Acc (reverse
%% order). Starts with an empty prefix and Acc as the local part.
scan_nmtoken(Str, S, Acc, NSC) ->
    scan_nmtoken(Str, S, Acc, _Prefix = [], _Local = Acc, NSC,isLatin1(hd(Acc),true)).

%% scan_nmtoken/2
%% [7] NmToken ::= (NameChar)+
scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_nmtoken("%"++T, S0=#xmerl_scanner{environment={external,_}}) ->
    %% parameter entity that expands to a name token
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_nmtoken(T2,S2);
scan_nmtoken(Str, S) ->
    %% The first character must be a name character; the rest is scanned
    %% by scan_nmtoken/7 with namespace conformance switched off.
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_namechar(Ch) of
        true ->
            scan_nmtoken(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
                         _Acc = [Ch], _Prefix = [], _Local = [Ch],
                         _NamespaceConformant = false,isLatin1(Ch,true));
        false ->
            ?fatal({invalid_nmtoken, lists:sublist(Str, 1, 6)}, S)
    end.


%% scan_nmtoken/7
%% Acc     - all characters of the name so far, reversed
%% Prefix  - the part before ":" (in reading order), [] while none was seen
%% Local   - the characters after the ":" (or all of them), reversed
%% NSC     - namespace conformance: true | false | no_colons
%% IsLatin1- false as soon as one character above 255 has been seen
%%
%% Returns {Name, NamespaceInfo, Rest, State}. Name is an atom when
%% IsLatin1 is true and a string otherwise.
scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F},
             Acc, Prefix, Local, NSC,IsLatin1) ->
    %% At end of input the name scanned so far is a valid result, so the
    %% exception fun returns it instead of raising a fatal error.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes,S1,Acc,Prefix,Local,NSC,IsLatin1) end,
      fun(S1) -> {list_to_atom(lists:reverse(Acc)),
                  namespace_info(Prefix, Local),[],S1} end,
      S);
%% whitespace marks the end of a name
scan_nmtoken(Str = [H|_], S, Acc, Prefix, Local, _NSC,true) when ?whitespace(H) ->
    %% we don't strip here because the occurrence of whitespace may be an error
    %% e.g. <!ELEMENT spec (front, body, back ?)>
    NmString = lists:reverse(Acc),
    {list_to_atom(NmString), namespace_info(Prefix, Local), Str, S};
scan_nmtoken(Str = [$:|_], S, Acc, [], _Local, no_colons,_IsLatin1) ->
    %% colons are not allowed at all in this mode
    ?fatal({invalid_NCName,
            lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);
scan_nmtoken([$:|T], S0, Acc, [], Local, NSC, IsLatin1) ->
    %% first colon: what was read so far becomes the prefix
    ?bump_col(1),
    scan_nmtoken(T, S, [$:|Acc], lists:reverse(Local), [], NSC,IsLatin1);
scan_nmtoken(Str = [$:|_T], S, Acc, _Prefix, _Local, _NSC = true,_IsLatin1) ->
    %% non-empty Prefix means that we've encountered a ":" already.
    %% Conformity with "Namespaces in XML" requires
    %% at most one colon in a name
    ?fatal({invalid_NCName,
            lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);

%% non-namechar also marks the end of a name
scan_nmtoken(Str, S0, Acc, Prefix, Local, NSC,IsLatin1) ->
    ?bump_col(1),
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    case {xmerl_lib:is_namechar(Ch),IsLatin1} of
        {true,_} ->
            scan_nmtoken(T, S, [Ch|Acc], Prefix, [Ch|Local], NSC,isLatin1(Ch,IsLatin1));
        {_,true} ->
            NmStr = lists:reverse(Acc),
            {list_to_atom(NmStr), namespace_info(Prefix, Local), Str, S};
        _ ->
            %% the name contains characters above 255: return it as a string
            {lists:reverse(Acc), namespace_info(Prefix, Local), Str, S}
    end.

%% namespace_info/2
%% [] when the name has no prefix, otherwise {Prefix, LocalPart} with the
%% local part put back into reading order.
namespace_info([], _) ->
    [];
namespace_info(Prefix, Local) ->
    {Prefix, lists:reverse(Local)}.

%% isLatin1/2
%% Second argument is the result so far; it stays false once it is false
%% and becomes false for any Ch above 255.
isLatin1(_Ch,false) ->
    false;
isLatin1(Ch,_) when Ch > 255 ->
    false;
isLatin1(_,_) ->
    true.

%%%%%%% [11] SystemLiteral

%% scan_system_literal/2
%% Expects the opening quote (" or ') and scans up to the matching closing
%% quote via scan_system_literal/4.
scan_system_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_system_literal("\"" ++ T, S) ->
    scan_system_literal(T, S, $", []);
scan_system_literal("'" ++ T, S) ->
    scan_system_literal(T, S, $', []).


%% scan_system_literal/4
%% Collects the characters of a system literal until the closing Delimiter
%% is found. Returns {Literal, Rest, State}; the column is advanced by one
%% for every character consumed, including the closing quote.
scan_system_literal([], S=#xmerl_scanner{continuation_fun = F},
                    Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_system_literal(MoreBytes,S1,Delimiter,Acc)
             end,
    F(Resume, fatal_fun(unexpected_end), S);
scan_system_literal([Delimiter|Rest], S, Delimiter, Acc) ->
    %% closing quote reached
    Col = S#xmerl_scanner.col,
    {lists:reverse(Acc), Rest, S#xmerl_scanner{col = Col+1}};
scan_system_literal("#"++_R, S, _H, _Acc) ->
    %% actually not a fatal error
    ?fatal(fragment_identifier_in_system_literal,S);
scan_system_literal(Str, S, Delimiter, Acc) ->
    Col = S#xmerl_scanner.col,
    {Ch,Rest} = to_ucs(S#xmerl_scanner.encoding,Str),
    scan_system_literal(Rest, S#xmerl_scanner{col = Col+1},
                        Delimiter, [Ch|Acc]).


%%%%%%% [12] PubidLiteral

%% scan_pubid_literal/2
%% Expects the opening quote (" or ') of a public identifier literal and
%% hands over to scan_pubid_literal/4; any other first character is fatal.
scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes, S1) end,
    F(Resume, fatal_fun(unexpected_end), S);
scan_pubid_literal([Quote|Rest], S) when Quote == $'; Quote == $" ->
    Col = S#xmerl_scanner.col,
    scan_pubid_literal(Rest, S#xmerl_scanner{col = Col+1}, Quote, []);
scan_pubid_literal([H|_T], S) ->
    ?fatal({invalid_pubid_char, H}, S).


%% scan_pubid_literal/4
%% Collects the characters of a public identifier until the closing
%% Delimiter. Runs of whitespace are normalized to a single space (16#20)
%% while scanning. Returns {Literal, Rest, State}.
scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F},
                   Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_pubid_literal(MoreBytes,S1,Delimiter,Acc)
             end,
    F(Resume, fatal_fun(unexpected_end), S);
scan_pubid_literal([Delimiter|Rest], S, Delimiter, Acc) ->
    %% closing quote reached
    Col = S#xmerl_scanner.col,
    {lists:reverse(Acc), Rest, S#xmerl_scanner{col = Col+1}};
scan_pubid_literal(Str = [C|_], S, Delimiter, Acc) when ?whitespace(C) ->
    %% Before matching public identifiers, all whitespace must be normalized,
    %% so we do that here
    {_, Rest, S1} = pub_id_strip(Str, S),
    scan_pubid_literal(Rest, S1, Delimiter, [16#20|Acc]);
scan_pubid_literal([C|Rest], S, Delimiter, Acc) ->
    case is_pubid_char(C) of
        false ->
            ?fatal({invalid_pubid_char, C}, S);
        true ->
            Col = S#xmerl_scanner.col,
            scan_pubid_literal(Rest, S#xmerl_scanner{col = Col+1},
                               Delimiter, [C|Acc])
    end.

%% is_pubid_char/1
%% True for letters a-z and A-Z, digits, and the listed punctuation.
%% We do not match whitespace here, even though they're allowed in public
%% identifiers. This is because we normalize this whitespace as we scan
%% (see above in scan_pubid_literal())
%%
is_pubid_char(X) when X >= $a, X =< $z;
                      X >= $A, X =< $Z;
                      X >= $0, X =< $9 ->
    true;
is_pubid_char(X) ->
    lists:member(X, "-'()+,./:=?;!*#@$_%").


%%%%%%% [46] contentspec

%% scan_contentspec/2
%% Scans the content specification of an element declaration.
%% Returns {Spec, Rest, State} where Spec is empty, any, or the content
%% model produced by scan_elem_content/2.
scan_contentspec([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_contentspec(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_contentspec("EMPTY" ++ T, S0) ->
    ?bump_col(5),
    {empty, T, S};
scan_contentspec("ANY" ++ T, S0) ->
    ?bump_col(3),
    {any, T, S};
scan_contentspec("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_contentspec("%" ++ T, S0) ->
    %% parameter entity that expands to a content specification
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_contentspec(T2, S2);
scan_contentspec("(" ++ T, S0) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content(T1, S1);
scan_contentspec(_Str,S) ->
    ?fatal(unexpected_character,S).


%%%%%%% [47] children
%%%%%%% [51] Mixed

%% scan_elem_content/2
%% Scans a parenthesized content model; the opening "(" has already been
%% consumed by the caller.
scan_elem_content(T, S) ->
    scan_elem_content(T, S, _Context = children, _Mode = unknown, _Acc = []).

%% scan_elem_content/5
%% Context - children | mixed | not_mixed (not_mixed is used for nested
%%           groups, see scan_elem_content2/5)
%% Mode    - unknown until the first separator is seen, then seq (",")
%%           or choice ("|")
%% Acc     - the particles read so far, in reverse order
scan_elem_content([], S=#xmerl_scanner{continuation_fun = F},
                  Context, Mode, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes,S1) -> scan_elem_content(MoreBytes,S1,Context,Mode,Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_elem_content(")" ++ T, S0, Context, Mode0, Acc0) ->
    ?bump_col(1),
    {Mode, Acc} = case {Mode0, Acc0} of
                      {unknown, [_X]} ->
                          %% a single particle is treated as a sequence
                          {seq, Acc0};
                      {M, _L} when M == seq; M == choice ->
                          {Mode0, lists:reverse(Acc0)};
                      _ ->
                          %% "()" or several groups with no "," / "|"
                          %% between them, e.g. "((a)(b))". This used to
                          %% fail with case_clause; report it as a
                          %% scanner error instead.
                          ?fatal({error,{invalid_content_model,Mode0}},S)
                  end,
    {Occurrence, T1, S1} = scan_occurrence(T, S),
    vc_No_Duplicate_Types(S,Context,Acc),
    case {Occurrence, Context,Acc} of
        {once, mixed,['#PCDATA']} -> ok; % It is not ok when there are
                                         % more names than '#PCDATA'
                                         % and no '*'.
        {'*', mixed,_} -> ok;
        {Other, mixed,_} ->
            ?fatal({illegal_for_mixed_content, Other}, S1);
        _ ->
            ok
    end,
    ?strip2,
    {format_elem_content({Occurrence, {Mode, Acc}}), T2, S2};
scan_elem_content("#PCDATA" ++ _T, S, not_mixed, _Mode, _Acc) ->
    ?fatal({error,{extra_set_of_parenthesis}},S);
scan_elem_content("#PCDATA" ++ _T, S, _Cont, Mode, Acc)
  when Mode==choice;Mode==seq;Acc/=[] ->
    %% #PCDATA must come first in mixed content
    ?fatal({error,{invalid_format_of_mixed_content}},S);
scan_elem_content("#PCDATA" ++ T, S0, _Context, Mode, Acc) ->
    ?bump_col(7),
    ?strip1,
    scan_elem_content(T1, S1, mixed, Mode, ['#PCDATA'|Acc]);
scan_elem_content("," ++ _T, S, _Context, choice, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("," ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, seq, Acc);
scan_elem_content("|" ++ _T, S, _Context, seq, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("|" ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, choice, Acc);
scan_elem_content(T, S, Context, Mode, Acc) ->
    scan_elem_content2(T, S, Context, Mode, Acc).

%% scan_elem_content2/5
%% Scans one particle of a content model: a nested group, a parameter
%% entity reference, or an element name with an optional occurrence
%% indicator. Control then returns to scan_elem_content/5.
scan_elem_content2("(" ++ _T, S, mixed, _Mode, _Acc) ->
    ?fatal({error,
            {element_names_must_not_be_parenthesized_in_mixed_content}},S);
scan_elem_content2("(" ++ T, S0, Context, Mode, Acc) ->
    %% nested group: scanned with context not_mixed and a fresh accumulator
    ?bump_col(1),
    ?strip1,
    {Inner, T2, S2} = scan_elem_content(T1, S1, not_mixed, unknown, []),
    scan_elem_content(T2, S2, Context, Mode, [Inner|Acc]);
scan_elem_content2("%" ++ _T,S=#xmerl_scanner{environment=prolog},_Context,_Mode,_Acc) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_elem_content2("%" ++ T, S0, Context, Mode, Acc) ->
    %% parameter entity: its expansion is put in front of the remaining input
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2}=strip(ExpRef++T1,S1),
    scan_elem_content(T2, S2, Context, Mode, Acc);
scan_elem_content2(T, S, Context, Mode, Acc) ->
    {Name, _NameStr, T1, S1} = scan_name(T, S),
    {Occurrence, T2, S2} = scan_occurrence(T1, S1),
    %% names in mixed content must not carry an occurrence indicator
    case {Occurrence, Context} of
        {once, mixed} -> ok;
        {Other, mixed} ->
            ?fatal({illegal_for_mixed_content, Other}, S1);
        _ ->
            ok
    end,
    ?strip3,
    %% the name must be followed by ",", "|", ")" or a "%" reference
    mandatory_delimeter_wfc(T3,S3),
    NewAcc = [format_elem_content({Occurrence, Name}) | Acc],
    scan_elem_content(T3, S3, Context, Mode, NewAcc).


%% format_elem_content/1
%% A particle that occurs exactly once is represented by itself; any other
%% {Occurrence, What} pair is kept as is.
format_elem_content({once, What}) -> What;
format_elem_content(Other) -> Other.


%% scan_occurrence/2
%% Reads an optional occurrence indicator. Returns {'?' | '+' | '*' | once,
%% Rest, State}; nothing is consumed when the result is once.
scan_occurrence([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_occurrence(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_occurrence([$?|T], S0) ->
    ?bump_col(1),
    {'?', T, S};
scan_occurrence([$+|T], S0) ->
    ?bump_col(1),
    {'+', T, S};
scan_occurrence([$*|T], S0) ->
    ?bump_col(1),
    {'*', T, S};
scan_occurrence(T, S) ->
    {once, T , S}.

%%% Tests of Validity Constraints


%% first part of VC: Name Token
%% ok when C is a name character according to xmerl_lib:is_namechar/1,
%% otherwise fatal.
vc_Valid_Char(_AT,C,S) ->
    case xmerl_lib:is_namechar(C) of
        true ->
            ok;
        _ ->
            ?fatal({error,{validity_constraint_Name_Token,C}},S)
    end.



%% VC: ID Attribute Default
%% Only checked when validating against a DTD: an attribute of type ID
%% must be declared #IMPLIED or #REQUIRED.
vc_ID_Attribute_Default(_,#xmerl_scanner{validation=Valid})
  when Valid /= dtd ->
    ok;
vc_ID_Attribute_Default({_,'ID',_,Def,_},_S)
  when Def=='#IMPLIED';Def=='#REQUIRED' ->
    ok;
vc_ID_Attribute_Default({_,'ID',_,Def,_},S) ->
    ?fatal({error,{validity_constraint_error_ID_Attribute_Default,Def}},S).

%% VC: Enumeration
%% A string default value must be one of the declared names.
vc_Enumeration({_Name,{_,NameList},DefaultVal,_,_},S)
  when is_list(DefaultVal) ->
    case lists:member(list_to_atom(DefaultVal),NameList) of
        true ->
            ok;
        _ ->
            ?fatal({error,{vc_enumeration,list_to_atom(DefaultVal),NameList}},S)
    end;
vc_Enumeration({_Name,{_,_NameList},_DefaultVal,_,_},_S) ->
    ok.

%% VC: Entity Name
%% A string default value of an ENTITY attribute, and every name in the
%% default value of an ENTITIES attribute, must name an entity that the
%% rules table holds as {_,external,{_,{ndata,_}}}.
vc_Entity_Name({_Name,'ENTITY',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    vc_unparsed_entity(list_to_atom(DefaultVal),S);
vc_Entity_Name({_Name,'ENTITY',_,_,_},_S) ->
    ok;
vc_Entity_Name({_,'ENTITIES',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    %% The previous implementation walked the value with scan_name/2 but
    %% destructured its result as {N,_,State,Rest}, whereas scan_name/2
    %% returns {Name,NamespaceInfo,Rest,State}; it also never skipped the
    %% whitespace between names. Any value with more than one name
    %% therefore failed on the second iteration. The value is a
    %% whitespace separated list of names, so split it directly and
    %% convert each name the same way the 'ENTITY' clause does.
    NameList = [list_to_atom(N) || N <- string:tokens(DefaultVal," \t\r\n")],
    lists:foreach(fun(Name) -> vc_unparsed_entity(Name,S) end,NameList);
vc_Entity_Name({_,'ENTITIES',_,_,_},_S) ->
    ok.

%% Shared check for vc_Entity_Name/2: ok when Name is stored in the rules
%% table as {_,external,{_,{ndata,_}}}, otherwise fatal.
vc_unparsed_entity(Name,S) ->
    Read = S#xmerl_scanner.rules_read_fun,
    case Read(entity,Name,S) of
        {_,external,{_,{ndata,_}}} ->
            ok;
        _ -> ?fatal({error,{vc_Entity_Name,Name}},S)
    end.

%% VC: No Duplicate Types
%% When validating against a DTD, a name must not occur more than once in
%% a mixed-content declaration. Nothing is checked in any other case.
vc_No_Duplicate_Types(#xmerl_scanner{validation=dtd} = S,mixed,Acc) ->
    vc_no_duplicates_in(Acc,S);
vc_No_Duplicate_Types(_,_,_) ->
    ok.

%% Fatal on the first element that also occurs later in the list.
vc_no_duplicates_in([],_S) ->
    ok;
vc_no_duplicates_in([Type|Rest],S) ->
    case lists:member(Type,Rest) of
        true ->
            ?fatal({no_duplicate_types_allowed,Type},S);
        _ ->
            vc_no_duplicates_in(Rest,S)
    end.


%%% Tests of Well-Formedness Constraints


%% A name in a content model must be followed by ",", "|", ")" or a
%% parameter reference ("%"); anything else is fatal.
mandatory_delimeter_wfc([C|_],_S)
  when C =:= $,; C =:= $|; C =:= $); C =:= $% ->
    ok;
mandatory_delimeter_wfc(T,S) ->
    ?fatal({comma_or_vertical_bar_mandatory_between_names_in_content_model,T},S).


%% WFC: Unique Att Spec
%% No attribute name may appear twice in a tag. When the scanner is
%% namespace conformant the expanded names must be unique as well.
wfc_unique_att_spec([],_S) ->
    ok;
wfc_unique_att_spec([#xmlAttribute{name=Name,expanded_name=ExpName}|Rest],S) ->
    SameName = lists:keymember(Name,#xmlAttribute.name,Rest),
    if
        SameName ->
            ?fatal({error,{unique_att_spec_required,Name}},S);
        true ->
            SameExpName =
                S#xmerl_scanner.namespace_conformant andalso
                lists:keymember(ExpName, #xmlAttribute.expanded_name, Rest),
            if
                SameExpName ->
                    ?fatal({error,{unique_att_spec_required,ExpName}},S);
                true ->
                    wfc_unique_att_spec(Rest,S)
            end
    end.

%% WFC: Legal Character
%% Accepts either a string, whose first character is decoded with the
%% scanner's encoding, or a single code point. Returns {Char, Rest}, with
%% Rest = [] in the code point case; fatal when the character is rejected
%% by xmerl_lib:is_char/1.
wfc_legal_char(Chars,S) when is_list(Chars)->
    {Ch,Rest} = to_ucs(S#xmerl_scanner.encoding,Chars),
    wfc_legal_char_result(Ch,Rest,S);
wfc_legal_char(Ch,S) ->
    wfc_legal_char_result(Ch,[],S).

wfc_legal_char_result(Ch,Rest,S) ->
    case xmerl_lib:is_char(Ch) of
        true ->
            {Ch,Rest};
        _ ->
            ?fatal({error,{wfc_Legal_Character,Ch}},S)
    end.


%% After an attribute value the next character must be whitespace, "/" or
%% ">". The input is returned untouched as {Rest, State}; anything else is
%% fatal.
wfc_whitespace_betw_attrs([],S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> wfc_whitespace_betw_attrs(MoreBytes, S1) end,
    F(Resume, fatal_fun(unexpected_end), S);
wfc_whitespace_betw_attrs([WS |_]=L,S) when ?whitespace(WS) ->
    {L,S};
wfc_whitespace_betw_attrs([C |_]=L,S) when C =:= $/; C =:= $> ->
    {L,S};
wfc_whitespace_betw_attrs(_,S) ->
    ?fatal({whitespace_required_between_attributes},S).

%% WFC: Entity Declared
%% The only failing case: an externally declared entity referenced from a
%% document that is declared standalone.
wfc_Entity_Declared({external,_},S=#xmerl_scanner{standalone=yes},Name) ->
    ?fatal({reference_to_externally_defed_entity_standalone_doc,Name},S);
wfc_Entity_Declared(_Env,_S,_Name) ->
    ok.

%% For internal entities the replacement text is run through
%% scan_content/8 in the internal_parsed_entity environment; other entity
%% types are not checked.
wfc_Internal_parsed_entity(internal,Value,S) ->
    %% WFC test that replacement text matches production content
    EntityState = S#xmerl_scanner{environment=internal_parsed_entity},
    Space = S#xmerl_scanner.space,
    scan_content(Value,EntityState,[],[],Space,[],[],#xmlNamespace{});
wfc_Internal_parsed_entity(_,_,_) ->
    ok.

%% vc_Element_valid/3
%% In namespace conformant mode an element must not use the prefix
%% "xmlns"; otherwise defers to vc_Element_valid/2.
vc_Element_valid(_Name, {"xmlns", _},
                 S = #xmerl_scanner{namespace_conformant = true}) ->
    ?fatal({error,{illegal_element_prefix,xmlns}},S);
vc_Element_valid(Name, _, S) ->
    vc_Element_valid(Name, S).

%% vc_Element_valid/2
%% When validating against a DTD (and not inside an internal parsed
%% entity) the element must have a declaration in the rules table.
vc_Element_valid(_Name,#xmerl_scanner{environment=internal_parsed_entity}) ->
    ok;
vc_Element_valid(Name,S=#xmerl_scanner{rules_read_fun=Read,
                                       validation=dtd}) ->
    Declared = case Read(elem_def,Name,S) of
                   #xmlElement{elementdef=undeclared} -> false;
                   undefined -> false;
                   _ -> true
               end,
    case Declared of
        true ->
            ok;
        false ->
            ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S)
    end;
vc_Element_valid(_,_) ->
    ok.

%%%%%%% [74] PEDef


%% scan_pe_def/3
%% A parameter entity definition is either a quoted entity value, scanned
%% by scan_entity_value/5 with the quote as delimiter, or an external id.
scan_pe_def([], S=#xmerl_scanner{continuation_fun = F}, PEName) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pe_def(MoreBytes, S1, PEName) end,
      fatal_fun(unexpected_end),
      S);
scan_pe_def("'" ++ T, S0, PEName) ->
    ?bump_col(1),
    scan_entity_value(T, S, $', PEName,parameter);
scan_pe_def("\"" ++ T, S0, PEName) ->
    ?bump_col(1),
    scan_entity_value(T, S, $", PEName,parameter);
scan_pe_def(Str, S, _PEName) ->
    scan_external_id(Str, S).


%%%%%%% [82] NotationDecl

%% scan_notation_decl/2
%% Scans "Name S (ExternalID | PublicID) S? '>'" and stores the definition
%% through rules_write_fun. Returns {Rest, State}.
%% NOTE(review): ?strip4 is defined outside this view; from its use here it
%% presumably binds {T4,S4} from {T3,S3} after stripping whitespace -
%% confirm.
scan_notation_decl(T, #xmerl_scanner{rules_write_fun = Write,
                                     rules_read_fun=Read,
                                     rules_delete_fun=Delete} = S) ->
    {Name, _NameStr, T1, S1} = scan_name_no_colons(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {Def, T3, S3} = scan_notation_decl1(T2, S2),
    ?strip4,
    T5 = scan_mandatory(">",T4,1,S4,expected_end_tag_notation_declaration),
%    ">" ++ T5 = T4,
    %% an entry recorded as undeclared is deleted before the real
    %% definition is written; the return value of Delete is not used
    case Read(notation,Name,S) of
        undeclared -> Delete(notation,Name,S4);
        _ -> ok
    end,
    S5 = Write(notation, Name, Def, S4),
    {T5, S5}.

%% scan_notation_decl1/2
%% Returns {{system, SysLit} | {public, PubId} | {public, PubId, SysLit},
%% Rest, State}. Unlike scan_external_id/2, the system literal after
%% PUBLIC is optional: when ">" follows the public id, none is scanned.
scan_notation_decl1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_notation_decl1(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_notation_decl1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    {{system, SL}, T2, S2};
scan_notation_decl1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    ?strip3,
    case T3 of
        ">" ++ _ ->
            %% the ">" itself is left in the rest buffer for the caller
            {{public, PIDL}, T3,
             S3#xmerl_scanner{col = S3#xmerl_scanner.col+1}};
        _ ->
            {SL, T4, S4} = scan_system_literal(T3, S3),
            {{public, PIDL, SL}, T4, S4}
    end.

%%%%%%% [75] ExternalID

%% scan_external_id/2
%% Scans "SYSTEM S SystemLiteral" or "PUBLIC S PubidLiteral S
%% SystemLiteral". Returns {{system, SysLit} | {public, PubId, SysLit},
%% Rest, State}. Whitespace is mandatory after the keyword and between the
%% two literals.
scan_external_id([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_external_id(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_external_id("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    {{system, SL}, T2, S2};
scan_external_id("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    {{public, PIDL, SL}, T4, S4}.


%%%%%%% [9] EntityValue

%% Note that we have two different scan functions for EntityValue
%% They differ in that this one checks for recursive calls to the same
%% parameter entity.

%% scan_entity_value/5
%% Entry point: starts scan_entity_value/7 with an empty accumulator and
%% an empty PE nesting stack.
scan_entity_value(Str, S, Delim, Name, Namespace) ->
    scan_entity_value(Str, S, Delim, _Acc = [], Name, Namespace,[]).
3579 3580 3581scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}}, 3582 _Delim,Acc,_,_,[]) -> 3583 {lists:flatten(lists:reverse(Acc)), [], S}; 3584scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}, 3585 validation=dtd}, 3586 _Delim,_Acc,PEName,_,_) -> 3587 {{error,{failed_VC_Proper_Declaration_PE_Nesting,1,PEName}},[],S}; 3588scan_entity_value([],S, 3589 no_delim,Acc,_,_,[]) -> 3590 {lists:flatten(lists:reverse(Acc)),[],S}; 3591scan_entity_value([],S=#xmerl_scanner{validation=dtd}, 3592 no_delim,_Acc,PEName,_,_PENesting) -> 3593 {{error,{failed_VC_Proper_Declaration_PE_Nesting,2,PEName}},[],S}; 3594scan_entity_value([], S=#xmerl_scanner{continuation_fun = F}, 3595 Delim, Acc, PEName,Namespace,PENesting) -> 3596 ?dbg("cont()...~n", []), 3597 F(fun(MoreBytes, S1) -> 3598 scan_entity_value(MoreBytes,S1, 3599 Delim,Acc,PEName,Namespace,PENesting) 3600 end, 3601 fatal_fun(unexpected_end), 3602 S); 3603scan_entity_value([Delim|T], S=#xmerl_scanner{validation=dtd}, 3604 Delim,_Acc,PEName,_NS,PENesting) when length(PENesting) /= 0 -> 3605 {{error,{failed_VC_Proper_Declaration_PE_Nesting,3,PEName}},T,S}; 3606scan_entity_value([Delim|T], S0, 3607 Delim, Acc, _PEName,_NS,_PENesting) -> 3608 ?bump_col(1), 3609 {lists:flatten(lists:reverse(Acc)), T, S}; 3610scan_entity_value("%" ++ _T,S=#xmerl_scanner{environment=prolog},_,_,_,_,_) -> 3611 ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); 3612% %% This is a PEdecl in an external entity 3613% scan_entity_value([$%,WS|T], S0, Delim, Acc, PEName,Namespace,PENesting) 3614% when ?whitespace(WS) -> 3615% ?bump_col(2), 3616% scan_entity_value(T, S, Delim, [WS,$%|Acc], PEName,Namespace,PENesting); 3617scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) -> 3618 ?bump_col(1), 3619 {PERefName, T1, S1} = scan_pe_reference(T, S), 3620 if PERefName == PEName,Namespace==parameter -> 3621 ?fatal({illegal_recursion_in_PE, PEName}, S1); 3622 true -> 3623 {ExpandedRef,S2} = 3624 case 
expand_pe_reference(PERefName, S1, in_literal) of 3625 %% actually should pe ref be expanded as_PE but 3626 %% handle whitespace explicitly in this case. 3627 Tuple when is_tuple(Tuple) -> 3628 %% {system,URI} or {public,URI} 3629 %% Included in literal. 3630 {ExpRef,Sx}=fetch_not_parse(Tuple,S1), 3631 {EntV, _, S5} = 3632 scan_entity_value(ExpRef, Sx, no_delim,[], 3633 PERefName,parameter,[]), 3634 %% should do an update Write(parameter_entity) 3635 %% so next expand_pe_reference is faster 3636 {string_to_char_set(S5#xmerl_scanner.encoding, EntV), S5}; 3637 ExpRef -> 3638 {string_to_char_set(S1#xmerl_scanner.encoding, ExpRef) ,S1} 3639 end, 3640 %% single or duoble qoutes are not treated as delimeters 3641 %% in passages "included in literal" 3642 S3 = S2#xmerl_scanner{col=S2#xmerl_scanner.col+1}, 3643 {Acc2,_,S4} = scan_entity_value(ExpandedRef,S3,no_delim,Acc, 3644 PEName,Namespace,[]), 3645% {_,T2,S5} = strip(" "++T1,S4), 3646 scan_entity_value(T1,S4#xmerl_scanner{line=S3#xmerl_scanner.line, 3647 col=S3#xmerl_scanner.col}, 3648 Delim,lists:reverse(Acc2), 3649 PEName,Namespace,PENesting) 3650% scan_entity_value(T1,S4,Delim,lists:reverse(Acc2), 3651% PEName,Namespace,PENesting) 3652 end; 3653scan_entity_value("&" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) -> 3654 %% This is either a character entity or a general entity (internal 3655 %% or external) reference. An internal general entity shall not be 3656 %% expanded in an entity def XML1.0 section 4.5. 
3657 ?bump_col(1), 3658 case T of 3659 "#"++_T -> 3660 {ExpRef, T1, S1} = scan_reference(T, S), 3661 Tok = pe_nesting_token(ExpRef++T1,Namespace,S1#xmerl_scanner.validation), 3662 case markup_delimeter(ExpRef) of 3663 true -> 3664 scan_entity_value(T1, S1, Delim, [ExpRef|Acc], PEName, 3665 Namespace,pe_push(Tok,PENesting,S1)); 3666 _ -> 3667 ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef), 3668 scan_entity_value(ExpRef2 ++ T1, S1, Delim, Acc, PEName, 3669 Namespace,pe_push(Tok,PENesting,S1)) 3670 end; 3671 _ -> %% General Entity is bypassed, though must check for 3672 %% recursion: save referenced name now and check for 3673 %% recursive reference after the whole entity definition is 3674 %% completed. 3675 {Name, _NamespaceInfo, T1, S1} = scan_name(T,S), 3676 T2=scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon), 3677 S2=save_refed_entity_name(Name,PEName,S1), 3678 scan_entity_value(T2,S2,Delim,[";",atom_to_list(Name),"&"|Acc],PEName,Namespace,PENesting) 3679 end; 3680%% The following clauses is for PE Nesting VC constraint 3681%% Start delimeter for ConditionalSection 3682scan_entity_value("<!["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3683 ?bump_col(3), 3684 scan_entity_value(T,S,Delim,["<!["|Acc],PEName,NS, 3685 pe_push("<![",PENesting,S)); 3686%% Start delimeter for ConditionalSection (2) 3687scan_entity_value("["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3688 ?bump_col(1), 3689 scan_entity_value(T,S,Delim,["["|Acc],PEName,NS, 3690 pe_push("[",PENesting,S)); 3691%% Start delimeter for comment 3692scan_entity_value("<!--"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3693 ?bump_col(4), 3694 scan_entity_value(T,S,Delim,["<!--"|Acc],PEName,NS, 3695 pe_push("<!--",PENesting,S)); 3696%% Start delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl 3697scan_entity_value("<!"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3698 ?bump_col(2), 3699 scan_entity_value(T,S,Delim,["<!"|Acc],PEName,NS, 3700 
pe_push("<!",PENesting,S)); 3701%% Start delimeter for PI 3702scan_entity_value("<?"++T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3703 ?bump_col(2), 3704 scan_entity_value(T,S,Delim,["<?"|Acc],PEName,NS, 3705 pe_push("<?",PENesting,S)); 3706%% Start delimeter for elements that matches the proper stop delimeter 3707%% for a markupdecl 3708scan_entity_value("</"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3709 ?bump_col(2), 3710 scan_entity_value(T,S,Delim,["</"|Acc],PEName,NS, 3711 pe_push("</",PENesting,S)); 3712scan_entity_value("<"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3713 ?bump_col(1), 3714 scan_entity_value(T,S,Delim,["<"|Acc],PEName,NS, 3715 pe_push("<",PENesting,S)); 3716%% Delimeter for contentspecs 3717scan_entity_value("("++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> 3718 ?bump_col(1), 3719 scan_entity_value(T,S,Delim,["("|Acc],PEName,NS, 3720 pe_push("(",PENesting,S)); 3721%% Stop delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl 3722scan_entity_value(">"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3723 ?bump_col(1), 3724 scan_entity_value(T,S,Delim,[">"|Acc],PEName,NS, 3725 pe_pop(">",PENesting,S)); 3726%% Stop delimeter for PI 3727scan_entity_value("?>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3728 ?bump_col(2), 3729 scan_entity_value(T,S,Delim,["?>"|Acc],PEName,NS, 3730 pe_pop("?>",PENesting,S)); 3731%% Stop delimeter for comment 3732scan_entity_value("-->"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3733 ?bump_col(3), 3734 scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS, 3735 pe_pop("-->",PENesting,S)); 3736%% Stop delimeter for ConditionalSection 3737scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 3738 ?bump_col(3), 3739 scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS, 3740 pe_pop("]]>",PENesting,S)); 3741%% Stop delimeter added to match a content start delimeter included 3742scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> 
%% Body of the scan_entity_value("/>" ...) clause whose head closes the
%% preceding line, followed by the last scan_entity_value clauses.  This
%% fragment is reproduced without any change to its code.
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS,
                      pe_pop("/>",PENesting,S));
scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS,
                      pe_pop(")",PENesting,S));
scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) ->
    scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1},
                      Delim, ["\n"|Acc], PEName,Namespace,PENesting);
scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_char(Ch) of
        true ->
            ?bump_col(1),
            scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting);
        false ->
            ?fatal({unexpected_char,Ch}, S0)
    end.



%% save_refed_entity_name(Name, PEName, S) -> S1
%%
%% Notes in the scanner state that the entity PEName refers to the
%% entity Name.  References to the predefined entities (see
%% predefined_entity/1) are not recorded; the state is returned as is.
save_refed_entity_name(RefName, Owner, State) ->
    case predefined_entity(RefName) of
        true -> State;
        _ -> save_refed_entity_name1(RefName, Owner, State)
    end.

%% Adds RefName to the list stored under Owner in the entity_references
%% field (a list of {Owner, [RefName]} pairs).  A name that is already
%% listed is not added twice; an Owner without an entry gets a new one
%% prepended.
save_refed_entity_name1(RefName, Owner,
                        State = #xmerl_scanner{entity_references = Known}) ->
    Updated =
        case lists:keysearch(Owner, 1, Known) of
            {value, {_, Refs}} ->
                Refs1 = case lists:member(RefName, Refs) of
                            true -> Refs;
                            _ -> [RefName | Refs]
                        end,
                lists:keyreplace(Owner, 1, Known, {Owner, Refs1});
            _ ->
                [{Owner, [RefName]} | Known]
        end,
    State#xmerl_scanner{entity_references = Updated}.



%% pe_push(Token, Stack, S) -> Stack1
%%
%% Pushes a delimiter on the PE nesting stack.  Start delimiters are
%% always pushed; stop delimiters are pushed only when the scanner
%% validates against a DTD; anything else leaves the stack untouched.
pe_push(Tok, Stack, _S)
  when Tok == "<!" orelse Tok == "<?" orelse Tok == "<!--" orelse
       Tok == "<![" orelse Tok == "[" orelse Tok == "<" orelse
       Tok == "</" orelse Tok == "(" ->
    [Tok | Stack];
pe_push(Tok, Stack, #xmerl_scanner{validation = dtd})
  when Tok == ")" orelse Tok == ">" orelse Tok == "?>" orelse
       Tok == "]]>" orelse Tok == "-->" orelse Tok == "/>" ->
    [Tok | Stack];
pe_push(_Tok, Stack, _S) ->
    Stack.

%% pe_pop(StopDelimiter, Stack, S) -> Stack1
%%
%% Removes the start delimiter(s) that StopDelimiter closes from the top
%% of the PE nesting stack.  If the top of the stack does not match, a
%% fatal error is raised when validating against a DTD; otherwise the
%% stack is returned unchanged.
pe_pop(">",["<!"|Rest],_S) -> Rest;
pe_pop("?>",["<?"|Rest],_S) -> Rest;
pe_pop("-->",["<!--"|Rest],_S) -> Rest;
pe_pop("]]>",["[","<!["|Rest],_S) -> Rest;
pe_pop("/>",["<"|Rest],_S) -> Rest;
pe_pop(">",["<"|Rest],_S) -> Rest;
pe_pop(">",["</"|Rest],_S) -> Rest;
pe_pop(")",["("|Rest],_S) -> Rest;
pe_pop(Token,_Stack,S=#xmerl_scanner{validation=dtd}) ->
    ?fatal({error,{failed_VC_Proper_Declaration_PE_Nesting,5,Token}},S);
pe_pop(_,Rest,_) ->
    Rest.

%% pe_nesting_token(String, Namespace, Validation) -> Token | false
%%
%% Returns the markup delimiter that String starts with, provided the
%% entity is a parameter entity and DTD validation is on; otherwise
%% false.
%%
%% Clause order matters: the longer delimiters "<!--" and "<![" must be
%% tried before their prefix "<!", or they can never match.
pe_nesting_token("<!--"++_T,parameter,dtd) -> "<!--";
pe_nesting_token("<!["++_T,parameter,dtd) -> "<![";
pe_nesting_token("<!"++_T,parameter,dtd) -> "<!";
pe_nesting_token("<?"++_T,parameter,dtd) -> "<?";
pe_nesting_token("["++_T,parameter,dtd) -> "[";
pe_nesting_token("("++_T,parameter,dtd) -> "(";
pe_nesting_token(">"++_T,parameter,dtd) -> ">";
pe_nesting_token("?>"++_T,parameter,dtd) -> "?>";
pe_nesting_token("-->"++_T,parameter,dtd) -> "-->";
pe_nesting_token("]]>"++_T,parameter,dtd) -> "]]>";
pe_nesting_token(")"++_T,parameter,dtd) -> ")";
pe_nesting_token("/>"++_T,parameter,dtd) -> "/>";
pe_nesting_token(_,_,_) -> false.

%% True for the five entity names amp, lt, gt, apos and quot.
predefined_entity(amp) -> true;
predefined_entity(lt) -> true;
predefined_entity(gt) -> true;
predefined_entity(apos) -> true;
predefined_entity(quot) -> true;
predefined_entity(_) -> false.

%% check_entity_recursion(EName, S) -> ok
%%
%% Builds a graph from the recorded entity references and raises the
%% fatal error {illegal_recursion_in_Entity, EName} if that graph cannot
%% be created as an acyclic digraph.  The temporary digraph is deleted
%% again before returning.
check_entity_recursion(EName,
                       S=#xmerl_scanner{entity_references=EntityRefList}) ->
    Set = sofs:family(EntityRefList),
    case catch sofs:family_to_digraph(Set, [acyclic]) of
        {'EXIT',{cyclic,_}} ->
            ?fatal({illegal_recursion_in_Entity, EName}, S);
        DG ->
            digraph:delete(DG),
            ok
    end.




%%%%%%% [15] Comment
%% Scans a comment with no position, no parents and no language.
scan_comment(Str, S) ->
    scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []).

%% scan_comment(Str, S, Pos, Parents, Lang) -> {Ret, Rest, S1}
%%
%% Creates the #xmlComment{} record, reports a `started' event through
%% the event fun and then scans the comment text with scan_comment1/5.
scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) ->
    Comment = #xmlComment{pos = Pos,
                          parents = Parents,
                          language = Lang,
                          value = undefined},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = S#xmerl_scanner.line,
                                               col = C,
                                               pos = Pos,
                                               data = Comment}, S),

    scan_comment1(Str, S1, Pos, Comment, _Acc = []).

%% scan_comment1(Str, S, Pos, Comment, Acc) -> {Ret, Rest, S1}
%%
%% Accumulates the comment text (reversed) in Acc until "-->" is found.
%% Then the `ended' event is reported, the hook fun is applied to the
%% finished comment and whitespace after the comment is stripped.
%% A "--" that is not part of "-->" is a fatal error.  Line ends (LF,
%% CR LF and lone CR) are all stored as a single LF.
scan_comment1([], S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_comment1(Dashes, S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc)
  when Dashes =:= "-"; Dashes =:= "--" ->
    %% The buffer ends in what may be the beginning of "-->".  Ask for
    %% more input and rescan with the dashes put back in front, so that
    %% a terminator split between two chunks is still recognised and
    %% hd/1 below is never applied to an empty list.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
              scan_comment1(Dashes ++ MoreBytes, S1, Pos, Comment, Acc)
      end,
      fatal_fun(unexpected_end),
      S);
scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C,
                                              event_fun = Event,
                                              hook_fun = Hook},
              _Pos, Comment, Acc) ->
    ?bump_col(3),
    Comment1 = Comment#xmlComment{value = lists:reverse(Acc)},
    S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended,
                                           line=S#xmerl_scanner.line,
                                           col = C,
                                           data = Comment1}, S),
    {Ret, S2} = Hook(Comment1, S1),
    {_,T3,S3}=strip(T,S2),
    {Ret,T3,S3};
scan_comment1("--"++T,S,_Pos,_Comment,_Acc) ->
    %% T is never empty here; see the clause for a trailing "--" above.
    ?fatal({invalid_comment,"--"++[hd(T)]}, S);
scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR followed by LF is read as a single LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR not followed by LF is read as a LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) ->
    {Ch,T} = wfc_legal_char(Str,S),
    scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]).

%%%%%%%

%% scan_markup_completion_gt(Str, S) -> {Str1, S1}
%%
%% Succeeds when the input is positioned at the closing ">" of a markup
%% declaration (the ">" itself is left in the returned string).  A
%% parameter entity reference in front of it is expanded, surrounding
%% whitespace stripped, and the check repeated.  Anything else is a
%% fatal error.
scan_markup_completion_gt([$>|_R]=T,S) ->
    {T,S};
scan_markup_completion_gt([$%|T],S0) ->
    ?bump_col(1),
    {Name,T1,S1} = scan_pe_reference(T,S),
    ExpandedRef = expand_pe_reference(Name,S1,as_PE),
    {_,T2,S2} = strip(ExpandedRef++T1,S1),
    scan_markup_completion_gt(T2,S2);
scan_markup_completion_gt(T,S) ->
    ?fatal({error,{malformed_syntax_entity_completion,T}},S).


%% scan_mandatory(Pattern, T, N, S, ErrorMsg) -> Rest
%%
%% Requires T to start with Pattern and returns T without its first N
%% elements; otherwise raises the fatal error ErrorMsg.  Callers are
%% expected to pass N equal to the length of Pattern.
scan_mandatory(Pattern,T,N,S,ErrorMsg) ->
    case lists:prefix(Pattern,T) of
        true ->
            lists:nthtail(N,T);
        _ ->
            ?fatal(ErrorMsg,S)
    end.


%% strip(Str, S) -> {[], Rest, S1}
%%
%% Skips leading whitespace (space, tab, LF, CR LF, CR) and keeps the
%% line and column of the scanner state up to date.
strip(Str,S) ->
    strip(Str,S,all).

%% strip(Str, S, Lim) -> {[], Rest, S1}
%%
%% As strip/2.  With Lim = no_tab a tab character is a fatal error
%% instead of being skipped.  When the input runs out the continuation
%% fun is asked for more; the limit is carried over to the resumed scan
%% so that a no_tab restriction also applies to the new input.
strip([], S=#xmerl_scanner{continuation_fun = F},Lim) ->
    ?dbg("cont()... stripping whitespace~n", []),
    F(fun(MoreBytes, S1) -> strip(MoreBytes, S1, Lim) end,
      fun(S1) -> {[], [], S1} end,
      S);
strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = C+1},Lim);
strip("\t" ++ _T, S ,no_tab) ->
    ?fatal({error,{no_tab_allowed}},S);
strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim);
strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR followed by LF is read as a single LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR not followed by LF is read as a LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip(Str, S,_Lim) ->
    {[], Str, S}.

%% demands a whitespace, though a parameter entity is ok, it will
%% expand with a whitespace on each side.
mandatory_strip([], State) ->
    ?fatal({error,{whitespace_was_expected}}, State);
mandatory_strip(Input, State) when ?whitespace(hd(Input)) ->
    strip(Input, State, all);
mandatory_strip([$% | Rest], State) when ?whitespace(hd(Rest)) ->
    %% "%" followed by whitespace is a PE declaration, not a PE reference
    ?fatal({error,{whitespace_was_expected}}, State);
mandatory_strip([$% | _] = Input, State) ->
    %% a PE reference is accepted in place of whitespace and left unread
    {[], Input, State};
mandatory_strip(_Input, State) ->
    ?fatal({error,{whitespace_was_expected}}, State).

%% Like strip/2, except that a tab character is not accepted.
pub_id_strip(Input, State) ->
    strip(Input, State, no_tab).


%% normalize(Str, S, IsNorm) -> {Rest, S1, IsNorm1}
%%
%% Removes leading whitespace, including whitespace written as a
%% reference.  IsNorm1 is true as soon as literal whitespace has been
%% stripped, otherwise IsNorm is passed through.  A reference that does
%% not expand to something starting with whitespace is left in the
%% input untouched.
normalize([$& | AfterAmp] = Input, State, IsNorm) ->
    case scan_reference(AfterAmp, State) of
        {Expanded, Rest, State1} when ?whitespace(hd(Expanded)) ->
            Encoded = string_to_char_set(State#xmerl_scanner.encoding,
                                         Expanded),
            normalize(Encoded ++ Rest, State1, IsNorm);
        _ ->
            {Input, State, IsNorm}
    end;
normalize(Input, State, IsNorm) ->
    case strip(Input, State) of
        {_, Input, State} ->
            %% nothing was stripped
            {Input, State, IsNorm};
        {_, Rest, State1} ->
            normalize(Rest, State1, true)
    end.


%% Fast path for the whitespace that follows a newline.  A run of only
%% spaces or only tabs is counted instead of collected, so no list has
%% to be built or reversed; the text to return is then taken from the
%% common_data field of the scanner state.
%% Must only be called right after a "\n" has been seen.  The third
%% argument is the input to fall back on in the general case.
fast_accumulate_whitespace([$\s | Rest], State, _) ->
    fast_acc_spaces(Rest, State, 1);
fast_accumulate_whitespace([$\t | Rest], State, _) ->
    fast_acc_tabs(Rest, State, 1);
fast_accumulate_whitespace([$< | _] = Input, State, _Fallback) ->
    #xmerl_scanner{common_data = Common, line = LineNo} = State,
    NewState = State#xmerl_scanner{col = 1, line = LineNo + 1},
    {done, {element(3, Common), Input, NewState}};
fast_accumulate_whitespace(_, State, Fallback) ->
    accumulate_whitespace(Fallback, State, []).

%% Counts consecutive spaces; Count spaces lead to column Count.
fast_acc_spaces([$\s | Rest], State, Count) ->
    fast_acc_spaces(Rest, State, Count + 1);
fast_acc_spaces(Rest, State, Count) ->
    fast_acc_end(Rest, State, Count, Count, $\s, 1).

%% Counts consecutive tabs; Count tabs lead to column Count * 8 + 1.
fast_acc_tabs([$\t | Rest], State, Count) ->
    fast_acc_tabs(Rest, State, Count + 1);
fast_acc_tabs(Rest, State, Count) ->
    fast_acc_end(Rest, State, Count, Count * 8 + 1, $\t, 2).

%% Finishes the fast path after Count characters Char have been read.
%% If the next character is "<", the ready-made whitespace string is
%% looked up in common_data (tuple number Index, element Count).  If
%% that fails for any reason (other character, end of input, no entry
%% for Count) the general accumulator takes over, seeded with the
%% whitespace already consumed.
fast_acc_end(Rest, State, Count, NewCol, Char, Index) ->
    #xmerl_scanner{common_data = Common, line = PrevLine} = State,
    NewLine = PrevLine + 1,
    try
        $< = hd(Rest),
        {done, {element(Count, element(Index, Common)),
                Rest,
                State#xmerl_scanner{col = NewCol, line = NewLine}}}
    catch
        _:_ ->
            Seen = lists:duplicate(Count, Char) ++ "\n",
            accumulate_whitespace(Rest, State, NewLine, NewCol, Seen)
    end.


%%% @spec accumulate_whitespace(T::string(),S::global_state(),
%%%                             atom(),Acc::string()) -> {Acc, T1, S1}
%%%
%%% @doc Accumulates leading whitespace. In mode <code>preserve</code>
%%% the whitespace characters are pushed onto Acc; in mode
%%% <code>normalize</code> the whole run is replaced by one space.
accumulate_whitespace(Input, State, preserve, Acc) ->
    accumulate_whitespace(Input, State, Acc);
accumulate_whitespace(Input, State, normalize, Acc) ->
    {_Skipped, Rest, State1} = accumulate_whitespace(Input, State, []),
    {[$\s | Acc], Rest, State1}.

%% Starts accumulating at the line and column of the scanner state.
accumulate_whitespace(Input, State, Acc) ->
    #xmerl_scanner{line = LineNo, col = ColNo} = State,
    accumulate_whitespace(Input, State, LineNo, ColNo, Acc).

%% Worker: line and column are kept as separate arguments and written
%% back to the scanner state only when the run ends or more input has
%% to be requested.  All line ends are stored as a single LF.
accumulate_whitespace([], State0, LineNo, ColNo, Acc) ->
    #xmerl_scanner{continuation_fun = ContFun} = State0,
    State = State0#xmerl_scanner{line = LineNo, col = ColNo},
    ?dbg("cont()...~n", []),
    ContFun(fun(MoreBytes, S1) ->
                    accumulate_whitespace(MoreBytes, S1, Acc)
            end,
            fun(S1) -> {Acc, [], S1} end,
            State);
accumulate_whitespace([$\s | Rest], State, LineNo, ColNo, Acc) ->
    accumulate_whitespace(Rest, State, LineNo, ColNo + 1, [$\s | Acc]);
accumulate_whitespace([$\t | Rest], State, LineNo, ColNo, Acc) ->
    accumulate_whitespace(Rest, State, LineNo, expand_tab(ColNo),
                          [$\t | Acc]);
accumulate_whitespace([$\n | Rest], State, LineNo, _ColNo, Acc) ->
    accumulate_whitespace(Rest, State, LineNo + 1, 1, [$\n | Acc]);
accumulate_whitespace([$\r, $\n | Rest], State, LineNo, _ColNo, Acc) ->
    %% CR LF counts as one LF
    accumulate_whitespace(Rest, State, LineNo + 1, 1, [$\n | Acc]);
accumulate_whitespace([$\r | Rest], State, LineNo, _ColNo, Acc) ->
    %% a lone CR counts as LF
    accumulate_whitespace(Rest, State, LineNo + 1, 1, [$\n | Acc]);
accumulate_whitespace(Input, State, LineNo, ColNo, Acc) ->
    {Acc, Input, State#xmerl_scanner{line = LineNo, col = ColNo}}.

%% Column reached by a tab typed in column Col (tab stops every 8
%% columns, columns counted from 1).
expand_tab(Col) ->
    Col + 8 - ((Col - 1) rem 8).

%% validation_mode(Validation)
%% Validation = off | dtd | schema | true | false
%% The obsolete values false and true are mapped to off and dtd.
validation_mode(false) -> off;
validation_mode(true) -> dtd;
validation_mode(Mode) -> Mode.


%% schemaLocations(Element, S) -> {ok, [{Namespace, Location}]} | {error, Reason}
%%
%% A non-empty list of pairs in the schemaLocation field of the scanner
%% state takes precedence; otherwise the pairs are read from the
%% element's schemaLocation attribute.
schemaLocations(_El, #xmerl_scanner{schemaLocation = [{_,_}|_] = Pairs}) ->
    {ok, Pairs};
schemaLocations(El, #xmerl_scanner{}) ->
    schemaLocations(El).

%% Reads the first schemaLocation attribute of the element and splits
%% its whitespace separated value into {Namespace, Location} pairs.
schemaLocations(#xmlElement{attributes = Atts}) ->
    NotSchemaLocation =
        fun(#xmlAttribute{name = schemaLocation}) -> false;
           (#xmlAttribute{nsinfo = {_, "schemaLocation"}}) -> false;
           (_) -> true
        end,
    case lists:dropwhile(NotSchemaLocation, Atts) of
        [] ->
            {error,{missing_schemaLocation}};
        [#xmlAttribute{value = Paths} | _] ->
            case string:tokens(Paths, " \n\t\r") of
                [] ->
                    {error,{missing_schemaLocation}};
                Tokens ->
                    case length(Tokens) rem 2 of
                        0 ->
                            ToPairs =
                                fun([], _Self) ->
                                        [];
                                   ([Ns, Loc | More], Self) ->
                                        [{Ns, Loc} | Self(More, Self)]
                                end,
                            {ok, ToPairs(Tokens, ToPairs)};
                        _ ->
                            {error,{schemaLocation_attribute,namespace_location_not_in_pair}}
                    end
            end
    end.

%% Options handed on from this scanner state: the xmlbase as xsdbase.
inherit_options(State) ->
    [{xsdbase, State#xmerl_scanner.xmlbase}].

%% Unwraps the result of a schema validation: the validated element is
%% returned together with the state, an error is turned into a fatal
%% error.
handle_schema_result({#xmlElement{} = Validated, _}, State) ->
    {Validated, State};
handle_schema_result({error, Reason}, State) ->
    ?fatal({failed_schema_validation,Reason}, State).

%%% Helper functions

-compile({inline, [fatal_fun/1]}).

-spec fatal_fun(_) -> fun((_) -> no_return()).

%% Returns a fun of one argument (the scanner state) that raises the
%% fatal error Reason.
fatal_fun(Reason) ->
    fun(State) -> ?fatal(Reason, State) end.

%% fatal(Reason, S) -> no return
%%
%% Terminates the scanning process with
%% {fatal, {Reason, {file,F}, {line,L}, {col,C}}}, where file, line and
%% column are taken from the scanner state.
fatal(Reason, S) ->
    exit({fatal, {Reason,
                  {file,S#xmerl_scanner.filename},
                  {line,S#xmerl_scanner.line},
                  {col,S#xmerl_scanner.col}}}).

%% preformat(L1, L2, Sep) -> string()
%%
%% Formats the tokens of L1 separated by single spaces, followed by the
%% tokens of L2, each preceded by a space and separated by Sep.
%% Both lists are expected to be non-empty.
preformat(L1,L2,Sep) ->
    Format1 = lists:flatten(lists:duplicate(length(L1)-1,"~s ")++"~s"),
    Format2 = lists:flatten(lists:duplicate(length(L2)-1,
                                            " ~s"++Sep)++" ~s"),

    lists:flatten(io_lib:format(Format1++Format2,L1++L2)).


%% rules_write(Context, Name, Value, S) -> S
%%
%% Stores Value under {Context, Name} in the rules table unless that
%% key already has an entry; an existing entry is never overwritten.
%% ets:insert_new/2 does the check and the insertion in one atomic
%% step.
%%
%% NOTE (original remark, reworded): with several <!ATTLIST ...>
%% declarations not all attributes end up saved in the rules table --
%% presumably because an existing entry is kept as it is; TODO confirm.
rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) ->
    ets:insert_new(T, {{Context, Name}, Value}),
    S.


%% rules_read(Context, Name, S) -> Value | undefined
%%
%% Looks up {Context, Name} in the rules table.
rules_read(Context, Name, #xmerl_scanner{rules = T}) ->
    case ets:lookup(T, {Context, Name}) of
        [] ->
            undefined;
        [{_, V}] ->
            V
    end.

%% Removes the entry for {Context, Name} from the rules table.
rules_delete(Context,Name,#xmerl_scanner{rules = T}) ->
    ets:delete(T,{Context,Name}).

%% to_ucs(Encoding, Chars) -> {Char, Rest}
%%
%% Takes one character from the front of Chars.  For the encoding
%% "utf-8", or when no encoding is known, a UTF-8 sequence is decoded;
%% for any other encoding the first element is returned as it is.
to_ucs(Encoding, Chars) when Encoding=="utf-8"; Encoding == undefined ->
    utf8_2_ucs(Chars);
to_ucs(_,[C|Rest]) ->
    {C,Rest}.

%% utf8_2_ucs(Bytes) -> {Char, Rest} | {{error,{bad_character,X}}, Rest}
%%
%% Decodes the UTF-8 sequence at the head of Bytes into one code point.
%% Sequences of one to four bytes are recognised.  An overlong encoding
%% (a value that would have fitted in a shorter sequence) and a byte
%% that cannot start a sequence are reported as bad_character.
utf8_2_ucs([B1, B2, B3, B4 | Tail]) when B1 band 16#f8 =:= 16#f0,
                                         B2 band 16#c0 =:= 16#80,
                                         B3 band 16#c0 =:= 16#80,
                                         B4 band 16#c0 =:= 16#80 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    CodePoint = ((B1 band 16#07) bsl 18) bor ((B2 band 16#3f) bsl 12) bor
                ((B3 band 16#3f) bsl 6) bor (B4 band 16#3f),
    if
        CodePoint >= 16#10000 -> {CodePoint, Tail};
        true -> {{error,{bad_character,CodePoint}}, Tail}
    end;
utf8_2_ucs([B1, B2, B3 | Tail]) when B1 band 16#f0 =:= 16#e0,
                                     B2 band 16#c0 =:= 16#80,
                                     B3 band 16#c0 =:= 16#80 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    CodePoint = ((B1 band 16#0f) bsl 12) bor ((B2 band 16#3f) bsl 6) bor
                (B3 band 16#3f),
    if
        CodePoint >= 16#800 -> {CodePoint, Tail};
        true -> {{error,{bad_character,CodePoint}}, Tail}
    end;
utf8_2_ucs([B1, B2 | Tail]) when B1 band 16#e0 =:= 16#c0,
                                 B2 band 16#c0 =:= 16#80 ->
    %% 110vvvvv 10vvvvvv
    CodePoint = ((B1 band 16#1f) bsl 6) bor (B2 band 16#3f),
    if
        CodePoint >= 16#80 -> {CodePoint, Tail};
        true -> {{error,{bad_character,CodePoint}}, Tail}
    end;
utf8_2_ucs([B1 | Tail]) when B1 < 16#80 ->
    %% 0vvvvvvv
    {B1, Tail};
utf8_2_ucs([B1 | Tail]) ->
    {{error,{bad_character,B1}}, Tail}.

%% to_char_set("iso-10646-utf-1",Ch) ->
%%     [Ch];
%% to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined ->
%%     ucs_2_utf8(Ch);
%% to_char_set(_,Ch) ->
%%     [Ch].

%% ucs_2_utf8(Char) -> [byte()]
%%
%% Encodes one code point below 16#200000 as a UTF-8 sequence of one to
%% four bytes.
ucs_2_utf8(Char) when Char < 128 ->
    %% 0vvvvvvv
    [Char];
ucs_2_utf8(Char) when Char < 16#0800 ->
    %% 110vvvvv 10vvvvvv
    [16#c0 bor ((Char bsr 6) band 16#1f),
     16#80 bor (Char band 16#3f)];
ucs_2_utf8(Char) when Char < 16#10000 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    [16#e0 bor ((Char bsr 12) band 16#0f),
     16#80 bor ((Char bsr 6) band 16#3f),
     16#80 bor (Char band 16#3f)];
ucs_2_utf8(Char) when Char < 16#200000 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    [16#f0 bor ((Char bsr 18) band 16#07),
     16#80 bor ((Char bsr 12) band 16#3f),
     16#80 bor ((Char bsr 6) band 16#3f),
     16#80 bor (Char band 16#3f)].


%% string_to_char_set(Encoding, Str) -> string()
%%
%% Encodes every code point of Str as UTF-8 when the encoding is
%% "utf-8" or unknown; for any other encoding Str is returned as is.
string_to_char_set(Enc, Str) when Enc =:= "utf-8"; Enc =:= undefined ->
    lists:flatten([ucs_2_utf8(CodePoint) || CodePoint <- Str]);
string_to_char_set(_Enc, Str) ->
    Str.

%% diagnose(Line) ->
%%     Mem=erlang:memory(),
%%     {OldTot,OldLine} = get_total(),
%%     NewTot =
%%         case {lists:keysearch(total,1,Mem),OldTot*1.1} of
%%             {{_,{_,Tot}},Tot110} when Tot > Tot110 ->
%%                 ?dbg("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]),
%%                 Tot;
%%             {{_,{_,Tot}},_} ->
%%                 Tot
%%         end,
%%     put_total({NewTot,Line}).

%% get_total() ->
%%     case get(xmerl_mem) of
%%         undefined ->
%%             put(xmerl_mem,{0,0}),
%%             {0,0};
%%         M -> M
%%     end.

%% put_total(M) ->
%%     put(xmerl_mem,M).