1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2003-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
%% Description  : Single-pass XML scanner. See xmerl.hrl for data defs.
22
23%% @doc This module is the interface to the XML parser, it handles XML 1.0.
24%%     The XML parser is activated through
25%%     <tt>xmerl_scan:string/[1,2]</tt> or
26%%     <tt>xmerl_scan:file/[1,2]</tt>.
27%%     It returns records of the type defined in xmerl.hrl.
28%% See also <a href="xmerl_examples.html">tutorial</a> on customization
29%% functions.
30%% @type global_state(). <p>
31%% The global state of the scanner, represented by the #xmerl_scanner{} record.
32%% </p>
33%% @type option_list(). <p>Options allow to customize the behaviour of the
34%%     scanner.
35%% See also <a href="xmerl_examples.html">tutorial</a> on customization
36%% functions.
37%% </p>
38%% <p>
39%% Possible options are:
40%% </p>
41%% <dl>
42%%  <dt><code>{acc_fun, Fun}</code></dt>
43%%    <dd>Call back function to accumulate contents of entity.</dd>
44%%  <dt><code>{continuation_fun, Fun} |
45%%            {continuation_fun, Fun, ContinuationState}</code></dt>
46%%    <dd>Call back function to decide what to do if the scanner runs into EOF
47%%     before the document is complete.</dd>
48%%  <dt><code>{event_fun, Fun} |
49%%            {event_fun, Fun, EventState}</code></dt>
50%%    <dd>Call back function to handle scanner events.</dd>
51%%  <dt><code>{fetch_fun, Fun} |
52%%            {fetch_fun, Fun, FetchState}</code></dt>
53%%    <dd>Call back function to fetch an external resource.</dd>
54%%  <dt><code>{hook_fun, Fun} |
55%%            {hook_fun, Fun, HookState}</code></dt>
56%%    <dd>Call back function to process the document entities once
57%%     identified.</dd>
58%%  <dt><code>{close_fun, Fun}</code></dt>
59%%    <dd>Called when document has been completely parsed.</dd>
60%%  <dt><code>{rules, ReadFun, WriteFun, RulesState} |
61%%            {rules, Rules}</code></dt>
62%%    <dd>Handles storing of scanner information when parsing.</dd>
63%%  <dt><code>{user_state, UserState}</code></dt>
64%%    <dd>Global state variable accessible from all customization functions</dd>
65%%
66%%  <dt><code>{fetch_path, PathList}</code></dt>
67%%    <dd>PathList is a list of
68%%     directories to search when fetching files. If the file in question
69%%     is not in the fetch_path, the URI will be used as a file
70%%     name.</dd>
71%%  <dt><code>{space, Flag}</code></dt>
72%%    <dd>'preserve' (default) to preserve spaces, 'normalize' to
73%%    accumulate consecutive whitespace and replace it with one space.</dd>
74%%  <dt><code>{line, Line}</code></dt>
75%%    <dd>To specify starting line for scanning in document which contains
76%%    fragments of XML.</dd>
77%%  <dt><code>{namespace_conformant, Flag}</code></dt>
78%%    <dd>Controls whether to behave as a namespace conformant XML parser,
79%%    'false' (default) to not otherwise 'true'.</dd>
80%%  <dt><code>{validation, Flag}</code></dt>
81%%    <dd>Controls whether to process as a validating XML parser:
82%%    'off' (default) no validation, or validation 'dtd' by DTD or 'schema'
83%%    by XML Schema. 'false' and 'true' options are obsolete
84%%    (i.e. they may be removed in a future release), if used 'false'
85%%    equals 'off' and 'true' equals 'dtd'.</dd>
86%%  <dt><code>{schemaLocation, [{Namespace,Link}|...]}</code></dt>
87%%    <dd>Tells explicitly which XML Schema documents to use to validate
88%%    the XML document. Used together with the
89%%    <code>{validation,schema}</code> option.</dd>
90%%  <dt><code>{quiet, Flag}</code></dt>
91%%    <dd>Set to 'true' if xmerl should behave quietly and not output any
92%%    information to standard output (default 'false').</dd>
93%%  <dt><code>{doctype_DTD, DTD}</code></dt>
94%%    <dd>Allows to specify DTD name when it isn't available in the XML
95%%    document. This option has effect only together with
%%    <code>{validation,'dtd'}</code> option.</dd>
97%%  <dt><code>{xmlbase, Dir}</code></dt>
98%%    <dd>XML Base directory. If using string/1 default is current directory.
99%%    If using file/1 default is directory of given file.</dd>
100%%  <dt><code>{encoding, Enc}</code></dt>
101%%    <dd>Set default character set used (default UTF-8).
102%%    This character set is used only if not explicitly given by the XML
103%%    declaration. </dd>
104%%  <dt><code>{document, Flag}</code></dt>
105%%    <dd>Set to 'true' if xmerl should return a complete XML document
106%%    as an xmlDocument record (default 'false').</dd>
107%%  <dt><code>{comments, Flag}</code></dt>
108%%    <dd>Set to 'false' if xmerl should skip comments otherwise they will
109%%    be returned as xmlComment records (default 'true').</dd>
110%%  <dt><code>{default_attrs, Flag}</code></dt>
111%%    <dd>Set to 'true' if xmerl should add to elements missing attributes
112%%    with a defined default value (default 'false').</dd>
113%% </dl>
114%% @type xmlElement() = #xmlElement{}.
115%% The record definition is found in xmerl.hrl.
116%% @type xmlDocument() = #xmlDocument{}.
117%% The record definition is found in xmerl.hrl.
118%% @type document() = xmlElement() | xmlDocument(). <p>
119%% The document returned by <tt>xmerl_scan:string/[1,2]</tt> and
120%% <tt>xmerl_scan:file/[1,2]</tt>. The type of the returned record depends on
121%% the value of the document option passed to the function.
122%% </p>
123
124-module(xmerl_scan).
125-vsn('0.20').
126-date('03-09-16').
127
128%% main API
129-export([string/1, string/2,
130	 file/1, file/2]).
131
132%% access functions for various states
133-export([user_state/1, user_state/2,
134	 event_state/1, event_state/2,
135	 hook_state/1, hook_state/2,
136	 rules_state/1, rules_state/2,
137	 fetch_state/1, fetch_state/2,
138	 cont_state/1, cont_state/2]).
139
140%% helper functions. To xmerl_lib ??
141-export([accumulate_whitespace/4]).
142
143-export_type([xmlElement/0]).
144
145%-define(debug, 1).
146-include("xmerl.hrl").		% record def, macros
147-include("xmerl_internal.hrl").
148-include_lib("kernel/include/file.hrl").
149
150-type xmlElement() :: #xmlElement{}.
151
%% Report a fatal scanner error.
%% Unless the quiet flag of the scanner state S is set, the line number of
%% the macro call site (?LINE) and Reason are logged with
%% error_logger:error_msg/2. In both cases fatal/2 is then called with
%% Reason and S (fatal/2 is not defined in this part of the file).
-define(fatal(Reason, S),
	if
	    S#xmerl_scanner.quiet ->
		ok;
	    true ->
		error_logger:error_msg("~p- fatal: ~p~n", [?LINE, Reason]),
		ok
	end,
	fatal(Reason, S)).
161
162
%% Expands to a copy of the scanner state S with its user_state field set to U.
-define(ustate(U, S), S#xmerl_scanner{user_state = U}).
164
165
166%% Functions to access the various states
167
%%% @spec user_state(S::global_state()) -> UserState
%%% @doc Returns the user state stored in the global scanner state.
user_state(#xmerl_scanner{user_state = UserState}) -> UserState.
171
%%% @spec event_state(S::global_state()) -> EventState
%%% @doc Returns the event function state stored in the global scanner state.
event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = EventState}}) ->
    EventState.
175
%%% @spec hook_state(S::global_state()) -> HookState
%%% @doc Returns the hook function state stored in the global scanner state.
hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = HookState}}) ->
    HookState.
179
%%% @spec rules_state(S::global_state()) -> RulesState
%%% @doc Returns the rules function state stored in the global scanner state.
rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = RulesState}}) ->
    RulesState.
183
%%% @spec fetch_state(S::global_state()) -> FetchState
%%% @doc Returns the fetch function state stored in the global scanner state.
fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = FetchState}}) ->
    FetchState.
187
%%% @spec cont_state(S::global_state()) -> ContinuationState
%%% @doc Returns the continuation function state stored in the global
%%% scanner state.
cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = ContState}}) ->
    ContState.
191
192
193%%%% Functions to modify the various states
194
%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc Stores UserState in the global scanner state and returns the updated
%%% state. Intended to be called from a user (customization) function.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
user_state(UserState, Scanner) ->
    Scanner#xmerl_scanner{user_state = UserState}.
200
%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc Stores EventState as the state of the event function and returns the
%%% updated global state. Intended for use in an event function, which is
%%% called at the beginning and at the end of a parsed entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
event_state(EventState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{event = EventState}}.
208
%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc Stores HookState as the state of the hook function and returns the
%%% updated global state. Intended for use in a hook function, which is
%%% called when the parser has parsed a complete entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
hook_state(HookState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{hook = HookState}}.
216
%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc Stores RulesState as the state of the rules functions and returns the
%%% updated global state. Intended for use in a rules function, which is
%%% called when the parser stores scanner information in a rules database.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
rules_state(RulesState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{rules = RulesState}}.
225
%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc Stores FetchState as the state of the fetch function and returns the
%%% updated global state. Intended for use in a fetch function, which is
%%% called when the parser fetches an external resource (e.g. a DTD).
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
fetch_state(FetchState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{fetch = FetchState}}.
233
%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc Stores ContinuationState as the state of the continuation function
%%% and returns the updated global state. Intended for use in a continuation
%%% function, which is called when the parser encounters the end of the byte
%%% stream.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
cont_state(ContState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{cont = ContState}}.
241
242
%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv file(Filename, [])
file(Filename) ->
    file(Filename, []).
248
%% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest}
%%   Rest = list()
%% @doc Parse file containing an XML document.
%% If the file cannot be read, the <code>{error, Reason}</code> tuple from
%% reading it is returned instead. On success the close function of the
%% final scanner state is called before the result is returned.
file(Filename, Options) ->
    %% The encoding option, if present, is looked up here and handed on
    %% to int_file/3.
    Charset =
        case lists:keysearch(encoding, 1, Options) of
            false -> undefined;
            {value, {_, Enc}} -> Enc
        end,
    case int_file(Filename, Options, Charset) of
        {error, _Reason} = Failure ->
            Failure;
        {Doc, Rest, #xmerl_scanner{close_fun = CloseFun} = Final} ->
            %% Called for its side effects only; the state it returns is
            %% dropped.
            CloseFun(Final),
            {Doc, Rest}
    end.
264
%% Reads the whole file Path and scans its contents as a document, using
%% the directory of Path as XML base. The third argument is ignored.
%% Anything other than {ok, Binary} from file:read_file/1 is returned as is.
int_file(Path, Options, _ExtCharset) ->
    case file:read_file(Path) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            int_string(Text, Options, filename:dirname(Path), Path);
        NotOk ->
            NotOk
    end.
273
%% Like int_file/3, but hands the file contents to int_string_decl/4
%% instead of int_string/4. The third argument is ignored.
%% Anything other than {ok, Binary} from file:read_file/1 is returned as is.
int_file_decl(Path, Options, _ExtCharset) ->
    case file:read_file(Path) of
        {ok, Contents} ->
            Text = binary_to_list(Contents),
            int_string_decl(Text, Options, filename:dirname(Path), Path);
        NotOk ->
            NotOk
    end.
282
%% @spec string(Text::list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv string(Text, [])
string(Text) ->
    string(Text, []).
288
%% @spec string(Text::list(),Options::option_list()) -> {document(),Rest}
%%   Rest = list()
%% @doc Parse string containing an XML document.
%% The close function of the final scanner state is called before the
%% result is returned.
string(Text, Options) ->
    {Doc, Rest, #xmerl_scanner{close_fun = CloseFun} = Final} =
        int_string(Text, Options, file_name_unknown),
    %% Called for its side effects only; the state it returns is dropped.
    CloseFun(Final),
    {Doc, Rest}.
297
%% Scans Text as a document with the current working directory as XML base.
int_string(Text, Options, FileName) ->
    {ok, Cwd} = file:get_cwd(),
    int_string(Text, Options, Cwd, FileName).
301
%% Builds the initial scanner state from Options, XMLBase and FileName,
%% lets xmerl_lib:detect_charset/2 inspect the input, records the resulting
%% encoding in the state and scans the document with scan_document/2.
int_string(Str, Options, XMLBase, FileName) ->
    Scanner = (initial_state0(Options, XMLBase))#xmerl_scanner{filename = FileName},

    %% In case of no encoding attribute in document utf-8 is default, but
    %% another character set may be detected with help of Byte Order Marker
    %% or with help of the encoding of the first 4 bytes.
    {Text, Scanner1} =
        case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
            {Origin, 'iso-10646-utf-1', Converted}
              when Origin =:= auto; Origin =:= external ->
                {Converted, Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Unchanged} ->
                %% no auto detection
                {Unchanged, Scanner};
            {external, ExtCharset, Unchanged} ->
                %% no auto detection, ExtCharset is an explicitly provided
                %% 7 bit, 8 bit or utf-8 encoding
                {Unchanged,
                 Scanner#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_document(Text, Scanner1).
322
%% Same set-up as int_string/4 (initial state, charset detection, encoding
%% recorded in the state), but the input is scanned with scan_decl/2.
int_string_decl(Str, Options, XMLBase, FileName) ->
    Scanner = (initial_state0(Options, XMLBase))#xmerl_scanner{filename = FileName},
    {Text, Scanner1} =
        case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
            {Origin, 'iso-10646-utf-1', Converted}
              when Origin =:= auto; Origin =:= external ->
                {Converted, Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"}};
            {undefined, undefined, Unchanged} ->
                {Unchanged, Scanner};
            {external, ExtCharset, Unchanged} ->
                {Unchanged,
                 Scanner#xmerl_scanner{encoding = atom_to_list(ExtCharset)}}
        end,
    scan_decl(Text, Scanner1).
336
337
338
%% Creates the scanner state: starts from a record holding the default
%% callback functions of this module, the XML base and the shared
%% whitespace data, then applies Options on top of it via initial_state/2.
initial_state0(Options, XMLBase) ->
    Defaults = #xmerl_scanner{
                  xmlbase = XMLBase,
                  common_data = common_data(),
                  event_fun = fun event/2,
                  hook_fun = fun hook/2,
                  acc_fun = fun acc/3,
                  fetch_fun = fun fetch/2,
                  close_fun = fun close/1,
                  continuation_fun = fun cont/3,
                  rules_read_fun = fun rules_read/3,
                  rules_write_fun = fun rules_write/4,
                  rules_delete_fun = fun rules_delete/3
                 },
    initial_state(Options, Defaults).
354
%% Applies the option list to the scanner state, one option per clause,
%% in list order (a later option overrides an earlier one for the same
%% field). Options that carry a callback state store it through the
%% matching *_state/2 setter. An option matching no clause makes the call
%% fail, as there is no catch-all clause. When the list is exhausted and no
%% rules table has been supplied, a new public ETS table is created for
%% the rules.
initial_state([{event_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{event_fun = Fun});
initial_state([{event_fun, Fun, EventState} | Rest], State) ->
    initial_state(Rest,
                  event_state(EventState, State#xmerl_scanner{event_fun = Fun}));
initial_state([{acc_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{acc_fun = Fun});
initial_state([{hook_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{hook_fun = Fun});
initial_state([{hook_fun, Fun, HookState} | Rest], State) ->
    initial_state(Rest,
                  hook_state(HookState, State#xmerl_scanner{hook_fun = Fun}));
initial_state([{close_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{close_fun = Fun});
initial_state([{fetch_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{fetch_fun = Fun});
initial_state([{fetch_fun, Fun, FetchState} | Rest], State) ->
    initial_state(Rest,
                  fetch_state(FetchState, State#xmerl_scanner{fetch_fun = Fun}));
initial_state([{fetch_path, Path} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{fetch_path = Path});
initial_state([{continuation_fun, Fun} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{continuation_fun = Fun});
initial_state([{continuation_fun, Fun, ContState} | Rest], State) ->
    initial_state(Rest,
                  cont_state(ContState,
                             State#xmerl_scanner{continuation_fun = Fun}));
initial_state([{rules, Rules} | Rest], State) ->
    %% A caller-supplied rules table is kept after scanning.
    initial_state(Rest, State#xmerl_scanner{rules = Rules,
                                            keep_rules = true});
initial_state([{rules, ReadFun, WriteFun, RulesState} | Rest], State) ->
    WithFuns = State#xmerl_scanner{rules_read_fun = ReadFun,
                                   rules_write_fun = WriteFun,
                                   keep_rules = true},
    initial_state(Rest, rules_state(RulesState, WithFuns));
initial_state([{user_state, UserState} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{user_state = UserState});
initial_state([{space, Space} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{space = Space});
initial_state([{line, Line} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{line = Line});
initial_state([{namespace_conformant, Flag} | Rest], State)
  when is_boolean(Flag) ->
    initial_state(Rest, State#xmerl_scanner{namespace_conformant = Flag});
initial_state([{validation, Mode} | Rest], State)
  when Mode =:= off; Mode =:= dtd; Mode =:= schema;
       Mode =:= true; Mode =:= false ->
    %% The obsolete boolean values are mapped by validation_value/1.
    initial_state(Rest,
                  State#xmerl_scanner{validation = validation_value(Mode)});
initial_state([{schemaLocation, Locations} | Rest], State)
  when is_list(Locations) ->
    initial_state(Rest, State#xmerl_scanner{schemaLocation = Locations});
initial_state([{quiet, Flag} | Rest], State) when is_boolean(Flag) ->
    initial_state(Rest, State#xmerl_scanner{quiet = Flag});
initial_state([{doctype_DTD, DTD} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{doctype_DTD = DTD});
initial_state([{document, Flag} | Rest], State) when is_boolean(Flag) ->
    initial_state(Rest, State#xmerl_scanner{document = Flag});
initial_state([{comments, Flag} | Rest], State) when is_boolean(Flag) ->
    initial_state(Rest, State#xmerl_scanner{comments = Flag});
initial_state([{default_attrs, Flag} | Rest], State) when is_boolean(Flag) ->
    initial_state(Rest, State#xmerl_scanner{default_attrs = Flag});
initial_state([{text_decl, Flag} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{text_decl = Flag});
initial_state([{environment, Env} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{environment = Env});
initial_state([{xmlbase, Dir} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{xmlbase = Dir});
initial_state([{encoding, Enc} | Rest], State) ->
    initial_state(Rest, State#xmerl_scanner{encoding = Enc});
initial_state([], #xmerl_scanner{rules = undefined} = State) ->
    State#xmerl_scanner{rules = ets:new(rules, [set, public])};
initial_state([], State) ->
    State.
425
%% Maps the obsolete boolean validation flags onto the current values:
%% true becomes dtd and false becomes off. Any other value is returned
%% unchanged.
validation_value(Flag) ->
    case Flag of
        true -> dtd;
        false -> off;
        _ -> Flag
    end.
432
%% Used for compacting (some) indentations.
%% See also fast_accumulate_whitespace().
%% Returns {SpaceIndents, TabIndents, "\n"}: the first two elements are
%% tuples of strings built by comdata/2 from a run of 60 spaces and a run
%% of 15 tabs respectively.
common_data() ->
    Spaces = lists:duplicate(60, $\s),
    Tabs = lists:duplicate(15, $\t),
    {comdata(Spaces, []), comdata(Tabs, []), "\n"}.
439
%% Builds a tuple of indentation strings: every non-empty suffix of the
%% given character list, prefixed with a newline. Each suffix is pushed
%% onto the front of Entries, so the shortest string ends up first.
comdata([_ | Shorter] = Indent, Entries) ->
    comdata(Shorter, [[$\n | Indent] | Entries]);
comdata([], Entries) ->
    list_to_tuple(Entries).
444
445%%% -----------------------------------------------------
446%%% Default modifier functions
447
%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}

%% Default hook function: returns the entity and the state untouched.
hook(Entity, State) ->
    {Entity, State}.
454
%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%%              line  : integer(),
%%%              col   : integer(),
%%%              data}
%%%
%%% Data                Events
%%% document            started, ended
%%% #xmlElement         started, ended
%%% #xmlAttribute       ended
%%% #xmlPI              ended
%%% #xmlComment         ended
%%% #xmlText            ended

%% Default event function: ignores the event and returns the state as is.
event(_Event, State) ->
    State.
471
%% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%%
%% This default implementation merges a text node into the accumulator when
%% the accumulator consists of exactly one text node, and otherwise pushes
%% the new object onto the front of the accumulator.
acc(#xmlText{value = NewText}, [#xmlText{value = OldText} = Prev], State) ->
    Merged = Prev#xmlText{value = OldText ++ NewText},
    {[Merged], State};
acc(Object, Acc, State) ->
    {[Object | Acc], State}.
484
%% Default fetch function: resolves the URI of a system or public
%% identifier with fetch_URI/2. The public identifier itself is not used.
fetch({system, SystemURI}, State) ->
    fetch_URI(SystemURI, State);
fetch({public, _PublicID, SystemURI}, State) ->
    fetch_URI(SystemURI, State).
489
%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found.  The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource)
%%%
%%% The result is {ok, Path, S}, where Path is whatever path_locate/3
%%% returns for the fetch path, the last component of URI and the full name
%%% derived from URI.
fetch_URI(URI, S) ->
    %% assume URI is a filename
    Parts = filename:split(URI),
    Filename =
        case Parts of
            [] -> [];
            _ -> lists:last(Parts)
        end,
    Fullname =
        case Parts of %% how about Windows systems?
            ["file:" | Name] ->
                %% absolute path, see RFC2396 sect 3
                %% file:/dtd_name
                filename:join(["/" | Name]);
            ["/", _ | _] ->
                %% absolute path name
                URI;
            ["http:" | _] ->
                {http, URI};
            [] ->
                %% empty systemliteral
                [];
            _ ->
                %% anything else is taken relative to the XML base
                filename:join(S#xmerl_scanner.xmlbase, URI)
        end,
    Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
    {ok, Path, S}.
517
%% Decides where a resource is to be read from.
%% An empty full name and an {http, URI} full name are returned unchanged.
%% Otherwise the directories are tried in order and the first one that
%% contains a regular file called Name wins; if none does, the full name is
%% used. File results are returned as {file, Path}.
path_locate(_Dirs, _Name, []) ->
    [];
path_locate(_Dirs, _Name, {http, _} = Remote) ->
    Remote;
path_locate([], _Name, FullName) ->
    {file, FullName};
path_locate([Dir | Rest], Name, FullName) ->
    Candidate = filename:join(Dir, Name),
    case file:read_file_info(Candidate) of
        {ok, #file_info{type = regular}} ->
            {file, Candidate};
        _ ->
            path_locate(Rest, Name, FullName)
    end.
532
533
%% Default continuation function: never asks for more input. The first
%% fun (which would resume scanning with more bytes) is ignored and the
%% second fun is applied to the third argument.
cont(_Resume, Exception, State) ->
    Exception(State).
536
%% Default close function: returns the scanner state unchanged.
close(State) ->
    State.
539
540
541%%% -----------------------------------------------------
542%%% Scanner
543
%%% [1] document ::= prolog element Misc*
%%
%% Scans a complete document and returns {Result, Tail, FinalState}.
%%
%% Steps, in order:
%%  1. The event function is called with a 'started' document event.
%%  2. Unless the encoding in the state is "utf-8" or undefined, the input
%%     is converted with xmerl_ucs:to_unicode/2.
%%  3. The prolog, the root element (which must start with "<") and the
%%     trailing Misc items are scanned.
%%  4. The event function is called with an 'ended' document event.
%%  5. Depending on validation_mode/1 applied to the validation field, the
%%     result is validated against a DTD or XML Schema; a validation
%%     failure is reported through ?fatal. cleanup/1 is applied to the
%%     state in every branch.
%%  6. If the document flag is true the result is wrapped in an
%%     #xmlDocument{} whose content is the prolog items, the root element
%%     and the Misc items; otherwise the root element result is returned.
scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
				     line = L, col = C,
				     environment=Env,
				     encoding=Charset,
				     document=Document,
				     validation=ValidateResult}) ->
    S1 = Event(#xmerl_event{event = started,
			    line = L,
			    col = C,
			    data = document}, S),

    %% Transform to given character set.
    %% Note that if another character set is given in the encoding
    %% attribute in a XML declaration that one will be used later
    Str=if
	    Charset == "utf-8" ->
		Str0;
	    Charset =/= undefined -> % Default character set is UTF-8
		xmerl_ucs:to_unicode(Str0, list_to_atom(Charset));
	    true -> %% Charset is undefined if no external input is
                    %% given, and no auto detection of character
                    %% encoding was made.
		Str0
	end,
%%     M1 = erlang:memory(),
%%     ?dbg("Memory status before prolog: ~p~n",[M1]),
    {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, _StartPos = 1),
%%     M2 = erlang:memory(),
%%     ?dbg("Memory status after prolog: ~p~n",[M2]),
    %%?dbg("scan_document 2, prolog parsed~n",[]),
    T2 = scan_mandatory("<", T1, 1, S2, expected_element_start_tag),
%%     M3 = erlang:memory(),
%%     ?dbg("Memory status before element: ~p~n",[M3]),
    {Res, T3, S3} = scan_element(T2,S2,Pos),
%%     M4 = erlang:memory(),
%%     ?dbg("Memory status after element: ~p~n",[M4]),
    {Misc, _Pos1, Tail, S4}=scan_misc(T3, S3, Pos + 1),
%%     M5 = erlang:memory(),
%%     ?dbg("Memory status after misc: ~p~n",[M5]),

    %% The record match asserts that the event function returned a
    %% scanner state.
    S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S4#xmerl_scanner.line,
					       col = S4#xmerl_scanner.col,
					       data = document}, S4),

    {Res2, S6} = case validation_mode(ValidateResult) of
	     off ->
		 {Res, cleanup(S5)};
	     dtd when Env == element; Env == prolog ->
		 check_decl2(S5),
		 case xmerl_validate:validate(S5, Res) of
		     {'EXIT', {error, Reason}} ->
			 S5b = cleanup(S5),
			 ?fatal({failed_validation, Reason}, S5b);
		     {'EXIT', Reason} ->
			 S5b = cleanup(S5),
			 ?fatal({failed_validation, Reason}, S5b);
		     {error, Reason} ->
			 S5b = cleanup(S5),
			 ?fatal({failed_validation, Reason}, S5b);
		     {error, Reason, _Next} ->
			 S5b = cleanup(S5),
			 ?fatal({failed_validation, Reason}, S5b);
		     _XML ->
			 {Res, cleanup(S5)}
		 end;
	     schema ->
		 case schemaLocations(Res, S5) of
		     {ok, Schemas} ->
			 %% NOTE(review): the cleaned-up state is discarded
			 %% here and S5 itself is passed on below - confirm
			 %% that this is intended.
			 _ = cleanup(S5),
			 %%?dbg("Schemas: ~p~nRes: ~p~ninhertih_options(S): ~p~n",
			 %%          [Schemas,Res,inherit_options(S5)]),
			 XSDRes = xmerl_xsd:process_validate(Schemas, Res,
							     inherit_options(S5)),
			 handle_schema_result(XSDRes, S5);
		     _ ->
			 {Res, cleanup(S5)}
		 end;
	     _ ->
		 %% Any other mode, and dtd mode in an environment other
		 %% than element or prolog: no validation.
		 {Res, cleanup(S5)}
	 end,

    Res3 =
	case Document of
	    true ->
		%% Prolog is reversed in front of the root element, Misc is
		%% reversed behind it.
		Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]),
		#xmlDocument{content = Content};
	    false ->
		Res2
	end,
    {Res3, Tail, S6}.
636
637
%% Scans the prolog of Str after signalling a 'started' document event,
%% and returns a 3-tuple whose shape depends on what follows the prolog:
%%  - nothing:            {[], [], State}
%%  - text starting "<":  {{UserState, Rest}, [], State}; Rest is not scanned
%%  - anything else:      Rest is scanned with scan_content/8 and
%%                        {Rest, [], State} is returned with the state from
%%                        that scan
scan_decl(Str, #xmerl_scanner{event_fun = EventFun,
                              line = Line,
                              col = Col} = Scanner0) ->
    Started = #xmerl_event{event = started,
                           line = Line,
                           col = Col,
                           data = document},
    Scanner1 = EventFun(Started, Scanner0),
    case scan_prolog(Str, Scanner1, 1) of
        {_, _, [], Scanner2} ->
            {[], [], Scanner2};
        {_, _, [$< | _] = Rest, Scanner2} ->
            {{Scanner2#xmerl_scanner.user_state, Rest}, [], Scanner2};
        {_, _, Rest, Scanner2} ->
            Space = Scanner2#xmerl_scanner.space,
            {_, _, Scanner3} =
                scan_content(Rest, Scanner2, [], [], Space,
                             [], [], #xmlNamespace{}),
            {Rest, [], Scanner3}
    end.
658
659
%%% [22] Prolog
%%% prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%% scan_prolog(T, S, Pos) scans the prolog at the head of T and returns
%% {Acc, Pos1, Rest, S1}: the accumulated prolog items, the next position,
%% the unconsumed input and the updated scanner state.
%%
%% Clauses of scan_prolog/4:
%%  - empty input: handed to the continuation function, which may resume
%%    scanning with more bytes or finish with the current accumulator;
%%  - "<?xml" followed by whitespace: an XML declaration, or a text
%%    declaration when the text_decl flag is set. It is only accepted at
%%    line 1, column 1; the rest of the input is then converted to the
%%    declared character set where needed;
%%  - "<!DOCTYPE" in the prolog environment: the document type declaration;
%%  - "%" in an external environment: handed to scan_ext_subset/2;
%%  - anything else: Misc items are scanned and scan_prolog2/4 continues.
scan_prolog(T, S, Pos) ->
    scan_prolog(T, S, Pos, []).

scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
scan_prolog("<?xml"++T,
	    S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},
	    Pos,Acc) when ?whitespace(hd(T)) ->
    {Charset, T3, S3} =
    if
	Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
	    ?dbg("prolog(\"<?xml\")~n", []),
	    ?bump_col(5),
	    {_,T1,S1} = mandatory_strip(T,S),
	    {Decl,T2, S2}=scan_text_decl(T1,S1),
	    Encoding=Decl#xmlDecl.encoding,
	    {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
	Col==1,L==1 ->
	    ?dbg("prolog(\"<?xml\")~n", []),
	    ?bump_col(5),
	    {Decl,T2, S2}=scan_xml_decl(T, S),
	    Encoding=Decl#xmlDecl.encoding,
	    {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
	true ->
	    ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
    end,
    %% Charset0 is either (1) 'iso-10646-utf-1' (transformation by
    %% auto detection), (2) undefined (no auto detection and no
    %% external encoding), (3) any other encoding format that must be
    %% conformant to the internal explicitly given encoding. The two
    %% former cases implies that the explicit internal encoding
    %% (Charset) may be different from Charset0.

    %% Now transform to declared character set.
    if
	Charset==Charset0 -> % Document already transformed to this charset!
	    scan_prolog(T3, S3, Pos, Acc);
	Charset0=/=undefined ->
	    %% For example may an external entity
	    %% have the BOM for utf-16 and the internal
	    %% explicit encoding='utf-16', then it will be auto
	    %% detected and transformed, Charset0 will be
	    %% 'iso-10646-utf-1', and Charset will be 'utf-16', all
	    %% legal.
	    %%
	    scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos,Acc);
	Charset == "utf-8" ->
	    scan_prolog(T3, S3, Pos, Acc);
	Charset=/=undefined -> % Document not previously transformed
	    T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)),
	    scan_prolog(T4, S3, Pos, Acc);
	true -> % No encoding info given
	    scan_prolog(T3, S3, Pos, Acc)
    end;
scan_prolog("<!DOCTYPE" ++ T,
	    S0=#xmerl_scanner{environment=prolog},
	    Pos, Acc) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    %% The input is used as is; no character set conversion is done here.
    {T1, S1} = scan_doctype(T, S),
    scan_misc(T1, S1, Pos, Acc);
scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},
	    Pos,Acc) ->
    {T, S1} = scan_ext_subset(Str,S),
    {Acc, Pos, T, S1};
scan_prolog(Str, S0 = #xmerl_scanner{}, Pos, Acc) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Check for Comments, PI before possible DOCTYPE declaration
    ?bump_col(1),
    %% The input is used as is; no character set conversion is done here.
    {Acc1, Pos1, T1, S1}=scan_misc(Str, S, Pos, Acc),
    scan_prolog2(T1,S1,Pos1,Acc1).
749
750
751
%% Second half of the prolog, entered from scan_prolog/4 after the leading
%% Misc items have been scanned. Returns {Acc, Pos, Rest, S} like
%% scan_prolog/4.
%%  - empty input: handed to the continuation function;
%%  - "<!DOCTYPE" in the prolog environment: the document type declaration
%%    is scanned, followed by trailing Misc items;
%%  - other "<!" markup: handed to scan_ext_subset/2;
%%  - anything else: when DTD validation is on and the doctype_DTD option
%%    holds a list, that DTD is fetched and checked first; then remaining
%%    Misc items are scanned.
scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog},
	     Pos, Acc) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    {T1, S1} = scan_doctype(T, S),
    scan_misc(T1, S1, Pos, Acc);
scan_prolog2(Str = "<!" ++ _, S, Pos, Acc) ->
    ?dbg("prolog(\"<!\")~n", []),
    %% In e.g. a DTD, we jump directly to markup declarations
    {T, S1} = scan_ext_subset(Str, S),
    {Acc, Pos, T, S1};
scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos,Acc) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Here we consider the DTD provided by doctype_DTD option,
    S1 =
	case S0 of
	    #xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) ->
		S=fetch_DTD(undefined,S0),
		check_decl(S),
		S;
	    _ -> S0
	end,
    %% Check for more Comments and PI after DOCTYPE declaration
%    ?bump_col(1),
    scan_misc(Str, S1, Pos, Acc).
783
784
785
786
%%% [27] Misc ::=   	Comment | PI | S
%% Note:
%% - Comments (only when the comments flag is true) and PIs are passed to
%%   the accumulator function (acc_fun) together with the accumulator; what
%%   ends up in the accumulator is decided by that function. Whitespace is
%%   stripped and not accumulated.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
%%
%% Returns {Acc, Pos, Rest, S}, where Rest starts at the first input that
%% is not a comment, PI or whitespace. If the accumulator function returns
%% {Acc, S} the position is incremented by one; if it returns
%% {Acc, Pos, S} the returned position is used.
scan_misc(T, S, Pos) ->
    scan_misc(T, S, Pos, []).
scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end,
      fun(S1) -> {Acc, Pos, [], S1} end,
      S);
scan_misc("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F, comments=CF}, Pos, Acc) -> % Comment
    ?bump_col(4),
    {C, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
    case CF of
	true ->
	    {Acc2, Pos2, S3} =
		case F(C, Acc, S1) of
		    {Acc1, S2} ->
			{Acc1, Pos + 1, S2};
		    {Acc1, Pos1, S2} ->
			{Acc1, Pos1, S2}
		end,
	    scan_misc(T1, S3, Pos2, Acc2);
	false ->
	    %% The comment is scanned but not accumulated.
	    scan_misc(T1, S1, Pos, Acc)
    end;
scan_misc("<?" ++ T, S0=#xmerl_scanner{acc_fun = F}, Pos, Acc) -> % PI
    ?dbg("prolog(\"<?\")~n", []),
    ?bump_col(2),
    {PI, T1, S1} = scan_pi(T, S, Pos, []),
    {Acc2, Pos2, S3} = case F(PI, Acc, S1) of
			   {Acc1, S2} ->
			       {Acc1, Pos + 1, S2};
			   {Acc1, Pos1, S2} ->
			       {Acc1, Pos1, S2}
		       end,
    scan_misc(T1,S3,Pos2,Acc2);
scan_misc(T=[H|_T], S, Pos, Acc) when ?whitespace(H) ->
    ?dbg("prolog(whitespace)~n", []),
    {_,T1,S1}=strip(T,S),
    scan_misc(T1,S1,Pos,Acc);
scan_misc(T,S,Pos,Acc) ->
    {Acc,Pos,T,S}.
832
833
%% Releases the rules table of a scanner.
%% When keep_rules is false the ETS table held in the rules field is
%% deleted and the field is reset to undefined. Any other argument is
%% returned unchanged.
cleanup(Scanner) ->
    case Scanner of
	#xmerl_scanner{keep_rules = false, rules = RulesTab} ->
	    ets:delete(RulesTab),
	    Scanner#xmerl_scanner{rules = undefined};
	_ ->
	    Scanner
    end.
840
%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%%
%% Scans the mandatory VersionInfo part of an XML declaration and then
%% continues with scan_xml_decl/3, passing an #xmlDecl{} that carries the
%% version and a matching version attribute. A missing "version" keyword
%% is reported through ?fatal with expected_version_attribute.
scan_xml_decl(T, S) ->
    {_, AfterSpace, S1} = mandatory_strip(T, S),
    AfterKeyword =
	case AfterSpace of
	    "version" ++ Tail ->
		Tail;
	    _ ->
		?fatal(expected_version_attribute,S1)
	end,
    %% the keyword "version" is seven characters wide
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 7},
    {AfterEq, S3} = scan_eq(AfterKeyword, S2),
    {Vsn, AfterVsn, S4} = scan_xml_vsn(AfterEq, S3),
    VersionAttr = #xmlAttribute{name = version,
				parents = [{xml, 1}],
				value = Vsn},
    InitialDecl = #xmlDecl{vsn = Vsn, attributes = [VersionAttr]},
    scan_xml_decl(AfterVsn, S4, InitialDecl).
860
%% scan_xml_decl/3: continues an XML declaration after VersionInfo.
%% Decl is the #xmlDecl{} built so far.
%% - "?>" ends the declaration (see return_xml_decl/3).
%% - Whitespace must precede any further attribute; it is stripped and
%%   scanning continues in scan_xml_decl2/3.
%% - Anything else is a fatal error.
scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
    ?dbg("cont()...~n", []),
    %% Input exhausted: the continuation fun gets a fun that resumes with
    %% more bytes and a fun that yields {[], [], S1}.
    F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_xml_decl("?>" ++ T, S0, Decl) ->
    %% ?bump_col is defined outside this section; it presumably binds S
    %% to S0 with col advanced by 2.
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl(T,S=#xmerl_scanner{event_fun = _Event},Decl) when ?whitespace(hd(T)) ->
    {_,T1,S1}=mandatory_strip(T,S),
    scan_xml_decl2(T1,S1,Decl);
scan_xml_decl(_T,S=#xmerl_scanner{event_fun = _Event},_Decl) ->
    ?fatal(preformat([expected,one,'of:'],['?>',whitespace_character],","),S).
874
%% scan_xml_decl2/3: scans what may follow VersionInfo plus whitespace in
%% an XML declaration: the end marker "?>", an EncodingDecl, or (via
%% scan_xml_decl3/3) a standalone declaration. Any other input is fatal.
scan_xml_decl2("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
	      Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [80] EncodingDecl
    ?bump_col(8),
    {T1, S1} = scan_eq(T, S),
    {EncName, T2, S2} = scan_enc_name(T1, S1),
    %% The encoding name is stored lower-cased, both in the declaration
    %% and in the attribute.
    LowEncName=xmerl_lib:to_lower(EncName),
    Attr = #xmlAttribute{name = encoding,
			 parents = [{xml, _XMLPos = 1}],
			 value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
			 attributes = [Attr|Attrs]},
    %% The event carries the position where the "encoding" keyword
    %% started (S0), not the current position.
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S0#xmerl_scanner.line,
					       col = S0#xmerl_scanner.col,
					       data = Attr}, S2),
    case T2 of
	"?>" ++ _T3 ->
	    %% No whitespace needed directly before the end marker.
	    scan_xml_decl3(T2,S3,Decl);
	_ ->
	    %% Otherwise whitespace must separate the next attribute.
	    {_,T3,S4} = mandatory_strip(T2,S3),
	    scan_xml_decl3(T3, S4, Decl)
    end;
scan_xml_decl2(T="standalone" ++ _T,S,Decl) ->
    scan_xml_decl3(T,S,Decl);
scan_xml_decl2(_BadString,S,_Decl) ->
        ?fatal(preformat([expected,one,'of:'],['?>',standalone,encoding],","),S).
%    ?fatal(lists:flatten(io_lib:format("~s ~s ~s: ~s, ~s, ~s",[expected,one,'of','?>',standalone,encoding])),S).
%    ?fatal({expected_one_of,"?>",standalone,encoding},S).
907
%% scan_xml_decl3/3: last stage of an XML declaration. Accepts either the
%% end marker "?>" or a standalone declaration followed by optional
%% whitespace and "?>". There is no clause for other input.
scan_xml_decl3("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
	      Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [32] SDDecl
    ?bump_col(10),
    {T1, S1} = scan_eq(T, S),
    {StValue,T2,S2}=scan_standalone_value(T1,S1),
    Attr = #xmlAttribute{name = standalone,
			 parents = [{xml, _XMLPos = 1}],
			 value = StValue},
    Decl = Decl0#xmlDecl{standalone = StValue,
			 attributes = [Attr|Attrs]},
    %% The event carries the position where the "standalone" keyword
    %% started (S0).
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S0#xmerl_scanner.line,
					       col = S0#xmerl_scanner.col,
					       data = Attr}, S2),
    {_,T3,S4} = strip(T2,S3),
    %% scan_mandatory/5 returns only the remaining input, so the two
    %% columns of "?>" are added to the scanner state explicitly below.
    T4 = scan_mandatory("?>",T3,2,S4,expected_xml_decl_endtag),
%%    "?>" ++ T4 = T3,
    return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl).
930
931
%% return_xml_decl/3: finishes an XML declaration once "?>" is consumed.
%% Strips whitespace after the declaration, puts the collected attributes
%% back into document order, emits an 'ended' event for the declaration
%% and returns {Decl, Rest, Scanner}. The hook_fun is matched but not
%% called (see the commented-out lines).
return_xml_decl(T,S=#xmerl_scanner{hook_fun = _Hook,
				   event_fun = Event},
		Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% ?strip1 is defined outside this section; it binds T1 and S1, which
    %% are used below (presumably {_,T1,S1} = strip(T,S)).
    ?strip1,
    %% Attributes were accumulated by prepending, hence the reverse.
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S#xmerl_scanner.line,
					       col = S#xmerl_scanner.col,
					       data = Decl}, S1),
%%    {Ret, S3} = Hook(Decl, S2),
%%    {Ret, T1, S3}.
    {Decl, T1, S2}.
944
945
%% scan_standalone_value/2: scans the quoted value of a standalone
%% declaration and returns {yes | no, Rest, Scanner}.
%% A value of yes is also recorded in the scanner's standalone field; a
%% value of no leaves that field as it was. Only the literal values
%% 'yes', "yes", 'no' and "no" are matched; there is no clause for other
%% input or for an empty input list.
scan_standalone_value("'yes'" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("\"yes\"" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("'no'" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S};
scan_standalone_value("\"no\"" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S}.
958
%%%
%%% Text declaration XML 1.0 section 4.3.1
%%% [77] TextDecl  ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
%%
%% scan_text_decl/2: scans the optional version and the mandatory
%% encoding of a text declaration, then lets scan_text_decl/3 consume the
%% closing "?>". A missing "encoding" keyword is a fatal error.
scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
    {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
    T2 =
	case T1 of
	    "encoding" ++ _T2 -> _T2;
	    _ ->
		?fatal(expected_encoding_attribute,S1)
        end,
    %% account for the eight characters of "encoding"
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
    {T3, S3} = scan_eq(T2, S2),
    {EncName, T4, S4} = scan_enc_name(T3, S3),
    %% The encoding name is stored lower-cased.
    LowEncName=xmerl_lib:to_lower(EncName),
    %% ?strip5 is defined outside this section; it binds T5 and S5, which
    %% are used below (presumably by stripping whitespace from T4/S4).
    ?strip5,
    Attr = #xmlAttribute{name = encoding,
			 parents = [{xml,1}],
			 value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
 			 attributes = [Attr|Attrs]},
    S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
					     line = S5#xmerl_scanner.line,
					     col = S5#xmerl_scanner.col,
					     data = Attr}, S5),
    scan_text_decl(T5,S6,Decl).
985
%% scan_text_decl/3: expects the closing "?>" of a text declaration.
%% On success the attributes are put back into document order, an 'ended'
%% event is emitted for the declaration and {Decl, Rest, Scanner} is
%% returned. Any other leading character is a fatal error. There is no
%% clause for an empty input list.
scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = _Hook,
					   event_fun = Event},
	       Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?bump_col(2),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    %% The event position is taken from S0, i.e. before "?>".
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = S0#xmerl_scanner.line,
					       col = S0#xmerl_scanner.col,
					       data = Decl}, S1),
%%     {Ret, S3} = Hook(Decl, S2),
%%     {Ret, T1, S3};
    {Decl, T1, S2};
scan_text_decl([H|_T],S,_) ->
    ?fatal({unexpected_character_in_text_declaration,H},S).
1001
%% scan_optional_version/2: scans the optional VersionInfo of a text
%% declaration. Returns {#xmlDecl{}, Rest, Scanner}; the declaration
%% holds a single version attribute when "version" was present and no
%% attributes otherwise. Whitespace is mandatory after the version value.
%% Note that the vsn field of the #xmlDecl{} is not set here.
scan_optional_version("version"++T,S0) ->
    ?bump_col(7),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    {T2, S2} = scan_eq(T1, S1),
    {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
    {#xmlDecl{attributes=[Attr]},T4,S4};
scan_optional_version(T,S) ->
    {#xmlDecl{attributes=[]},T,S}.
1012
1013
1014
%%%%%%% [81] EncName
%% [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'")
%%
%% scan_enc_name/2: consumes the opening quote of an encoding name and
%% lets scan_enc_name/4 scan the name up to the matching closing quote.
%% Returns whatever scan_enc_name/4 returns.
%%
%% Only a single or double quote is accepted as delimiter. The previous
%% guard (H >= $"; H =< $') was a disjunction that held for every
%% character, so an unquoted value such as encoding=UTF-8 had its first
%% character taken as the delimiter. Any other character is now reported
%% as a fatal error.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name([H|T], S0) when H =:= $"; H =:= $' ->
    ?bump_col(1),
    scan_enc_name(T, S, H, []);
scan_enc_name([H|_T], S) ->
    ?fatal({expected_encoding_name_delimiter,H},S).
1024
1025
%% scan_enc_name/4: scans the first character of an encoding name, which
%% must be a letter [A-Za-z]; the rest is handled by scan_enc_name2/4.
%% Delim is the opening quote character and Acc collects the name in
%% reverse. Any other first character is a fatal error.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|_T],S,_Delim,_Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).
1039
%% scan_enc_name2/4: scans the remainder of an encoding name,
%% [A-Za-z0-9._-]*, up to the closing delimiter Delim.
%% Returns {EncName, Rest, Scanner} with the name in document order.
%% A character that is neither the delimiter nor legal in an encoding
%% name is reported as a fatal error with the same reason that
%% scan_enc_name/4 uses for an illegal first character (previously no
%% clause matched and the scanner crashed with function_clause).
scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
      fatal_fun(expected_encoding_name),
      S);
scan_enc_name2([H|T], S0, H, Acc) ->
    %% closing delimiter: the name is complete
    ?bump_col(1),
    {lists:reverse(Acc), T, S};
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $a, H =< $z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|T], S0, Delim, Acc) when H == $.; H == $_; H == $- ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|_T],S,_Delim,_Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).
1060
1061
%%%%%%% [26] VersionNum
%%% VersionNum    ::=    ([a-zA-Z0-9_.:] | '-')+
%%
%% scan_xml_vsn/2: consumes the opening quote (single or double) of a
%% version number, advances the column by one and lets xml_vsn/4 scan up
%% to the matching closing quote.
scan_xml_vsn([], Scanner=#xmerl_scanner{continuation_fun = Continue}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
    Continue(Resume, fatal_fun(unexpected_end), Scanner);
scan_xml_vsn([Quote|Rest], Scanner) when Quote == $"; Quote == $' ->
    NextCol = Scanner#xmerl_scanner.col + 1,
    xml_vsn(Rest, Scanner#xmerl_scanner{col = NextCol}, Quote, []).
1071
%% xml_vsn/4: collects the characters of a version number up to the
%% closing delimiter Delim and returns {VersionNum, Rest, Scanner}.
%% Letters, digits and the characters "_", ".", ":" and "-" are accepted,
%% each advancing the column by one; any other character is a fatal
%% error {invalid_vsn_char, Char}.
xml_vsn([], Scanner=#xmerl_scanner{continuation_fun = Continue}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
    Continue(Resume, fatal_fun(unexpected_end), Scanner);
xml_vsn([Delim|Rest], Scanner=#xmerl_scanner{col = Col}, Delim, Acc) ->
    %% closing quote: hand back the collected number in document order
    {lists:reverse(Acc), Rest, Scanner#xmerl_scanner{col = Col+1}};
xml_vsn([Ch|Rest], Scanner=#xmerl_scanner{col = Col}, Delim, Acc)
  when Ch >= $a, Ch =< $z;
       Ch >= $A, Ch =< $Z;
       Ch >= $0, Ch =< $9;
       Ch =:= $_; Ch =:= $.; Ch =:= $:; Ch =:= $- ->
    xml_vsn(Rest, Scanner#xmerl_scanner{col = Col+1}, Delim, [Ch|Acc]);
xml_vsn([Ch|_Rest], Scanner=#xmerl_scanner{}, _Delim, _Acc) ->
    ?fatal({invalid_vsn_char, Ch}, Scanner).
1092
%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
%%
%% scan_pi/4: scans the target of a processing instruction; the leading
%% "<?" has already been consumed by the caller. Pos is the position
%% given to the resulting #xmlPI{} and Ps its parents. The remainder of
%% the PI is scanned by scan_pi/8.
%%
%% Names beginning with [xX][mM][lL] are reserved for future use, so such
%% targets are only accepted if scan_wellknown_pi/4 recognizes them.
%%
%% The column is advanced for the three characters of "xml" only when
%% they are actually consumed here. Previously the column was advanced
%% before the check, so a target that merely started with x or X was
%% passed to scan_name/2 in full with a column already three too large.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos, Ps) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos, Ps) end,
      fatal_fun(unexpected_end),
      S);
scan_pi([H1,H2,H3 | T], S0=#xmerl_scanner{}, Pos, Ps)
  when (H1==$x orelse H1==$X),
       (H2==$m orelse H2==$M),
       (H3==$l orelse H3==$L) ->
    ?bump_col(3),
    scan_wellknown_pi(T,S,Pos,Ps);
scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos, Ps) ->
    %% L and C record where the target starts; they are used for the
    %% event emitted when the PI ends.
    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
    scan_pi(T1, S1, Target, L, C, Pos, Ps, []).
1115
1116
%%% More info on xml-stylesheet can be found at:
%%%   "Associating Style Sheets with XML documents", Version 1.0,
%%%   W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
%%
%% scan_wellknown_pi/4: handles processing instructions whose target
%% starts with the reserved prefix "xml". The input is what follows that
%% prefix. Only "xml-stylesheet" is accepted; any other target is a
%% fatal error quoting at most the first ten characters of the input.
%%
%% Only the eleven characters of "-stylesheet" are consumed here, so the
%% column is advanced by 11. The previous value 16 is the length of
%% "<?xml-stylesheet", but scan_pi/4 has already advanced the column for
%% "xml" before calling this function, and its callers for "<?".
%% NOTE(review): L and C are read after scan_pi/4 advanced past "xml", so
%% the event position for this PI points three columns after the start
%% of the target - confirm whether that is intended.
scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos,Ps) ->
    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
    ?bump_col(11),
    scan_pi(T, S, "xml-stylesheet",L,C,Pos,Ps,[]);
scan_wellknown_pi(Str,S,_Pos,_Ps) ->
    ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).
1126
1127
1128
%% scan_pi/8: scans what follows the target of a processing instruction.
%% Target is the PI name, L and C the line and column reported in the
%% 'ended' event, Pos and Ps the position and parents stored in the
%% #xmlPI{}, and Acc the value collected so far (in reverse).
%% - "?>" ends the PI: the #xmlPI{} is built, the event fun and then the
%%   hook fun are called, and {HookResult, Rest, Scanner} is returned.
%% - Whitespace separates the target from the PI content, which is
%%   scanned by scan_pi2/8.
%% - Any other character is a fatal error.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,
	L, C, Pos, Ps, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target,
				    L, C, Pos, Ps, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
				       event_fun = Event},
	Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
		parents = Ps,
		pos = Pos,
		value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = L,
					       col = C,
					       data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi([H|T], S, Target, L, C, Pos, Ps, Acc) when ?whitespace(H) ->
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    scan_pi2(T1, S1, Target, L, C, Pos, Ps, Acc);
scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Ps, _Acc) ->
    ?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S).
1155
%% scan_pi2/8: scans the content of a processing instruction up to the
%% closing "?>". Arguments are as for scan_pi/8. Each character is
%% passed through wfc_legal_char/2 and prepended to Acc; at "?>" the
%% #xmlPI{} is built with the value in document order, the event fun and
%% then the hook fun are called, and {HookResult, Rest, Scanner} is
%% returned.
scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,
	 L, C, Pos, Ps, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target,
				     L, C, Pos, Ps, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
				       event_fun = Event},
	 Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
		parents = Ps,
		pos = Pos,
		value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = L,
					       col = C,
					       data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi2(Str, S0, Target, L, C, Pos, Ps, Acc) ->
    ?bump_col(1),
    %% wfc_legal_char/2 (defined elsewhere) returns the next character
    %% and the remaining input.
    {Ch,T} = wfc_legal_char(Str,S),
    scan_pi2(T, S, Target, L, C, Pos, Ps, [Ch|Acc]).
1181
1182
1183
%% [28] doctypedecl ::=
%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%%
%% scan_doctype/2: scans the mandatory whitespace and the document type
%% name of a DOCTYPE declaration, stores the name in the scanner's
%% doctype_name field and continues with scan_doctype1/2. The input is
%% expected to start right after the "<!DOCTYPE" keyword (consumed by
%% the caller, which is outside this section).
scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype(T, S) ->
    {_,T1,S1} = mandatory_strip(T,S),
    {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    %% ?strip3 binds T3 and S3 (macro defined outside this section).
    ?strip3,
    scan_doctype1(T3, S3#xmerl_scanner{doctype_name =  DTName}).
1196
1197
%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
%%		     | 'PUBLIC' S PubidLiteral S SystemLiteral
%%
%% scan_doctype1/2: scans the optional ExternalID of a DOCTYPE
%% declaration and continues with scan_doctype2/3, passing
%% {public, PubidLiteral, SystemLiteral}, {system, SystemLiteral} or
%% undefined when no external identifier is present.
scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    %% ?strip5 binds T5 and S5 (macro defined outside this section).
    ?strip5,
    scan_doctype2(T5, S5, {public, PIDL, SL});
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    %% ?strip3 binds T3 and S3 (macro defined outside this section).
    ?strip3,
    scan_doctype2(T3, S3, {system, SL});
scan_doctype1(T, S) ->
    scan_doctype2(T, S, undefined).
1221
1222
%% scan_doctype2/3: scans what follows the optional ExternalID of a
%% DOCTYPE declaration. DTD is the external identifier found by
%% scan_doctype1/2 (or undefined).
%% - "[" starts the internal subset, scanned by scan_doctype3/3.
%% - ">" ends the declaration: the external DTD is fetched, the
%%   declarations are checked and {Rest, Scanner} is returned.
%% - Anything else is a fatal error.
scan_doctype2([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype2("[" ++ T, S0, DTD) ->
    ?bump_col(1),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2(">" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    {T1, S2};
scan_doctype2(_T,S,_DTD) ->
    ?fatal(expected_end_of_DOCTYPE_declaration, S).
1240
%% [28a] DeclSep   ::= PEReference | S
%% [28b] intSubset ::= (markupdecl | DeclSep)*
%%
%% scan_doctype3/3: scans the internal subset of a DOCTYPE declaration.
%% DTD is the external identifier of the declaration (or undefined); it
%% is fetched only after the internal subset has ended.
%% - "%" is a parameter entity reference. If it expands to an external
%%   identifier, that DTD is fetched and checked; if it expands to text,
%%   the text is put in front of the remaining input.
%% - "]" ends the subset: the external DTD is fetched, declarations are
%%   checked and the closing ">" is required. Returns {Rest, Scanner}.
%% - Everything else is scanned as a markup declaration.
scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end,
      fatal_fun(unexpected_end),
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    %% ?strip2 binds T2 and S2 (macro defined outside this section).
    ?strip2,
    case expand_pe_reference(PERefName, S2,as_PE) of
	{system, _} = Name ->
	    S3 = fetch_DTD(Name, S2),
	    check_decl(S3),
	    scan_doctype3(T2, S3, DTD);
	{public, _} = Name ->
	    S3 = fetch_DTD(Name, S2),
	    check_decl(S3),
	    scan_doctype3(T2, S3, DTD);
	{public, _, _} = Name ->
	    S3 = fetch_DTD(Name, S2),
	    check_decl(S3),
	    scan_doctype3(T2, S3, DTD);
	ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
	    {_,T3,S3} = strip(ExpRef++T2,S2),
	    scan_doctype3(T3,S3,DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    %% NOTE(review): unlike scan_xml_decl3/3, the column is not advanced
    %% for the ">" consumed by scan_mandatory/5 - confirm.
    T2 = scan_mandatory(">",T1,1,S2,expected_doctype_end_tag),
%%    ">" ++ T2 = T1,
    {T2, S2};
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).
1280
1281
1282
%% fetch_DTD/2: fetches and parses the DTD identified by the first
%% argument and returns the resulting scanner state.
%% - No identifier but a DTD location given through the doctype_DTD
%%   field (a list): that location is fetched as a system identifier and
%%   the field is marked option_provided.
%% - No identifier otherwise: the scanner is returned unchanged.
%% - Any other identifier is handed to fetch_and_parse/3. If that yields
%%   a scanner record it is returned; if it yields a 3-tuple the old
%%   scanner is kept (the result is then in the rules table).
%% (A former clause that skipped fetching when validation was false is
%% disabled.)
fetch_DTD(undefined, #xmerl_scanner{doctype_DTD = URI} = S) when is_list(URI) ->
    %% allows a DTD to be named when it is not available in the xml stream
    Marked = S#xmerl_scanner{doctype_DTD = option_provided},
    fetch_DTD({system, URI}, Marked);
fetch_DTD(undefined, S) ->
    S;
fetch_DTD(DTDSpec, S) ->
    ParseOptions = [{text_decl,true},
		    {environment,{external,subset}}],
    case fetch_and_parse(DTDSpec, S, ParseOptions) of
	#xmerl_scanner{} = Fetched ->
	    Fetched;
	{_Res,_Tail,_Sx} ->
	    S
    end.
1298
%% fetch_and_parse/3: calls the scanner's fetch_fun for the external
%% identifier ExtSpec and, if data came back, parses it with a scanner
%% set up from Options0 plus the callbacks and settings of the scanner
%% state returned by the fetch fun.
%%
%% Results of the fetch fun:
%% - {ok, NewS} or {ok, not_fetched, NewS}: nothing to parse, NewS is
%%   the result.
%% - {ok, {file, F}, NewS} / {ok, {string, String}, NewS}: parsed through
%%   int_file_decl/3 resp. int_string_decl/4 (defined elsewhere).
%% - {ok, Other, NewS}: {Other, [], NewS} is the result.
%% - anything else: fatal error error_fetching_DTD.
%%
%% If the result is a scanner record, its text_decl field is reset to
%% false and its environment field is restored from the calling scanner
%% S; other results are returned as they are.
fetch_and_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch,
					 rules=Rules,
					 xmlbase = XMLBase},
		Options0) ->
    RetS =
    case Fetch(ExtSpec, S) of
	{ok, NewS} ->
	    %% For backward compatibility only. This will be removed later!!
	    NewS;
	{ok, not_fetched,NewS} ->
	    NewS;
	{ok, DataRet, NewS = #xmerl_scanner{
			fetch_path=FetchPath,
			user_state = UState,
			event_fun = Event,
			hook_fun = Hook,
			fetch_fun = Fetch1,
			close_fun = Close1,
			continuation_fun = Cont,
			acc_fun = Acc,
			rules_read_fun = Read,
			rules_write_fun = Write,
			validation = Valid,
			quiet = Quiet,
			encoding = Charset
		       }} ->
	    %% The callback states are read through accessor functions
	    %% defined elsewhere in the module.
	    EvS = event_state(NewS),
	    HoS = hook_state(NewS),
	    FeS = fetch_state(NewS),
	    CoS = cont_state(NewS),
	    %% The rules table of the calling scanner (Rules) is passed on,
	    %% together with the rules read/write funs of NewS.
	    Options = Options0++[{fetch_path,FetchPath},
				 {user_state, UState},
				 {rules, Rules},
				 {event_fun, Event, EvS},
				 {hook_fun, Hook, HoS},
				 {fetch_fun, Fetch1, FeS},
				 {close_fun, Close1},
				 {continuation_fun, Cont, CoS},
				 {rules, Read, Write, ""},
				 {acc_fun, Acc},
				 {validation,Valid},
				 {quiet,Quiet},
				 {encoding,Charset}],

	    case DataRet of
		{file, F} ->
		    int_file_decl(F, Options,Charset);
		{string, String} ->
		    int_string_decl(String, Options,XMLBase,file_name_unknown);
		 _ ->
		    %% other scheme
		    {DataRet,[],NewS}
	    end;
	Error ->
	    ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
    end,
    case RetS of
	#xmerl_scanner{} ->
	    RetS#xmerl_scanner{text_decl=false,
			       environment=S#xmerl_scanner.environment};
	_ -> RetS
    end.
1361
1362
%% fetch_not_parse/2: fetches an external resource through the scanner's
%% fetch_fun without parsing it. Returns {Data, Scanner}, where Scanner
%% is the state returned by the fetch fun with its filename field set to
%% the location of the data.
%% - {file, F}: the file contents as a list; location is F.
%% - {string, Str}: the binary Str as a list; location is
%%   file_name_unknown.
%% - {http, URI}: the tuple itself; location is URI.
%% - any other data must be a {Data, Location} pair.
%% A not_fetched answer, or any other answer of the fetch fun, is a
%% fatal error.
fetch_not_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch}) ->
    case Fetch(ExtSpec,S) of
	{ok, not_fetched, _Unused} ->
	    ?fatal({error_fetching_external_source,ExtSpec},S);
	{ok, {file,Path}, Fetched} ->
	    Contents = get_file(Path,S),
	    {Contents, Fetched#xmerl_scanner{filename=Path}};
	{ok, {string,Bin}, Fetched} ->
	    Text = binary_to_list(Bin),
	    {Text, Fetched#xmerl_scanner{filename=file_name_unknown}};
	{ok, {http,URI} = Http, Fetched} ->
	    {Http, Fetched#xmerl_scanner{filename=URI}};
	{ok, Other, Fetched} ->
	    {Data,Location} = Other,
	    {Data, Fetched#xmerl_scanner{filename=Location}};
	_ ->
	    ?fatal({error_fetching_external_resource,ExtSpec},S)
    end.
1382
%% get_file/2: reads the whole file FileName and returns its contents as
%% a list of bytes. A failed read is a fatal error
%% {error_reading_file, FileName, Reason}.
get_file(FileName,S) ->
    case file:read_file(FileName) of
	{ok,Contents} ->
	    binary_to_list(Contents);
	Failure ->
	    ?fatal({error_reading_file,FileName,Failure},S)
    end.
%% check_decl/1
%% It is legal to reference some xml types before they are declared, so
%% once declarations have been read it is necessary to check that all
%% referenced types are declared. The check covers notations, elements
%% (including their attribute definitions) and entities, and is only
%% made when validation is dtd; otherwise ok is returned directly.
check_decl(#xmerl_scanner{validation=dtd,rules=Rules} = Scanner) ->
    check_notations(Rules,Scanner),
    check_elements(Rules,Scanner),
    check_entities(Rules,Scanner);
check_decl(#xmerl_scanner{}) ->
    ok.
1401
%% check_notations/2: looks in the rules table for notations that are
%% still marked undeclared. Returns ok when there is none; otherwise a
%% fatal error error_missing_declaration_in_DTD is raised, naming the
%% notation if exactly one is missing and carrying the whole match
%% result if several are.
check_notations(Tab,S) ->
    Undeclared = ets:match(Tab,{{notation,'$1'},undeclared}),
    case Undeclared of
	[] ->
	    ok;
	[[]] ->
	    ok;
	[[Name|_]] ->
	    ?fatal({error_missing_declaration_in_DTD,Name},S);
	_ ->
	    ?fatal({error_missing_declaration_in_DTD,Undeclared},S)
    end.
1411
%% check_elements/2: walks all element definitions in the rules table, in
%% chunks of ten, and checks each of them with check_elements2/2.
%% Returns ok. If the initial table lookup fails, a fatal error
%% error_missing_declaration_in_DTD carrying the failure is raised.
%%
%% The {'EXIT',_} clause must come first: the result of a failed
%% `catch` expression is itself a 2-tuple and would otherwise be taken
%% for a {Match, Continuation} pair, crashing in lists:foreach/2 instead
%% of reaching the error branch.
check_elements(Tab,S) ->
    case catch ets:match(Tab,{{elem_def,'_'},'$2'},10) of
	{'EXIT',_}=Err ->
	    ?fatal({error_missing_declaration_in_DTD,Err},S);
	{_,_}=M ->
	    %% The fun takes itself as second argument so that it can
	    %% recurse over the continuation.
	    Fun = fun({Match,'$end_of_table'},_F) ->
			  lists:foreach(fun(X)->check_elements2(X,S) end,
					Match),
			  ok;
		     ('$end_of_table',_) ->
			  ok;
		     ({Match,Cont},F) ->
			  lists:foreach(fun(X)->check_elements2(X,S) end,
					Match),
			  F(ets:match(Cont),F)
		  end,
	    Fun(M,Fun);
	'$end_of_table' -> ok;
	Err -> ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.
1430
%% check_elements2/2: checks the attribute definitions of one match from
%% the rules table. Matches that are not a single #xmlElement{} are
%% accepted as they are: it is not an error to declare attributes for an
%% element that is not declared.
check_elements2(Match,S) ->
    case Match of
	[#xmlElement{attributes=AttDefs}] ->
	    check_attributes(AttDefs,S);
	_ ->
	    ok
    end.
1437
%% check_attributes/2: validates a list of attribute definitions, each a
%% 5-tuple whose second element is the attribute type. Returns ok.
%% - 'ID': a second 'ID' attribute later in the list is a fatal error
%%   error_more_than_one_ID_def; the default is then checked by
%%   vc_ID_Attribute_Default/2.
%% - {enumeration, _}: checked by vc_Enumeration/2.
%% - 'ENTITY' / 'ENTITIES': checked by vc_Entity_Name/2.
%% - anything else is accepted without checks.
check_attributes([],_S) ->
    ok;
check_attributes([AttDef|Remaining],S) ->
    case AttDef of
	{IdName,'ID',_,_,_} ->
	    case lists:keyfind('ID',2,Remaining) of
		false ->
		    ok;
		OtherId ->
		    ?fatal({error_more_than_one_ID_def,IdName,element(1,OtherId)},S)
	    end,
	    vc_ID_Attribute_Default(AttDef,S);
	{_,{enumeration,_},_,_,_} ->
	    vc_Enumeration(AttDef,S);
	{_,Type,_,_,_} when Type=='ENTITY';Type=='ENTITIES' ->
	    vc_Entity_Name(AttDef,S);
	_ ->
	    ok
    end,
    check_attributes(Remaining,S).
1458
%% check_entities/2: when validation is dtd, looks in the rules table for
%% entities that are still marked undeclared. Returns ok when there is
%% none (or when validation is anything else); otherwise a fatal error
%% error_missing_declaration_in_DTD is raised, naming the entity if
%% exactly one is missing and carrying the whole match result if several
%% are.
check_entities(Tab,S) ->
    case S of
	#xmerl_scanner{validation=dtd} ->
	    Undeclared = ets:match(Tab,{{entity,'$1'},undeclared}),
	    case Undeclared of
		[] ->
		    ok;
		[[]] ->
		    ok;
		[[Name|_]] ->
		    ?fatal({error_missing_declaration_in_DTD,Name},S);
		_ ->
		    ?fatal({error_missing_declaration_in_DTD,Undeclared},S)
	    end;
	_ ->
	    ok
    end.
1470
1471
%% check_decl2/1: checks that all referenced ID attributes are declared,
%% using the rules table of the scanner.
check_decl2(#xmerl_scanner{} = Scanner) ->
    check_referenced_ids(Scanner#xmerl_scanner.rules,Scanner).


%% check_referenced_ids/2: looks in the rules table for ids that are
%% still marked undeclared. Returns ok when there is none; otherwise a
%% fatal error error_missing_declaration_in_DTD is raised, naming the id
%% if exactly one is missing and carrying the whole match result if
%% several are.
check_referenced_ids(Tab,S) ->
    Undeclared = ets:match(Tab,{{id,'$1'},undeclared}),
    case Undeclared of
	[] ->
	    ok;
	[[]] ->
	    ok;
	[[Id|_]] ->
	    ?fatal({error_missing_declaration_in_DTD,Id},S);
	_ ->
	    ?fatal({error_missing_declaration_in_DTD,Undeclared},S)
    end.
1486
%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl
%%
%% scan_ext_subset/2: scans the declarations of an external subset until
%% the input and the continuation fun are exhausted.
%% - "%" is a parameter entity reference between declarations, handled by
%%   scan_decl_sep/2.
%% - "<![" starts a conditional section.
%% - Whitespace is skipped.
%% - Everything else is scanned as a markup declaration.
%% The only non-recursive exit visible here is the end-of-input fun given
%% to the continuation fun, which yields {[], Scanner}.
scan_ext_subset([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end,
      fun(S1) -> {[], S1} end,
      S);
scan_ext_subset("%" ++ T, S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {T1,S1} = scan_decl_sep(T,S),
    scan_ext_subset(T1, S1);
scan_ext_subset("<![" ++ T, S0) ->
    ?bump_col(3),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    scan_ext_subset(T2,S2);
scan_ext_subset(T, S) when ?whitespace(hd(T)) ->
    {_,T1,S1} = strip(T,S),
    scan_ext_subset(T1, S1);
scan_ext_subset(T, S) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_ext_subset(T1, S1).
1512
1513
%%%%%%% [28a] DeclSep ::= PEReference | S
%%
%% scan_decl_sep/2: handles a parameter entity reference that appears
%% between declarations. The reference is expanded; if the expansion is
%% a tuple (an external identifier such as {system,URI} or {public,URI})
%% the resource is fetched without being parsed. The replacement text is
%% then stripped of leading whitespace and scanned as an external subset
%% of its own. Returns {Rest, Scanner}, where Rest is the input that
%% follows the reference. The scanner state returned by the fetch is
%% discarded.
scan_decl_sep(T,S) ->
    {PERefName, Rest, S1} = scan_pe_reference(T, S),
    Replacement =
	case expand_pe_reference(PERefName,S1,as_PE) of
	    ExtId when is_tuple(ExtId) ->
		{Fetched,_Sx} = fetch_not_parse(ExtId,S1),
		Fetched;
	    Text ->
		Text
	end,
    {_,Stripped,S2} = strip(Replacement,S1),
    {_,S3} = scan_ext_subset(Stripped,S2),
    {Rest,S3}.
1529% scan_decl_sep(T,S=#xmerl_scanner{rules_read_fun=Read,
1530% 				 rules_write_fun=Write,
1531% 				 rules_delete_fun=Delete}) ->
1532%     {PERefName, T1, S1} = scan_pe_reference(T, S),
1533%     {ExpandedRef,S2} =
1534% 	case expand_pe_reference(PERefName,S1,as_PE) of
1535% 	    Tuple when tuple(Tuple) ->
1536% 		%% {system,URI} or {public,URI}
1537% 		{ExpRef,Sx}=fetch_not_parse(Tuple,S1),
1538% 		{EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim,
1539% 						 PERefName,parameter),
1540% 		%% should do an update Write(parameter_entity) so next
1541% 		%% expand_pe_reference is faster
1542% 		Delete(parameter_entity,PERefName,_S2),
1543% 		_S3 = Write(parameter_entity,PERefName,EntV,_S2),
1544% 		EntV2 = Read(parameter_entity,PERefName,_S3),
1545% 		{" " ++ EntV2 ++ " ",_S3};
1546% 	    ExpRef ->
1547% 		{ExpRef,S1}
1548% 	end,
1549%     {_, T3, S3} = strip(ExpandedRef,S2),
1550%     {_T4,S4} = scan_ext_subset(T3,S3),
1551%     strip(T1,S4).
1552
%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect
%%
%% scan_conditional_sect/2: scans a conditional section after its "<!["
%% (and any following whitespace) has been consumed by the caller.
%% - "IGNORE" / "INCLUDE": optional whitespace, then a mandatory "[",
%%   then the section body is scanned by scan_ignore/2 resp.
%%   scan_include/2, whose result is returned.
%% - "%": a parameter entity reference whose expansion is put in front
%%   of the remaining input before trying again.
%% There is no clause for other input.
scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    %% ?strip1 binds T1 and S1 (macro defined outside this section).
    ?strip1,
    %% NOTE(review): scan_mandatory/5 is given S (the state before the
    %% strip) while scanning continues from S1, and the column is not
    %% advanced for the "[" - confirm whether that is intended.
    T2 = scan_mandatory("[",T1,1,S,expected_IGNORE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_ignore(T3,S3);
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    T2 = scan_mandatory("[",T1,1,S,expected_INCLUDE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_include(T3, S3);
scan_conditional_sect("%"++T,S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    %% The expansion is used with ++, so it has to be a list here.
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_conditional_sect(T2,S2).
1580
1581
%%%% [63] ignoreSect	 ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%%
%% scan_ignore/2,3: skips the contents of an IGNORE section and returns
%% {[], Rest, Scanner} with Rest following the closing "]]>".
%% Level counts the conditional sections opened inside the ignored
%% section: each "<![" increases it, each "]]>" decreases it, and the
%% "]]>" seen at level 0 ends the section. All other characters are
%% skipped, advancing the column by one each.
scan_ignore(Str,S) ->
    scan_ignore(Str,S,0).

scan_ignore([], S=#xmerl_scanner{continuation_fun = F},Level) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end,
      fatal_fun(unexpected_end),
      S);
scan_ignore("<![" ++ T, S0,Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(T, S,Level+1);
scan_ignore("]]>" ++ T, S0,0) ->
    ?bump_col(3),
    {[], T, S};
scan_ignore("]]>" ++ T, S0,Level) ->
    ?bump_col(3),
    scan_ignore(T, S,Level-1);
scan_ignore([_H|T],S0,Level) ->
    ?bump_col(1),
    scan_ignore(T,S,Level).
1606
1607
%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%%
%% scan_include/2: scans the contents of an INCLUDE section and returns
%% {[], Rest, Scanner} with Rest following the closing "]]>".
%% - "%": a parameter entity reference whose expansion is put in front
%%   of the remaining input.
%% - "<![": a nested conditional section.
%% - Everything else is scanned as a markup declaration.
scan_include([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_include("]]>" ++ T, S0) ->
    ?bump_col(3),
    {[], T, S};
scan_include("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    %% The expansion is used with ++, so it has to be a list here.
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_include(T2, S2);
scan_include("<![" ++ T, S0) ->
    ?bump_col(3),
    %% ?strip1 binds T1 and S1; ?strip3 binds T3 and S3 (macros defined
    %% outside this section).
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3,S3);
scan_include(T, S) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_include(T1, S1).
1632
1633
%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
%%%%%%%                     NotationDecl | PI |Comment
%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'

%% Validity constraint: Unique Type Declaration: No element type may be
%% declared more than once.
%%
%% scan_markup_decl/2: scans one markup declaration (comment, processing
%% instruction, ELEMENT, ENTITY, NOTATION or ATTLIST declaration) and
%% strips the whitespace that follows it. Element and attribute-list
%% declarations are stored as elem_def entries through the scanner's
%% rules read/write/delete funs. Input that starts none of these is a
%% fatal error expected_markup.
scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_markup_decl("<!--" ++ T, S0) ->
    ?bump_col(4),
    scan_comment(T, S);
scan_markup_decl("<?" ++ T, S0) ->
    ?bump_col(2),
    %% The PI itself is discarded; the atom markup is passed as position.
    {_PI, T1, S1} = scan_pi(T, S,_Pos=markup,[]),
    strip(T1, S1);
scan_markup_decl("<!ELEMENT" ++ T,
		 #xmerl_scanner{rules_read_fun = Read,
				rules_write_fun = Write,
				rules_delete_fun = Delete} = S0) ->
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    Element  =
	case Read(elem_def, Ename, S2) of
	    El = #xmlElement{elementdef=Decl} when Decl =/= undeclared ->
		%% The element has been declared before: an error when
		%% validating against the DTD, otherwise the stored
		%% definition is removed and reused as the base.
		case S2#xmerl_scanner.validation of
		    dtd ->
			?fatal({already_defined, Ename}, S2);
		    _ ->
			Delete(elem_def,Ename,S2),
			El
		end;
	    El = #xmlElement{} ->
		%% Known but not yet declared: the entry is removed and
		%% reused as the base for the definition written below.
		Delete(elem_def,Ename,S2),
		El;
	    undefined ->
		#xmlElement{}
	end,
    {_,T3,S3} = mandatory_strip(T2,S2),
    {Edef, T4, S4} = scan_contentspec(T3, S3),
    %% ?strip5 binds T5 and S5 (macro defined outside this section).
    ?strip5,
    %% The match asserts that the declaration is closed by ">".
    {">" ++ T6,S6} = scan_element_completion(T5,S5),
    %% The definition records the environment it was declared in; the
    %% column is advanced for the closing ">".
    S7 = Write(elem_def, Ename,
	       Element#xmlElement{name = Ename,
				  content = Edef,
				  elementdef=S6#xmerl_scanner.environment},
	       S6#xmerl_scanner{col=S6#xmerl_scanner.col+1}),
    strip(T6,S7);
scan_markup_decl("<!ENTITY" ++ T, S0) ->
    %% <!ENTITY [%] entity.name NDATA notation.name>
    %% <!ENTITY [%] entity.name "replacement text">
    %% <!ENTITY [%] entity.name SYSTEM "system.identifier">
    %% <!ENTITY [%] entity.name PUBLIC public.identifier "system.identifier">
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_entity(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!NOTATION" ++ T, S0) ->
    %% <!NOTATION notation.name "public.identifier" "helper.application">
    ?bump_col(10),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_notation_decl(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!ATTLIST" ++ T,
		 #xmerl_scanner{rules_read_fun = Read,
				rules_write_fun = Write,
				rules_delete_fun= Delete} = S0) ->
    %% <!ATTLIST Ename ( AttrName Type Value )*>
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
%    ?strip3,
    {Attributes, T4, S4} = scan_attdef(T2, S2),
    {EDEF,MergedAttrs} =
	case Read(elem_def, Ename, S4) of
	    undefined -> %% this may happen when the ELEMENT is declared in
		%% the external DTD but the ATTLIST in the
		%% internal DTD.
		{#xmlElement{},update_attributes(Attributes,[])};
	    Edef = #xmlElement{attributes = OldAttrs} ->
		Delete(elem_def,Ename,S4),
		%% the slot in rules table must be empty so that the
		%% later write has the assumed effect. Read maybe
		%% should empty the table slot.
		{Edef,update_attributes(Attributes, OldAttrs)}
	end,
    NewEdef = EDEF#xmlElement{name=Ename,attributes = MergedAttrs},
    S5 = Write(elem_def, Ename, NewEdef, S4),
    T5 = T4,
    strip(T5,S5);
scan_markup_decl(_Str,S) ->
    ?fatal(expected_markup,S).
1730
%% Scans the tail of an element declaration. All work is delegated to
%% scan_markup_completion_gt/2, whose result is returned unchanged.
scan_element_completion(Rest, State) ->
    scan_markup_completion_gt(Rest, State).
1733
%% Merges newly scanned attribute definitions into the definitions already
%% stored for an element. The old list is passed on reversed because
%% update_attributes1/2 treats its second argument as a reversed accumulator
%% and reverses it again before returning.
update_attributes(NewAttrs, OldAttrs) ->
    ReversedOld = lists:reverse(OldAttrs),
    update_attributes1(NewAttrs, ReversedOld).
1736
%% Adds each new attribute definition to the (reversed) accumulator unless
%% a definition with the same name is already present; the first definition
%% of a name wins. The accumulator is reversed before it is returned.
%% Every new definition must be a 5-tuple
%% {Name, Type, DefaultValue, DefaultDecl, Env}.
update_attributes1(NewAttrs, OldAttrs) ->
    AddIfMissing =
        fun({Name, _Type, _DefaultV, _DefaultD, _Env} = AttDef, Acc) ->
                case lists:keymember(Name, 1, Acc) of
                    true  -> Acc;
                    false -> [AttDef | Acc]
                end
        end,
    lists:reverse(lists:foldl(AddIfMissing, OldAttrs, NewAttrs)).
1747
1748
1749%%%%%%% [53] AttDef
1750
%% Entry point for scanning the attribute definitions of an ATTLIST
%% declaration. On empty input the continuation fun is asked for more
%% bytes; otherwise scanning starts with an empty attribute accumulator.
scan_attdef([], #xmerl_scanner{continuation_fun = ContFun} = State) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, NewState) -> scan_attdef(MoreBytes, NewState) end,
    ContFun(Resume, fatal_fun(unexpected_end), State);
scan_attdef(Input, State) ->
    scan_attdef(Input, State, []).
1758
1759
%% Scans the attribute definitions of an <!ATTLIST ...> declaration.
%% Attrs accumulates the definitions found so far, most recent first.
%% Returns {Definitions, Rest, State} with Definitions in declaration order
%% once the closing ">" has been consumed.
%% NOTE(review): ?bump_col/1 is defined outside this section; judging from
%% its use here it binds S from S0 -- confirm against the macro definition.
scan_attdef([], S=#xmerl_scanner{continuation_fun = F}, Attrs) ->
    %% Out of input: hand a resume fun and a fatal fun to the continuation fun.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1, Attrs) end,
      fatal_fun(unexpected_end),
      S);
scan_attdef(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
     %% A parameter entity reference is rejected when the environment is prolog.
     ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    %% The expansion is put in front of the remaining input and scanning
    %% restarts on the combined text.
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef(T2, S2, Attrs);
scan_attdef(T,S,Attrs) ->
    %% Everything else goes through mandatory_strip/2 before the actual
    %% definition is scanned by scan_attdef2/3.
    {_,T1,S1} = mandatory_strip(T,S),
    scan_attdef2(T1,S1,Attrs).
1779
%% Continues scanning attribute definitions after scan_attdef/3 has called
%% mandatory_strip/2. Each definition is stored as the tuple
%% {AttName, AttType, DefaultValue, DefaultDecl, Environment}.
%% Returns {Definitions, Rest, State} with Definitions in declaration order
%% once the closing ">" has been consumed.
%% NOTE(review): ?bump_col/1 and ?strip6 are defined outside this section;
%% presumably ?bump_col(N) binds S from S0 and ?strip6 binds T6/S6 from
%% T5/S5 -- confirm against the macro definitions.
scan_attdef2(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef2("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
     %% A parameter entity reference is rejected when the environment is prolog.
     ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef2("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    %% The expansion is put in front of the remaining input before going on.
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef2(T2, S2, Attrs);
scan_attdef2(T, S, Attrs) ->
    %% One definition: name, type and default declaration, with
    %% mandatory_strip/2 applied between the three parts.
    {AttName, _NamespaceInfo, T1, S1} = scan_name(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {AttType, T3, S3} = scan_att_type(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    {{DefaultDecl,DefaultValue}, T5, S5} = scan_default_decl(T4, S4, AttType),
    ?strip6,
    %% The recorded environment is read from S, the state this clause was
    %% entered with.
    Attr = {AttName, AttType,DefaultValue,DefaultDecl,
	    S#xmerl_scanner.environment},
    scan_attdef2(T6, S6, [Attr|Attrs]).
1801
1802
1803%%% [54] StringType
%% Scans the type part of an attribute definition.
%% Returns {Type, Rest, State}. Type is one of the atoms 'CDATA', 'ID',
%% 'IDREF', 'IDREFS', 'ENTITY', 'ENTITIES', 'NMTOKEN', 'NMTOKENS', the
%% result of scan_notation_type/3 ({notation, Names}), the result of
%% scan_enumeration/3 ({enumeration, Tokens}) or, for a parameter entity
%% reference outside the prolog, whatever expand_pe_reference/3 returns.
%% Keywords that are prefixes of other keywords ("ID" / "IDREF" / "IDREFS",
%% "NMTOKEN" / "NMTOKENS") are matched longest first, so the clause order
%% matters.
%% NOTE(review): ?bump_col/1 and the ?stripN macros are defined outside
%% this section; presumably ?bump_col(N) binds S from S0 and ?stripN binds
%% TN/SN from the previous T/S pair -- confirm.
scan_att_type([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_att_type(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_att_type("CDATA" ++ T, S0) ->
    ?bump_col(5),
    {'CDATA', T, S};
%%% [55] TokenizedType
scan_att_type("IDREFS" ++ T, S0) ->
    ?bump_col(6),
    {'IDREFS', T, S};
scan_att_type("IDREF" ++ T, S0) ->
    ?bump_col(5),
    {'IDREF', T, S};
scan_att_type("ID" ++ T, S0) ->
    ?bump_col(2),
    {'ID', T, S};
scan_att_type("ENTITY" ++ T, S0) ->
    ?bump_col(6),
    {'ENTITY', T, S};
scan_att_type("ENTITIES" ++ T, S0) ->
    ?bump_col(8),
    {'ENTITIES', T, S};
scan_att_type("NMTOKENS" ++ T, S0) ->
    ?bump_col(8),
    {'NMTOKENS', T, S};
scan_att_type("NMTOKEN" ++ T, S0) ->
    ?bump_col(7),
    {'NMTOKEN', T, S};
%%% [57] EnumeratedType
scan_att_type("NOTATION" ++ T, S0) ->
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    T2 = scan_mandatory("(",T1,1,S1,expected_parenthesis_after_NOTATION),
%    "(" ++ T2 = T1,
    %% scan_mandatory/5 only returns the remaining input, so the state is
    %% carried over unchanged under the name the following macro expects.
    S2 = S1,
    ?strip3,
    {Name, _NamespaceInfo, T4, S4} = scan_name(T3, S3),
    %% The result of notation_exists/2 is not used here.
    notation_exists(Name, S4),
    ?strip5,
    scan_notation_type(T5, S5, [Name]);
scan_att_type("(" ++ T, S0) ->
    ?bump_col(1),
    ?strip1,
    {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [NmToken]);
scan_att_type("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
    %% A parameter entity reference is rejected when the environment is prolog.
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_type("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,in_literal),
    %% The expansion itself is returned in the type position.
    {ExpRef,T1,S1}.
1859
1860%%% [58] NotationType
1861
%% Scans the rest of a "(Name | Name ...)" notation list once the first
%% name has been read. Names holds the names seen so far, most recent
%% first. Returns {{notation, Names}, Rest, State} with the names in
%% source order once ")" is reached.
scan_notation_type([], #xmerl_scanner{continuation_fun = ContFun} = S, Names) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_notation_type(MoreBytes, S1, Names) end,
    ContFun(Resume, fatal_fun(unexpected_end), S);
scan_notation_type(")" ++ T, S0, Names) ->
    ?bump_col(1),
    NotationType = {notation, lists:reverse(Names)},
    {NotationType, T, S};
scan_notation_type("|" ++ T, S0, Names) ->
    ?bump_col(1),
    ?strip1,
    {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    notation_exists(Name, S2),
    ?strip3,
    scan_notation_type(T3, S3, [Name | Names]).
1877
1878%%% Validity constraint for NotationType:
1879%%% The used notation names must be declared in the DTD, but they may
1880%%% be declared later.
%% Makes sure the rules store has an entry for notation Name. A notation
%% that has not been seen yet is recorded as `undeclared`, which is legal
%% because the declaration may follow later in the internal or external
%% subset. Returns the result of the write fun when an entry was added
%% and `ok` otherwise.
notation_exists(Name, #xmerl_scanner{rules_read_fun = ReadFun,
                                     rules_write_fun = WriteFun} = State) ->
    Stored = ReadFun(notation, Name, State),
    if
        Stored =:= undefined ->
            WriteFun(notation, Name, undeclared, State);
        true ->
            ok
    end.
1892
1893%%% [59] Enumeration
1894
%% Scans the rest of a "(NmToken | NmToken ...)" enumeration once the
%% first token has been read. Tokens holds the tokens seen so far, most
%% recent first. Returns {{enumeration, Tokens}, Rest, State} with the
%% tokens in source order once ")" is reached.
scan_enumeration([], #xmerl_scanner{continuation_fun = ContFun} = S, Tokens) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_enumeration(MoreBytes, S1, Tokens) end,
    ContFun(Resume, fatal_fun(unexpected_end), S);
scan_enumeration(")" ++ T, S0, Tokens) ->
    ?bump_col(1),
    Enumeration = {enumeration, lists:reverse(Tokens)},
    {Enumeration, T, S};
scan_enumeration("|" ++ T, S0, Tokens) ->
    ?bump_col(1),
    ?strip1,
    {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [NmToken | Tokens]).
1909
1910
1911%%%%%%% [60] DefaultDecl
1912
%% Scans the default declaration of an attribute definition.
%% Returns {{Decl, Value}, Rest, State} where Decl is '#REQUIRED',
%% '#IMPLIED', '#FIXED' or no_decl. Value is no_value for '#REQUIRED' and
%% '#IMPLIED', otherwise the literal scanned by default_value/3.
scan_default_decl([], #xmerl_scanner{continuation_fun = ContFun} = S, Type) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_default_decl(MoreBytes, S1, Type) end,
    ContFun(Resume, fatal_fun(unexpected_end), S);
scan_default_decl("#REQUIRED" ++ T, S0, _Type) ->
    ?bump_col(9),
    Required = {'#REQUIRED', no_value},
    {Required, T, S};
scan_default_decl("#IMPLIED" ++ T, S0, _Type) ->
    ?bump_col(8),
    Implied = {'#IMPLIED', no_value},
    {Implied, T, S};
scan_default_decl("#FIXED" ++ T, S0, Type) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {FixedValue, T2, S2, _IsNorm} = default_value(T1, S1, Type),
    {{'#FIXED', FixedValue}, T2, S2};
scan_default_decl(Str, S, Type) ->
    %% No keyword: the input is taken to be a plain default value literal.
    {PlainValue, T1, S1, _IsNorm} = default_value(Str, S, Type),
    {{no_decl, PlainValue}, T1, S1}.
1932
1933
1934%% There is room here to validate against Type, but we don't do it at
1935%% the moment.
%% Scans a default value literal with scan_att_value/3 and returns its
%% 4-tuple result unchanged; any other result shape is a badmatch.
default_value(Input, State, Type) ->
    Scanned = scan_att_value(Input, State, Type),
    {_, _, _, _} = Scanned,
    Scanned.
1938
1939
%%%%%%% [70] EntityDecl ([71] GEDecl, [72] PEDecl)
1941
%% Scans an entity declaration after the caller has consumed the
%% "<!ENTITY" keyword and called mandatory_strip/2.
%% A leading "%" selects the parameter entity form; anything else is
%% scanned as a general entity. The definition is stored through the rules
%% write fun and {Rest, State} is returned, where Rest starts after the
%% closing ">" of the declaration.
%% NOTE(review): ?bump_col/1, ?strip4 and ?strip5 are defined outside this
%% section; presumably ?bump_col(N) binds S from S0 and ?stripN binds
%% TN/SN from the previous T/S pair -- confirm.
scan_entity([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_entity(MoreBytes, S1) end,
      fatal_fun(unexpected_end),
      S);
scan_entity("%" ++ T, #xmerl_scanner{rules_write_fun = Write} = S0) ->
    %% parameter entity
    ?bump_col(1),
    {_,T1,S1} = mandatory_strip(T,S),
    {PEName, _NamespaceInfo, T2, S2} = scan_name_no_colons(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {PEDef, T4, S4} = scan_pe_def(T3, S3, PEName),
    ?strip5,
    %% The match on ">" asserts that the declaration is properly closed.
    {">" ++ T6,S6} = scan_entity_completion(T5,S5),
    S7 = Write(parameter_entity, PEName, PEDef, S6),
    {T6, S7};
scan_entity(T, #xmerl_scanner{rules_write_fun = Write,
			      rules_read_fun = Read,
			      rules_delete_fun = Delete} = S) ->
    %% generic entity
    {EName, _NamespaceInfo, T1, S1} = scan_name_no_colons(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {EDef, EntType, T3, S3} = scan_entity_def(T2, S2, EName),
    check_entity_recursion(EName,S3),
    ?strip4,
    {">" ++ T5,S5} = scan_entity_completion(T4,S4),
    %% An entry stored as `undeclared` is removed before the real
    %% definition is written (presumably so that the write below is not
    %% ignored -- confirm against the rules write fun).
    case Read(entity,EName,S5) of
	undeclared -> Delete(entity,EName,S5);
	_ -> ok
    end,
    %% General entities are stored as {Environment, EntType, Definition}.
    S6 = Write(entity, EName, {S5#xmerl_scanner.environment,EntType,EDef}, S5),
    {T5, S6}.
1974
%% Scans the tail of an entity declaration. All work is delegated to
%% scan_markup_completion_gt/2, whose result is returned unchanged.
scan_entity_completion(Rest, State) ->
    scan_markup_completion_gt(Rest, State).
1977
1978%%%%%%% [73] EntityDef
1979
%% Scans the definition part of a general entity declaration.
%% Returns {Definition, internal | external, Rest, State}:
%%   - a quoted literal is scanned by scan_entity_value/5 and tagged
%%     `internal`;
%%   - anything else is scanned as an external id, optionally followed by
%%     an NDATA declaration, and tagged `external`.
%% NOTE(review): ?bump_col/1 is defined outside this section; presumably
%% it binds S from S0 -- confirm.
scan_entity_def([], S=#xmerl_scanner{continuation_fun = F}, EName) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_entity_def(MoreBytes, S1, EName) end,
      fatal_fun(unexpected_end),
      S);
scan_entity_def("'" ++ T, S0, EName) ->
    ?bump_col(1),
    {EVal,T1,S1}=scan_entity_value(T, S, $', EName,general),
    {EVal,internal,T1,S1};
scan_entity_def("\"" ++ T, S0, EName) ->
    ?bump_col(1),
    {EVal,T1,S1}=scan_entity_value(T, S, $", EName,general),
    {EVal,internal,T1,S1};
%% external general entity, parsed or unparsed.
scan_entity_def(Str, S, EName) ->
    {ExtID, T1, S1} = scan_external_id(Str, S),
    {NData, T2, S2} = scan_ndata_decl(T1, S1),
    case NData of
	{ndata,_} ->
	    %% if NDATA exists it is an unparsed ENTITY
	    {{ExtID,NData},external,T2,S2};
	_ ->
	    %% Parsed external entity: it is fetched and parsed right away in
	    %% an environment tagged with the entity name. The order of the
	    %% result patterns below is significant, since the first one is a
	    %% special case of the second.
	    case fetch_and_parse(ExtID,S2,
				 [{text_decl,true},
				  {environment,{external,{entity,EName}}}]) of
		{{_USret,Entity},_Tail,_Sx} ->
		    {Entity, external,T2, S2};
		{Entity,_Tail,Sx} ->
			%% Only the entity references collected while parsing the
			%% external entity are carried over into the returned state.
			OldRef=S2#xmerl_scanner.entity_references,
			NewRef=Sx#xmerl_scanner.entity_references,
		    {Entity,external,T2,
		     S2#xmerl_scanner{entity_references=OldRef++NewRef}};
		{error,enoent} -> % this bad entity is declared,
                                       % but it may not be referenced,
                                       % then it would not be an
                                       % error.
		    {{error,enoent},external,T2,S2}
	    end
    end.
2019
2020
%% Scans an optional NDATA declaration following an external id.
%% If the input starts with ">" there is none and {[], Input, State} is
%% returned with nothing consumed. Otherwise mandatory_strip/2 is applied
%% and scan_ndata_decl2/2 takes over.
scan_ndata_decl([], #xmerl_scanner{continuation_fun = ContFun} = State) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, NewState) -> scan_ndata_decl(MoreBytes, NewState) end,
    ContFun(Resume, fatal_fun(unexpected_end), State);
scan_ndata_decl([$> | _] = Input, State) ->
    {[], Input, State};
scan_ndata_decl(Input, State) ->
    {_, Stripped, Stripped_State} = mandatory_strip(Input, State),
    scan_ndata_decl2(Stripped, Stripped_State).
%% Second stage of scan_ndata_decl/2, entered after mandatory_strip/2.
%% Returns {[], Input, State} when the declaration ends here (">"), or
%% {{ndata, Name}, Rest, State} for "NDATA Name".
%%
%% A notation that is not known yet is recorded as `undeclared`; this is
%% legal because the NOTATION may be declared later in the internal or
%% external subset. The state returned by the rules write fun is threaded
%% into the result, in the same way as the other callers of the write fun
%% in this module do, so that a write fun which keeps its data in the
%% scanner state does not lose the entry.
%% NOTE(review): ?bump_col/1 is defined outside this section; presumably
%% it binds S from S0 -- confirm.
scan_ndata_decl2(Str = ">"++_T,S) ->
    {[], Str, S};
scan_ndata_decl2("NDATA" ++ T,S0 = #xmerl_scanner{rules_read_fun = Read,
						rules_write_fun = Write}) ->
    ?bump_col(5),
    {_,T1,S1} = mandatory_strip(T,S),
    {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    S3 = case Read(notation, Name, S2) of
	     undefined ->
		 Write(notation,Name,undeclared,S2);
	     _Value ->
		 S2
	 end,
    {{ndata,Name},T2,S3}.
2047
2048%%%%%%% [39] element
2049
%% Entry point for scanning an element at position Pos. Starts with the
%% scanner's own `space` setting as default whitespace handling, no
%% language, no parents and an empty namespace record.
scan_element(T, S, Pos) ->
    SpaceDefault = S#xmerl_scanner.space,
    NoLang = [],
    NoParents = [],
    scan_element(T, S, Pos, SpaceDefault, NoLang, NoParents, #xmlNamespace{}).
2053
%% Scans the element name of a start tag, validates it with
%% vc_Element_valid/3 and continues with the attribute/tag-end scanner.
%% The line and column of the state on entry are remembered as the start
%% position of the element.
scan_element(T, #xmerl_scanner{line = StartLine, col = StartCol} = S,
             Pos, SpaceDefault, Lang, Parents, NS) ->
    {Name, NamespaceInfo, T1, S1} = scan_name(T, S),
    vc_Element_valid(Name, NamespaceInfo, S),
    ?strip2,
    NoAttrs = [],
    scan_element(T2, S2, Pos, Name, StartLine, StartCol, NoAttrs,
                 Lang, Parents, NamespaceInfo, NS, SpaceDefault).
2062
2063
%% Scans the remainder of a start tag (attributes and tag end) and, for a
%% non-empty element, its content.
%%   Pos            - position passed in by the caller, stored in the element
%%   Name           - element name
%%   StartL, StartC - line/column of the state when the element name was read
%%   Attrs          - attributes scanned so far, most recent first
%%   Lang, Parents  - inherited language and parent list
%%   NSI            - namespace info of the element name
%%   NS             - namespace record, updated by xmlns attributes
%%   SpaceDefault   - whitespace handling used unless xml:space overrides it
%% Returns {Ret, Rest, State} where Ret is the first element of the tuple
%% returned by the hook fun.
%% NOTE(review): ?bump_col/1 and ?strip4 are defined outside this section;
%% presumably ?bump_col(N) binds S from S0 and ?strip4 binds T4/S4 from
%% T3/S3 -- confirm against the macro definitions.
scan_element("/", S=#xmerl_scanner{continuation_fun = F},
	     Pos, Name, StartL, StartC, Attrs, Lang, Parents,
	     NSI, NS, SpaceDefault) ->
    %% A lone "/" at the end of the buffer: more bytes are needed before it
    %% can be matched as "/>", so the "/" is put back in front of them.
    ?dbg("trailing / detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element("/" ++ MoreBytes, S1,
					 Pos, Name, StartL, StartC, Attrs,
					 Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element([], S=#xmerl_scanner{continuation_fun = F},
	     Pos, Name, StartL, StartC, Attrs, Lang, Parents,
	     NSI, NS, SpaceDefault) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_element(MoreBytes, S1,
					 Pos, Name, StartL, StartC, Attrs,
					 Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
					    event_fun = Event,
					    line = L, col = C,
					    xmlbase_cache=XMLBase}, Pos,
	     Name, _StartL, _StartC, Attrs0, Lang, Parents, NSI,
	     Namespace, _SpaceDefault) ->
    %% Empty-element tag: build the element, emit an `ended` event and run
    %% the hook fun.
    ?bump_col(2),
    Attrs = lists:reverse(Attrs0),
    E=processed_whole_element(S, Pos, Name, Attrs, Lang, Parents,NSI,Namespace),

    #xmlElement{attributes = Attrs1} = E,
    wfc_unique_att_spec(Attrs1,S),
    %% NOTE(review): the event fun is called with S0, not with the S bound
    %% by ?bump_col(2) above -- confirm whether that is intended.
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = L,
					       col = C,
					       data = E}, S0),
    {Ret, S2} = Hook(E, S1),
    %% The xmlbase is reset to the value kept in xmlbase_cache on entry.
    S2b=S2#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T, S2b};
scan_element(">", S=#xmerl_scanner{continuation_fun = F},
	     Pos, Name, StartL, StartC, Attrs, Lang, Parents,
	     NSI, NS, SpaceDefault) ->
    %% A lone ">" at the end of the buffer: more bytes are requested and the
    %% ">" is put back in front of them.
    ?dbg("trailing > detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element(">" ++ MoreBytes, S1,
					 Pos, Name, StartL, StartC, Attrs,
					 Lang,Parents,NSI,NS,SpaceDefault) end,
      fatal_fun(unexpected_end),
      S);
scan_element(">" ++ T, S0 = #xmerl_scanner{event_fun = Event,
					   hook_fun = Hook,
					   line = L, col = C,
					   xmlbase_cache=XMLBase,
					   space = SpaceOption},
	     Pos, Name, StartL, StartC, Attrs0, Lang, Parents,
	     NSI, Namespace, SpaceDefault) ->
    %% Start tag completed: build the element, emit `started`, scan the
    %% content, emit `ended` and run the hook fun.
    ?bump_col(1),
    Attrs = lists:reverse(Attrs0),
    E0=processed_whole_element(S,Pos,Name,Attrs,Lang,Parents,NSI,Namespace),

    #xmlElement{attributes = Attrs1} = E0,
    wfc_unique_att_spec(Attrs1,S),
    %% An xml:space attribute overrides the inherited whitespace handling:
    %% "default" selects the scanner's own `space` setting, "preserve"
    %% selects preserve, any other value leaves the inherited one in place.
    XMLSpace = case lists:keysearch('xml:space', #xmlAttribute.name, Attrs1) of
		   false ->			SpaceDefault;
		   {value, #xmlAttribute{value="default"}} ->	SpaceOption;
		   {value, #xmlAttribute{value="preserve"}} ->	preserve;
		   _ ->				SpaceDefault
	       end,

    %% E0 is already bound, so this is a match against the element built
    %% above: re-processing with Attrs1 must yield the same record.
    E0=processed_whole_element(S,Pos,Name,Attrs1,Lang,Parents,NSI,Namespace),
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
					       line = StartL,
					       col = StartC,
					       data = E0}, S),

    {Content, T1, S2} = scan_content(T, S1, Name, Attrs1, XMLSpace,
				     E0#xmlElement.language,
				     [{Name, Pos}|Parents], Namespace),

    Element=E0#xmlElement{content=Content,
			  xmlbase=E0#xmlElement.xmlbase},
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
					       line = L,
					       col = C,
					       data = Element}, S2),
    {Ret, S4} = Hook(Element, S3),
    %% The xmlbase is reset to the value kept in xmlbase_cache on entry.
    S4b=S4#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T1, S4b};
scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
	     NSI, NS, SpaceDefault) ->
    %% Anything else is an attribute: Name = Value.
    {AttName, NamespaceInfo, T1, S1} = scan_name(T, S),
    {T2, S2} = scan_eq(T1, S1),
    {AttType,_DefaultDecl} = get_att_type(S2,AttName,Name),
    {AttValue, T3a, S3a,IsNorm} = scan_att_value(T2, S2, AttType),
%%    check_default_value(S3,DefaultDecl,AttValue),
    NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS),
    {T3,S3} = wfc_whitespace_betw_attrs(T3a,S3a),
    ?strip4,
    %% Attribute positions count up from 1; Attrs has the most recent
    %% attribute first.
    AttrPos = case Attrs of
		  [] ->
		      1;
		  [#xmlAttribute{pos = P}|_] ->
		      P+1
	      end,
    Attr = #xmlAttribute{name = AttName,
			 parents = [{Name, Pos}|Parents],
			 pos = AttrPos,
			 language = Lang,
			 nsinfo = NamespaceInfo,
			 value = AttValue,
			 normalized = IsNorm},
    %% An xml:base attribute changes the base URI carried in the state.
    XMLBase=if
		AttName=='xml:base' ->
		    resolve_relative_uri(AttValue,S4#xmerl_scanner.xmlbase);
		true ->
		    S4#xmerl_scanner.xmlbase
	    end,

    #xmerl_scanner{event_fun = Event,
		   line = Line,
		   col = Col} = S4,
    %% The xmlbase of the state this clause was entered with is kept in
    %% xmlbase_cache; the tag-end clauses above write it back to xmlbase.
    S5 = Event(#xmerl_event{event = ended,
			    line = Line,
			    col = Col,
			    data = Attr},
	       S4#xmerl_scanner{xmlbase=XMLBase,
				xmlbase_cache=S#xmerl_scanner.xmlbase}),
    scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr|Attrs],
		 Lang, Parents, NSI, NewNS, SpaceDefault).
2190
%% Returns [{AttName, DefaultValue}] for every attribute of element
%% ElemName whose stored definition carries a default value other than
%% no_value. Returns [] when no element definition is stored.
get_default_attrs(#xmerl_scanner{rules_read_fun = ReadFun} = State, ElemName) ->
    WithDefault =
        fun({AttName, _Type, Default, _Decl, _Env}) when Default =/= no_value ->
                {true, {AttName, Default}};
           (_Other) ->
                false
        end,
    case ReadFun(elem_def, ElemName, State) of
        #xmlElement{attributes = AttDefs} ->
            lists:filtermap(WithDefault, AttDefs);
        _NoDefinition ->
            []
    end.
2198
%% Looks up the declared type and default declaration of attribute AttName
%% of element ElemName. Returns {AttType, DefaultDecl}; an attribute or
%% element without a stored definition is treated as {'CDATA', no_value}.
get_att_type(#xmerl_scanner{rules_read_fun = ReadFun} = State, AttName, ElemName) ->
    Undeclared = {'CDATA', no_value},
    case ReadFun(elem_def, ElemName, State) of
        #xmlElement{attributes = AttDefs} ->
            case lists:keyfind(AttName, 1, AttDefs) of
                {_Name, AttType, _Default, DefaultDecl, _Env} ->
                    {AttType, DefaultDecl};
                _NotDeclared ->
                    Undeclared
            end;
        _NoElementDef ->
            Undeclared
    end.
2209
%% Resolves the value of an xml:base attribute against the current base.
%%   NewBase     - the attribute value
%%   CurrentBase - the base in effect so far
%% A NewBase starting with "/" replaces the path of CurrentBase: when
%% xmerl_uri:parse/1 yields {Scheme, Host, Port, Path, Query} the result is
%% "Scheme://Host:Port" ++ NewBase. The "://" separator was previously left
%% out, which produced strings such as "httpwww.example.com:80/dir".
%% When CurrentBase cannot be split that way (parse error, or a result of
%% another shape, which used to raise case_clause) NewBase is returned
%% as is, the same way a parse error is handled.
%% Any other NewBase is joined onto CurrentBase with filename:join/2.
resolve_relative_uri(NewBase="/"++_,CurrentBase) ->
    case xmerl_uri:parse(CurrentBase) of
	{Scheme,Host,Port,_Path,_Query} ->
	    atom_to_list(Scheme)++"://"++Host++":"++integer_to_list(Port)++NewBase;
	{error,_Reason} ->
	    NewBase;
	_OtherShape ->
	    NewBase
    end;
resolve_relative_uri(NewBase,CurrentBase) ->
     filename:join(CurrentBase,NewBase).
2219
2220
%% Builds the #xmlElement{} record for an element whose start tag has been
%% scanned completely; the content field is not filled in here.
%%   S         - scanner state; its xmlbase, default_attrs and
%%               namespace_conformant fields are read
%%   Pos, Name - position and name stored in the record
%%   Attrs     - the attributes of the tag (the callers in this section pass
%%               them in document order)
%%   Lang      - inherited language, used unless Attrs has an xml:lang
%%   Parents   - parent list stored in the record
%%   NSI       - namespace info of the element name
%%   Namespace - namespace record in effect for the element
%% Returns the #xmlElement{} record.
processed_whole_element(S=#xmerl_scanner{hook_fun = _Hook,
					 xmlbase = XMLBase,
					 line = _L, col = _C,
					 event_fun = _Event},
			Pos, Name, Attrs, Lang, Parents, NSI, Namespace) ->
    Language = check_language(Attrs, Lang),

    %% With default_attrs enabled, defaults from the stored element
    %% definition are appended for attributes that are not already present.
    AllAttrs =
	case S#xmerl_scanner.default_attrs of
	    true ->
            DefaultAttrs =
                [ #xmlAttribute{name = AttName,
                                parents = [{Name, Pos} | Parents],
                                language = Lang,
                                nsinfo = NSI,
                                namespace = Namespace,
                                value = AttValue,
                                normalized = true} ||
                  {AttName, AttValue} <- get_default_attrs(S, Name),
                  AttValue =/= no_value,
                  not lists:keymember(AttName, #xmlAttribute.name, Attrs) ],
            lists:append(Attrs, DefaultAttrs);
	    false ->
		Attrs
	end,

    {ExpName, ExpAttrs} =
	case S#xmerl_scanner.namespace_conformant of
	    true ->
		%% expand attribute names. We need to do this after having
		%% scanned all attributes of the element, since (as far as
		%% I can tell), XML Names only specifies that namespace attrs
		%% are valid within the whole scope of the element in which
		%% they are declared, which should also mean that even if they
		%% are declared after some other attributes, the namespace
		%% should apply to those attributes as well.
		%% Note that the default URI does not apply to attribute names.
		TempNamespace = Namespace#xmlNamespace{default = []},
		ExpAttrsX =
		    [A#xmlAttribute{
		       namespace=Namespace,
		       expanded_name=expanded_name(
				       A#xmlAttribute.name,
				       A#xmlAttribute.nsinfo,
						% NSI,
				       TempNamespace, S)} || A <- AllAttrs],
		{expanded_name(Name, NSI, Namespace, S), ExpAttrsX};
	    false ->
		%% Without namespace conformance the plain name and the
		%% attributes are used unchanged.
		{Name, AllAttrs}
	end,

    #xmlElement{name = Name,
		xmlbase = XMLBase,
		pos = Pos,
		parents = Parents,
		attributes = ExpAttrs,
		language = Language,
		expanded_name = ExpName,
		nsinfo = NSI,
		namespace = Namespace}.
2281
2282
%% Returns the value of the first xml:lang attribute in the attribute
%% list, or the inherited language Lang when there is none.
check_language(Attrs, Lang) ->
    Declared = [Value || #xmlAttribute{name = 'xml:lang', value = Value} <- Attrs],
    case Declared of
        [First | _] -> First;
        []          -> Lang
    end.
2289
2290
%% Updates the namespace record for a namespace-declaring attribute:
%%   - the attribute `xmlns` sets the default namespace;
%%   - an attribute whose namespace info is {"xmlns", Prefix} binds Prefix,
%%     replacing an earlier binding of the same prefix;
%%   - any other attribute leaves the record unchanged.
%% The URI is stored as an atom.
check_namespace(xmlns, _NSInfo, URI, NS) ->
    DefaultURI = list_to_atom(URI),
    NS#xmlNamespace{default = DefaultURI};
check_namespace(_AttName, {"xmlns", Prefix}, URI,
                #xmlNamespace{nodes = Nodes} = NS) ->
    Binding = {Prefix, list_to_atom(URI)},
    NS#xmlNamespace{nodes = keyreplaceadd(Prefix, 1, Nodes, Binding)};
check_namespace(_AttName, _NSInfo, _Value, NS) ->
    NS.
2299
2300
%% Computes the expanded name of an element or attribute.
%%   Name   - the name as an atom
%%   NSInfo - [] for an unprefixed name, otherwise {Prefix, Local}
%%   NS     - the namespace record in effect
%%   S      - scanner state, only handed to ?fatal
%% Unprefixed names: the name itself when there is no default namespace,
%% otherwise {DefaultURI, Name}. Prefixed names: {URI, LocalAtom}, except
%% that namespace declarations ({"xmlns", Local}) are returned as their
%% NSInfo tuple. Illegal bindings of the xml/xmlns namespaces and
%% undeclared prefixes are reported through ?fatal.
expanded_name(Name, [], #xmlNamespace{default = []}, _S) ->
    Name;
expanded_name(_Name, [],
              #xmlNamespace{default = 'http://www.w3.org/XML/1998/namespace'},
              S) ->
    ?fatal(cannot_bind_default_namespace_to_xml_namespace_name, S);
expanded_name(_Name, [],
              #xmlNamespace{default = 'http://www.w3.org/2000/xmlns/'},
              S) ->
    ?fatal(cannot_bind_default_namespace_to_xmlns_namespace_name, S);
expanded_name(Name, [], #xmlNamespace{default = DefaultURI}, _S) ->
    {DefaultURI, Name};
expanded_name(Name, {"xmlns", Local} = NSInfo, #xmlNamespace{nodes = Nodes}, S) ->
    %% The declared prefix must already be present among the nodes.
    {_, BoundURI} = lists:keyfind(Local, 1, Nodes),
    case {Name, BoundURI} of
        {'xmlns:xml', 'http://www.w3.org/XML/1998/namespace'} ->
            NSInfo;
        {'xmlns:xml', _OtherURI} ->
            ?fatal({xml_prefix_cannot_be_redeclared, BoundURI}, S);
        {'xmlns:xmlns', _AnyURI} ->
            ?fatal({xmlns_prefix_cannot_be_declared, BoundURI}, S);
        {_OtherName, 'http://www.w3.org/XML/1998/namespace'} ->
            ?fatal({cannot_bind_prefix_to_xml_namespace, Local}, S);
        {_OtherName, 'http://www.w3.org/2000/xmlns/'} ->
            ?fatal({cannot_bind_prefix_to_xmlns_namespace, Local}, S);
        _Legal ->
            NSInfo
    end;
expanded_name(_Name, {"xml", Local}, _NS, _S) ->
    {'http://www.w3.org/XML/1998/namespace', list_to_atom(Local)};
expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Nodes}, S) ->
    case lists:keyfind(Prefix, 1, Nodes) of
        {_, URI} ->
            {URI, list_to_atom(Local)};
        false ->
            %% A namespace constraint of XML Names is that the prefix
            %% must be declared
            ?fatal({namespace_prefix_not_declared, Prefix}, S)
    end.
2342
%% Replaces the first tuple in List whose element number Pos compares
%% equal to K with Obj, or appends Obj at the end when there is no such
%% tuple. This is exactly what lists:keystore/4 does, so the hand-written
%% recursion is replaced by the standard library call. Obj must be a tuple
%% and Pos a positive integer, as required by lists:keystore/4.
keyreplaceadd(K, Pos, List, Obj) ->
    lists:keystore(K, Pos, List, Obj).
2349
2350%%%%%%% [10] AttValue
2351%% normalize the attribute value according to XML 1.0 section 3.3.3
2352
%% Scans an attribute value literal of declared type AttType.
%% The literal must start with a single or double quote; the characters
%% up to the matching quote are collected by scan_att_chars/7, whose
%% result {Value, Rest, State, IsNormalized} is returned.
%% A leading parameter entity reference is expanded first (fatal when the
%% environment is prolog) and scanning restarts on the expanded text.
%% NOTE(review): ?bump_col/1 is defined outside this section; presumably
%% it binds S from S0 -- confirm.
scan_att_value([], S=#xmerl_scanner{continuation_fun = F},AT) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_att_value(MoreBytes, S1, AT) end,
      fatal_fun(unexpected_end),
      S);
scan_att_value("%"++_T,S=#xmerl_scanner{environment=prolog},_AttType) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_value("%"++T,S0=#xmerl_scanner{rules_read_fun=Read,
					rules_write_fun=Write,
					rules_delete_fun=Delete},AttType) ->
    ?bump_col(1),
    {Name,T1,S1} = scan_pe_reference(T,S),
    {ExpandedRef,S2} =
	case expand_pe_reference(Name,S1,in_literal) of
	    Tuple when is_tuple(Tuple) ->
		%% {system,URI} or {public,URI}
		%% Included in literal, just get external file.
		{ExpRef,Sx}=fetch_not_parse(Tuple,S1),
		{EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim,
						Name,parameter),
		%% should do an update Write(parameter_entity) so next
		%% expand_pe_reference is faster
		Delete(parameter_entity,Name,_S2),
		_S3 = Write(parameter_entity,Name,EntV,_S2),
		EntV2 = Read(parameter_entity,Name,_S3),
		{EntV2,_S3};
	    ExpRef ->
		{ExpRef,S1}
	end,
    %% The expansion is put in front of the remaining input.
    {_,T2,S3} = strip(ExpandedRef ++ T1,S2),
    scan_att_value(T2,S3,AttType);
scan_att_value([H|T], S0,'CDATA'=AT) when H == $"; H == $' ->
    %% CDATA values are passed to scan_att_chars/7 without the leading
    %% normalize/3 call that the other types get.
    ?bump_col(1),
    scan_att_chars(T, S, H, [],[], AT,false);
scan_att_value([H|T], S0,AttType) when H == $"; H == $' ->
    ?bump_col(1),
    {T1,S1,IsNorm} = normalize(T,S,false),
    scan_att_chars(T1, S1, H, [],[], AttType,IsNorm).
2391
%% Scans the characters of an attribute value up to the closing quote.
%%   H / Delim - the quote character that opened the literal
%%   Acc       - value collected so far, in reverse order
%%   TmpAcc    - characters collected since the last whitespace, in reverse
%%               order; passed to check_att_default_val/4 and reset to []
%%               at each whitespace character
%%   AttType   - declared attribute type
%%   IsNorm    - flag threaded through normalize/3 and returned
%% Returns {Value, Rest, State, IsNorm}.
%% NOTE(review): ?bump_col/1 and ?whitespace/1 are defined outside this
%% section; presumably ?bump_col(N) binds S from S0 -- confirm.
scan_att_chars([],S=#xmerl_scanner{continuation_fun=F},H,Acc,TmpAcc,AT,IsNorm)->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
	      scan_att_chars(MoreBytes, S1, H, Acc,TmpAcc,AT,IsNorm)
      end,
      fatal_fun(unexpected_end),
      S);
scan_att_chars([H|T], S0, H, Acc, TmpAcc,AttType,IsNorm) -> % End quote
    ?bump_col(1),
    check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AttType,S),
    %% Values of a type other than CDATA go through normalize/3 once more.
    {Acc2,S2,IsNorm2} =
	if
	    AttType == 'CDATA' -> {Acc,S,IsNorm};
	    true ->
		normalize(Acc,S,IsNorm)
	end,
    {lists:flatten(lists:reverse(Acc2)), T, S2,IsNorm2};
scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
	true ->
	    %% A reference that expands to a single markup delimiter is added
	    %% to the value as it is instead of being rescanned.
	    scan_att_chars(T1,S1,Delim,[ExpRef|Acc],[ExpRef|TmpAcc],AT,IsNorm);
	_ ->
            case T of
                "#" ++ _ ->
                    %% normalization rules (sec 3.3.3) require that for
                    %% character references, the referenced character be
                    %% added directly to the normalized value
                    {T2,S2,IsNorm2} =
                        if
                            ?whitespace(hd(ExpRef)) ->
                                normalize(T1, S1, IsNorm);
                            true ->
                                {T1, S1, IsNorm}
                        end,
                    scan_att_chars(T2, S2, Delim, ExpRef ++ Acc, TmpAcc, AT, IsNorm2);
                _ ->
                    %% Any other reference: the expansion is put back in
                    %% front of the input and scanned as part of the value.
                    Ch = string_to_char_set(S#xmerl_scanner.encoding, ExpRef),
                    scan_att_chars(Ch ++ T1, S1, Delim, Acc, TmpAcc, AT, IsNorm)
            end
    end;
scan_att_chars("<" ++ _T, S0, _Delim, _Acc,_, _,_) -> % Tags not allowed here
    ?fatal(unexpected_char, S0);
scan_att_chars([H|T], S0, Delim, Acc, _TmpAcc,'CDATA',IsNorm)
  when ?whitespace(H) ->
    %% In CDATA values every whitespace character becomes one space.
    ?bump_col(1),
    scan_att_chars(T, S, Delim, [$\s|Acc], [],'CDATA',IsNorm);
scan_att_chars([H|T], S0, Delim, Acc, TmpAcc,AT,IsNorm)
  when ?whitespace(H) ->
    %% For the other types the following input goes through normalize/3
    %% and the token collected so far is checked before a single space is
    %% added.
    ?bump_col(1),
    {T1,S1,IsNorm2} = normalize(T,S,IsNorm),
    check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AT,S1),
    scan_att_chars(T1, S1, Delim, [$\s|Acc],[], AT,IsNorm2);
scan_att_chars(Str, S0, Delim, Acc, TmpAcc,AT,IsNorm) ->
    %% Ordinary character: decoded with to_ucs/2 according to the
    %% scanner's encoding, validated and added to both accumulators.
    ?bump_col(1),
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    valid_Char(S#xmerl_scanner.validation,AT,Ch,S),
    scan_att_chars(T, S, Delim, [Ch|Acc], [Ch|TmpAcc],AT,IsNorm).
2451
%% Returns true when the argument is a one-character string holding one
%% of the markup delimiters & " ' < > %, and false for anything else.
markup_delimeter([Char]) ->
    lists:member(Char, "&\"'<>%");
markup_delimeter(_Other) ->
    false.
2459
%% Checks a token of an attribute value when validation is `dtd`.
%% RevName is the token in reverse order; an empty token and every other
%% validation mode are accepted without a check.
check_att_default_val(dtd, RevName, Ent, S) when RevName =/= [] ->
    Name = lists:reverse(RevName),
    check_att_default_val(Name, Ent, S);
check_att_default_val(_Validation, _RevName, _Ent, _S) ->
    ok.
2466
%% Type-specific check of one attribute value token.
%%   - ENTITY/ENTITIES and IDREF/IDREFS: the first character must be a
%%     letter; the name is then written to the rules store as `undeclared`
%%     under the context `entity` or `id` respectively.
%%   - ID: the token must satisfy xmerl_lib:is_name/1 and must not have
%%     been used as an ID before; it is then written under the context `id`.
%%   - any other type: accepted.
%% Returns the result of the write fun, or ok.
check_att_default_val(Name, Type, #xmerl_scanner{rules_write_fun = WriteFun} = S)
  when Type == 'ENTITY'; Type == 'ENTITIES';
       Type == 'IDREF'; Type == 'IDREFS' ->
    case xmerl_lib:is_letter(hd(Name)) of
        true -> ok;
        _ -> ?fatal({illegal_first_character,Type,Name},S)
    end,
    Context = case Type of
                  'IDREF'  -> id;
                  'IDREFS' -> id;
                  _Entity  -> entity
              end,
    WriteFun(Context, list_to_atom(Name), undeclared, S);
check_att_default_val(Name, 'ID', #xmerl_scanner{rules_write_fun = WriteFun,
                                                 rules_read_fun = ReadFun,
                                                 rules_delete_fun = DeleteFun} = S) ->
    case xmerl_lib:is_name(Name) of
        false ->
            ?fatal({'ID_names_must_be_Name_production',Name},S);
        _ ->
            ok
    end,
    IdName = case is_list(Name) of
                 true  -> list_to_atom(Name);
                 false -> Name
             end,
    case ReadFun(id, IdName, S) of
        undeclared ->
            %% was referenced in IDREF/IDREFS before defined
            DeleteFun(id, IdName, S);
        IdName ->
            ?fatal({values_must_be_unique,'ID',IdName},S);
        undefined ->
            ok
    end,
    WriteFun(id, IdName, IdName, S);
check_att_default_val(_Name, _Type, _S) ->
    ok.
2505
%% Validates one scanned character of an attribute value.
%% With `dtd` validation, NMTOKEN/NMTOKENS characters are checked by
%% vc_Valid_Char/3. In every other case the character, unwrapped first if
%% it arrives as a one-element list, must satisfy xmerl_lib:is_char/1;
%% otherwise {unexpected_char, Char} is reported through ?fatal.
valid_Char(dtd, AT, Ch, S) when AT == 'NMTOKEN'; AT == 'NMTOKENS' ->
    vc_Valid_Char(AT, Ch, S);
valid_Char(_Validation, _AT, Ch, S) ->
    Char = case Ch of
               [Single] -> Single;
               _ -> Ch
           end,
    case xmerl_lib:is_char(Char) of
        true ->
            ok;
        false ->
            ?fatal({unexpected_char,Char}, S)
    end.
2522
2523
2524
2525%%%%%%% [43] content
2526
%% Entry point for scanning the content of element Name. Starts the
%% worker with position 1, an empty content accumulator and no pending
%% markup delimiter.
scan_content(T, S, Name, Attrs, Space, Lang, Parents, NS) ->
    FirstPos = 1,
    EmptyAcc = [],
    NoMarkupDel = [],
    scan_content(T, S, FirstPos, Name, Attrs, Space,
                 Lang, Parents, NS, EmptyAcc, NoMarkupDel).
2530
%% scan_content(Str, S, Pos, Name, Attrs, Space, Lang, Parents, NS,
%%              Acc, MarkupDel) -> {Content, Rest, S1}
%%
%% Scans the content of the element called Name until its end tag (or,
%% inside an entity, until the end of the entity text). Acc collects the
%% content items in reverse order and Pos is the position given to the
%% next item. MarkupDel is [] except directly after a reference whose
%% expansion satisfied markup_delimeter/1; in that case it holds the
%% expansion so that scan_char_data/4 can take the delimiter as data.
%% Clause order matters: the clauses whose last argument is [] are only
%% tried when no markup delimiter is pending.

%% The buffer holds nothing but "<": ask the continuation fun for more
%% input and resume with the "<" put back in front of it.
scan_content("<", S= #xmerl_scanner{continuation_fun = F},
            Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    ?dbg("trailing < detected~n", []),
    F(fun(MoreBytes, S1) -> scan_content("<" ++ MoreBytes, S1,
					 Pos, Name, Attrs,
					 Space, Lang, Parents, NS, Acc,[]) end,
      fatal_fun(unexpected_end),
      S);
%% End of input while scanning an external entity: return what was
%% accumulated.
scan_content([], S=#xmerl_scanner{environment={external,{entity,_}}},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) ->
    {lists:reverse(Acc),[],S};
%% End of input while scanning an internal parsed entity: likewise.
scan_content([], S=#xmerl_scanner{environment=internal_parsed_entity},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) ->
    {lists:reverse(Acc),[],S};
%% End of input in any other environment: ask the continuation fun for
%% more input.
scan_content([], S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_content(MoreBytes, S1,
					 Pos, Name, Attrs,
					 Space, Lang, Parents, NS, Acc,[]) end,
      fatal_fun(unexpected_end),
      S);
%% End tag: the scanned name must equal Name, then optional whitespace
%% and a closing ">" are expected. Returns the content in document order.
scan_content("</" ++ T, S0, _Pos, Name, _Attrs, _Space, _Lang,
	     _Parents, _NS, Acc,[]) ->
    ?bump_col(2),
    {ETagName, _NamespaceInfo, T1, S1} = scan_name(T, S),
    if ETagName == Name ->
            ok;
       true ->
            ?fatal({endtag_does_not_match, {was,ETagName,should_have_been, Name}}, S)
    end,
    ?strip2,
    case T2 of
        ">" ++ T3 ->
            {lists:reverse(Acc), T3, S2};
        _ ->
	    ?fatal({error,{unexpected_end_of_STag}},S)
    end;
%% Reference inside an external entity: the entity value is scanned but
%% the result is dropped.
scan_content([$&|_T]=Str,
	     #xmerl_scanner{environment={external,{entity,EName}}} = S0,
	     Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    {_EntV,T1,S1}=scan_entity_value(Str,S0 ,[],EName,general),
    %% This is a problem: all referenced entities in the external entity must be checked for recursion, thus parse the content but skip the result.
    scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
%% Reference inside an internal parsed entity: scanned, expansion dropped.
scan_content("&"++T,
	     #xmerl_scanner{environment=internal_parsed_entity} = S,
	     Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    {_, T1, S1} = scan_reference(T, S),
    scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
%% Reference in ordinary content: the expansion is pushed back onto the
%% input. If it is a markup delimiter it is also passed on as MarkupDel;
%% otherwise it is converted with string_to_char_set/2 first.
scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
	true -> scan_content(ExpRef++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,ExpRef);
	_ ->
	    scan_content(string_to_char_set(S1#xmerl_scanner.encoding,ExpRef)++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,[])
    end;
%% Comment: always scanned; handed to the acc_fun only when the
%% `comments' field of the scanner state is true. The acc_fun may return
%% {Acc, S} (position is incremented) or {Acc, Pos, S}.
scan_content("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F, comments=CF}, Pos, Name, Attrs, Space,
	     Lang, Parents, NS, Acc,[]) ->
    ?bump_col(4),
    {C, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
    case CF of
	true ->
	    {Acc2, Pos2, S3} =
		case F(C, Acc, S1) of
		    {Acc1, S2} ->
			{Acc1, Pos + 1, S2};
		    {Acc1, Pos1, S2} ->
			{Acc1, Pos1, S2}
		end,
	    scan_content(T1, S3, Pos2, Name, Attrs, Space, Lang, Parents, NS, Acc2,[]);
	false ->
	    scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[])
    end;
%% Any other markup starting with "<": scanned by scan_content_markup/9
%% and accumulated through the acc_fun taken from the updated state.
scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
    ?bump_col(1),
    {Markup, T1, S1} =
        scan_content_markup(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS),
    AccF = S1#xmerl_scanner.acc_fun,
    {NewAcc, NewPos, NewS} = case AccF(Markup, Acc, S1) of
				 {Acc2, S2} ->
				     {Acc2, Pos+1, S2};
				 {Acc2, Pos2, S2} ->
				     {Acc2, Pos2, S2}
			     end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
		 Parents, NS, NewAcc,[]);
%% Any other character inside an external entity is skipped.
scan_content([_H|T], S= #xmerl_scanner{environment={external,{entity,_}}},
 	     Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    %% Guess we have to scan the content to find any internal entity
    %% references.
    scan_content(T,S,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
%% Character data: emit a `started' event, scan the text, run the
%% hook_fun on the resulting #xmlText{}, emit an `ended' event carrying
%% the hook result, and accumulate that result through the acc_fun.
scan_content(T, S=#xmerl_scanner{acc_fun = F,
				 event_fun = Event,
				 hook_fun=Hook,
				 line = _L},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,MarkupDel) ->
    Text0 = #xmlText{pos = Pos,
                     parents = Parents},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = S#xmerl_scanner.line,
                                               data = Text0}, S),
    {Data, T1, S2} =  scan_char_data(T, S1, Space,MarkupDel),
    Text = Text0#xmlText{value = Data},
    {Ret,S2b} = Hook(Text,S2),
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S2b#xmerl_scanner.line,
                                               data = Ret}, S2b),
    {NewAcc, NewPos, NewS} = case F(Ret, Acc, S3) of
				 {Acc4, S4} ->
				     {Acc4, Pos+1, S4};
				 {Acc4, Pos4, S4} ->
				     {Acc4, Pos4, S4}
			     end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
		 Parents, NS, NewAcc,[]).
2647
2648
%% scan_content_markup/9: decides what kind of markup follows a "<" in
%% element content (scan_content/11 strips the "<" before calling): a
%% CDATA section, a processing instruction, or otherwise an element.
%% With an empty buffer the continuation fun is asked for more input.
scan_content_markup([], #xmerl_scanner{continuation_fun = ContF} = S,
                    Pos, Name, Attrs, Space, Lang, Parents, NS) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_content_markup(MoreBytes, S1, Pos, Name, Attrs,
                                         Space, Lang, Parents, NS)
             end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs,
                    _Space, _Lang, Parents, _NS) ->
    ?bump_col(8),
    scan_cdata(T, S, Pos, Parents);
scan_content_markup([$?|T], S0, Pos, _Name, _Attrs,
                    _Space, _Lang, Parents, _NS) ->
    ?bump_col(1),
    scan_pi(T, S, Pos, Parents);
scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) ->
    scan_element(T, S, Pos, Space, Lang, Parents, NS).
2666
%% scan_char_data/4: starts character-data scanning with an empty
%% accumulator.
scan_char_data(T, S, Space, MUD) ->
    EmptyAcc = [],
    scan_char_data(T, S, Space, MUD, EmptyAcc).
2669
2670%%%%%%% [14] CharData
2671
%% scan_char_data(Str, S, Space, MUD, Acc) -> {Data, Rest, S1}
%%
%% Collects character data into Acc (reverse order) until a "&" or "<"
%% that starts markup, or until the input of an entity runs out, and
%% returns the data in document order together with the unconsumed
%% input. MUD is the pending markup delimiter: when it is "&" or "<" and
%% the next character is that same character, the character is taken as
%% data and MUD is cleared. The string "]]>" in character data is a
%% fatal error. Whitespace is handled by accumulate_whitespace/4, or by
%% fast_accumulate_whitespace/3 for a leading newline when Space is
%% `preserve' and nothing has been accumulated yet.
scan_char_data([], S=#xmerl_scanner{environment={external,{entity,_}}},
               _Space, _MUD, Acc) ->
    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{environment=internal_parsed_entity},
               _Space, _MUD, Acc) ->
    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{continuation_fun = F}, Space, MUD, Acc) ->
    ?dbg("cont()...~n", []),
    %% MUD is carried over into the resumed scan, so it is a used variable.
    F(fun(MoreBytes, S1) -> scan_char_data(MoreBytes, S1, Space, MUD, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_data([$&|T], S, Space, "&", Acc) ->
    %% Pending "&" delimiter: this ampersand is data, not a reference.
    scan_char_data(T, S, Space, [], [$&|Acc]);
scan_char_data(T=[$&|_], S, _Space, _MUD, Acc) ->
    {lists:reverse(Acc), T, S};
scan_char_data("]]>" ++ _T, S, _Space, _MUD, _Acc) ->
    %% See Section 2.4: Especially:
    %% "The right angle bracket (>) MAY be represented using the string "&gt;",
    %% and MUST, for compatibility, be escaped using either "&gt;" or a
    %% character reference when it appears in the string "]]>" in content, when
    %% that string is not marking the end of a CDATA section.
    ?fatal(unexpected_cdata_end, S);
scan_char_data([$<|T], S, Space, "<", Acc) ->
    %% Pending "<" delimiter: this character is data, not markup.
    scan_char_data(T, S, Space, [], [$<|Acc]);
scan_char_data(T = [$<|_], S, _Space, _MUD, Acc) ->
    {lists:reverse(Acc), T, S};
scan_char_data(T = [H|R], S, Space, MUD, Acc) when ?whitespace(H) ->
    if
        MUD =:= [], Acc =:= [], H =:= $\n, Space =:= preserve ->
            case fast_accumulate_whitespace(R, S, T) of
                {done, Reply} ->
                    Reply;
                {NewAcc, T1, S1} ->
                    scan_char_data(T1, S1, Space, MUD, NewAcc)
            end;
        true ->
            {NewAcc, T1, S1} = accumulate_whitespace(T, S, Space, Acc),
            scan_char_data(T1, S1, Space, MUD, NewAcc)
    end;
scan_char_data([H1,H2|_T], S, _Space, _MUD, _Acc) when ?non_character(H1,H2) ->
    ?fatal({error,{not_allowed_to_use_Unicode_noncharacters}},S);
%% A second "]]>" clause used to follow here. It could never match,
%% because the "]]>" clause above accepts exactly the same arguments,
%% so it has been removed.
scan_char_data(Str, S0, Space, MUD, Acc) ->
    ?bump_col(1),
    {Ch,T} = wfc_legal_char(Str,S),
    scan_char_data(T,S,Space,MUD,[Ch|Acc]).
2723
2724
2725
2726%%%%%%% [18]-[21] CDATA
2727
%% scan_cdata/4: starts scanning a CDATA section with an empty
%% accumulator.
scan_cdata(Str, S, Pos, Parents) ->
    EmptyAcc = [],
    scan_cdata(Str, S, Pos, Parents, EmptyAcc).
2730
2731
%% scan_cdata/5: accumulates the characters of a CDATA section up to the
%% closing "]]>" and returns {#xmlText{type = cdata}, Rest, S}. Every
%% character must satisfy xmerl_lib:is_char/1. With an empty buffer the
%% continuation fun is asked for more input.
scan_cdata([], #xmerl_scanner{continuation_fun = ContF} = S,
           Pos, Parents, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_cdata(MoreBytes, S1, Pos, Parents, Acc)
             end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_cdata("]]>" ++ T, S0, Pos, Parents, Acc) ->
    ?bump_col(3),
    CData = #xmlText{type = cdata,
                     value = lists:reverse(Acc),
                     pos = Pos,
                     parents = Parents},
    {CData, T, S};
scan_cdata(Str, S0, Pos, Parents, Acc) ->
    Encoding = S0#xmerl_scanner.encoding,
    {Ch, T} = to_ucs(Encoding, Str),
    case xmerl_lib:is_char(Ch) of
        false ->
            ?fatal({unexpected_char,Ch}, S0);
        true ->
            ?bump_col(1),
            scan_cdata(T, S, Pos, Parents, [Ch|Acc])
    end.
2752
2753
2754%%%%%%% [67] Reference
2755%% returns a three tuple {Result,RestBuf,State}
2756
%% scan_reference/2: scans what follows the "&" of a reference. "#x"
%% introduces a hexadecimal and "#" a decimal character reference;
%% either must be followed by at least one character that is not ";",
%% otherwise invalid_char_ref is reported. Anything else is scanned as
%% an entity reference, and any exit raised while doing so is reported
%% as error_scanning_entity_ref.
scan_reference([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_reference(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_reference("#x" ++ T, S0) ->
    %% [66] CharRef
    ?bump_col(1),
    case T of
        [First|_] when First /= $; ->
            scan_char_ref_hex(T, S, 0);
        _ ->
            ?fatal(invalid_char_ref, S)
    end;
scan_reference("#" ++ T, S0) ->
    %% [66] CharRef
    ?bump_col(1),
    case T of
        [First|_] when First /= $; ->
            scan_char_ref_dec(T, S, []);
        _ ->
            ?fatal(invalid_char_ref, S)
    end;
scan_reference(T, S) ->
    Outcome = (catch scan_entity_ref(T, S)),
    case Outcome of
        {'EXIT', _} ->
            ?fatal(error_scanning_entity_ref,S);
        _ ->
            Outcome
    end.
2785
2786
2787%% Chapter 4.4.2: ... the replacement text of entities used to escape
2788%% markup delimiters (the entities amp, lt, gt, apos, quot) is always treated
2789%% as data. (The string "AT&amp;T;" expands to "AT&T;" and the remaining
2790%% ampersand is not recognized as an entity-reference delimiter.)"
2791%%
2792%% How to achieve this? My current approach is to insert the *strings* "&",
2793%% "<", ">", "'", and "\"" instead of the characters. The processor will
2794%% ignore them when performing multiple expansions. This means, for now, that
2795%% the character data output by the processor is (1-2 levels) deep.
2796%% At some suitable point, we should flatten these, so that application-level
2797%% processors should not have to be aware of this detail.
2798
%% scan_entity_ref/2: scans an entity reference whose "&" has been
%% consumed. The five predefined entities are answered directly with
%% their one-character replacement string; any other name must be
%% followed by ";" and is expanded with expand_reference/2.
%% Returns {Expansion, Rest, S1}.
scan_entity_ref([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_entity_ref(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_entity_ref("amp;" ++ Rest, S0) ->
    ?bump_col(4),
    {"&", Rest, S};
scan_entity_ref("lt;" ++ Rest, S0) ->
    ?bump_col(3),
    {"<", Rest, S};
scan_entity_ref("gt;" ++ Rest, S0) ->
    ?bump_col(3),
    {">", Rest, S};
scan_entity_ref("apos;" ++ Rest, S0) ->
    ?bump_col(5),
    {"'", Rest, S};
scan_entity_ref("quot;" ++ Rest, S0) ->
    ?bump_col(5),
    {"\"", Rest, S};
scan_entity_ref(Input, State) ->
    {Name, _NamespaceInfo, AfterName, State1} = scan_name(Input, State),
    Rest = scan_mandatory(";", AfterName, 1, State1,
                          expected_entity_reference_semicolon),
    {expand_reference(Name, State1), Rest, State1}.
2826
2827
2828%%%%%%% [69] PEReference
2829
%% scan_pe_reference/2: scans the name of a parameter-entity reference
%% and the mandatory ";" after it. Returns {Name, Rest, S1}, where S1
%% has its column advanced by one.
scan_pe_reference(T, S) ->
    {Name, _NamespaceInfo, AfterName, S1} = scan_name(T, S),
    Rest = scan_mandatory(";", AfterName, 1, S1,
                          expected_parsed_entity_reference_semicolon),
    NextCol = S1#xmerl_scanner.col + 1,
    {Name, Rest, S1#xmerl_scanner{col = NextCol}}.
2835
%% expand_pe_reference/3: looks up parameter entity Name through the
%% rules_read_fun. An unknown entity or an {error,_} result is fatal. A
%% tuple result is returned unchanged. Any other result is returned
%% as is when WS is `in_literal', and otherwise padded with one space on
%% each side.
expand_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S, WS) ->
    case Read(parameter_entity, Name, S) of
        undefined ->
            ?fatal({unknown_parameter_entity, Name}, S); % WFC or VC failure
        {error,_Reason} = Err ->
            ?fatal(Err,S);
        Tuple when is_tuple(Tuple) ->
            Tuple;
        Replacement ->
            case WS of
                in_literal ->
                    Replacement;
                _ ->
                    " " ++ Replacement ++ " "
            end
    end.
2850
2851% Currently unused
2852%
2853% expand_external_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
2854%     case Read(parameter_entity, Name, S) of
2855% 	undefined ->
2856% 	    ?fatal({unknown_parameter_entity, Name}, S);
2857% 	Result ->
2858% 	    fetch_DTD(Result,S)
2859%     end.
2860
2861
2862%%%%%%% [68] EntityReference
2863
%% expand_reference/2: returns the replacement for entity Name. While an
%% external entity or an internal parsed entity is being scanned, no
%% lookup is made and the name itself is returned as a string. Otherwise
%% the entity is read through the rules_read_fun; an unknown entity or
%% an external entity whose value is {error,enoent} is fatal. For any
%% other entity the well-formedness checks are run (on the value
%% converted by string_to_char_set/2) and the stored value is returned.
expand_reference(Name, #xmerl_scanner{environment={external,{entity,_}}}) ->
    atom_to_list(Name);
expand_reference(Name, #xmerl_scanner{environment=internal_parsed_entity}) ->
    atom_to_list(Name);
expand_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
    case Read(entity, Name, S) of
        undefined ->
            ?fatal({unknown_entity_ref, Name}, S);
        {_,external,{error,enoent}} ->
            ?fatal({error,{entity_target_not_found,{error,enoent},Name}},S);
        {DefEnv,EntType,Value} ->
            wfc_Entity_Declared(DefEnv,S,Name),
            Encoding = S#xmerl_scanner.encoding,
            Converted = string_to_char_set(Encoding, Value),
            wfc_Internal_parsed_entity(EntType,Converted,S),
            Value
    end.
2880
2881
2882%%%%%%% [66] CharRef
2883
%% scan_char_ref_dec(Str, S, Acc) -> {[Char], Rest, S1}
%%
%% Scans the digits of a decimal character reference up to and including
%% the terminating ";" (scan_reference/2 has already consumed "#"). Acc
%% holds the digits seen so far in reverse order. The referenced code
%% point must pass wfc_legal_char/2. A character that is neither a
%% decimal digit nor ";" makes the reference malformed and is reported
%% as invalid_char_ref, the same reason scan_reference/2 uses.
scan_char_ref_dec([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_ref_dec(MoreBytes, S1, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_ref_dec([H|T], S0, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    scan_char_ref_dec(T, S, [H|Acc]);
scan_char_ref_dec(";" ++ T, S0, Acc) ->
    ?bump_col(1),
    Ref = list_to_integer(lists:reverse(Acc)),
    {Ch,_} = wfc_legal_char(Ref,S),
    {[Ch], T, S}; %% changed return value from [[Ref]]
scan_char_ref_dec(_Other, S, _Acc) ->
    %% Without this clause input such as "&#12a;" crashed the scanner
    %% with function_clause instead of producing a scanner error.
    ?fatal(invalid_char_ref, S).
2897
2898
%% scan_char_ref_hex(Str, S, Acc) -> {[Char], Rest, S1}
%%
%% Scans the digits of a hexadecimal character reference up to and
%% including the terminating ";" (scan_reference/2 has already consumed
%% "#x"). Acc is the integer value built so far, shifted four bits per
%% digit. The resulting code point must pass wfc_legal_char/2. A
%% character that is neither a hexadecimal digit nor ";" makes the
%% reference malformed and is reported as invalid_char_ref, the same
%% reason scan_reference/2 uses.
scan_char_ref_hex([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_ref_hex(MoreBytes, S1, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_char_ref_hex([H|T], S0, Acc) when H >= $0, H =< $9 ->
    ?bump_col(1),
    Dec = H - $0,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex([H|T], S0, Acc) when H >= $a, H =< $f ->
    ?bump_col(1),
    Dec = (H - $a) + 10,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex([H|T], S0, Acc) when H >= $A, H =< $F ->
    ?bump_col(1),
    Dec = (H - $A) + 10,
    scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4)));
scan_char_ref_hex(";" ++ T, S0, Acc) ->
    ?bump_col(1),
    {Ch,_} = wfc_legal_char(Acc,S),
    {[Ch], T, S}; %% changed return value from [[Acc]]
scan_char_ref_hex(_Other, S, _Acc) ->
    %% Without this clause input such as "&#x1G;" crashed the scanner
    %% with function_clause instead of producing a scanner error.
    ?fatal(invalid_char_ref, S).
2920
2921
2922
2923%%%%%%% [25] Eq
2924%%% Eq    ::=    S? '=' S?
%% scan_eq/2: consumes optional whitespace, a mandatory "=", and
%% optional whitespace again. Returns {Rest, S1}; a missing "=" is
%% reported as assignment_expected.
scan_eq(T, S) ->
    ?strip1,
    case T1 of
        "=" ++ T2 ->
            Col = S1#xmerl_scanner.col,
            S2 = S1#xmerl_scanner{col = Col + 1},
            ?strip3,
            {T3, S3};
        _ ->
            ?fatal(assignment_expected,S)
    end.
2935
2936
2937%% scan_name/2
2938%%
2939%% We perform some checks here to make sure that the names conform to
2940%% the "Namespaces in XML" specification. This is an option.
2941%%
2942%% Qualified Name:
2943%% [6]      QName ::= (Prefix ':')? LocalPart
2944%% [7]     Prefix ::= NCName
2945%% [8]  LocalPart ::= NCName
2946%% [4]     NCName ::= (Letter | '_') (NCNameChar)*
2947%% [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
2948%%                    | CombiningChar | Extender
2949
2950
2951%% The effect of XML Names (namespace) conformance is that:
2952%% - All element types and attribute names contain either zero or one colon
2953%% - No entity names, PI targets, or notation names contain any colons.
2954%%
2955%% scan_name_no_colons/2 will ensure that the name contains no colons iff
2956%% the scanner has been told to be namespace conformant. Otherwise, it will
2957%% behave exactly like scan_name/2.
2958%%
%% scan_name_no_colons/2: like scan_name/2, but when the scanner is
%% namespace conformant the name is scanned with namespace_conformant
%% temporarily set to `no_colons'; the flag is set back to `true' in the
%% returned state.
scan_name_no_colons(Str, S) ->
    case S#xmerl_scanner.namespace_conformant of
        false ->
            scan_name(Str, S);
        true ->
            Restricted = S#xmerl_scanner{namespace_conformant = no_colons},
            {Target, NSI, T1, S1} = scan_name(Str, Restricted),
            {Target, NSI, T1, S1#xmerl_scanner{namespace_conformant = true}}
    end.
2969
2970
2971
2972%% [5] Name ::= (Letter | '_' | ':') (NameChar)*
%% scan_name/2: scans a Name and returns
%% {Name, NamespaceInfo, Rest, S1}. The first character is checked here
%% and the remaining characters are left to scan_nmtoken/4. A leading
%% ":" is accepted only when the scanner is not namespace conformant. A
%% leading "%" is fatal in the prolog environment and is expanded as a
%% parameter-entity reference in an external environment.
scan_name([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_name(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_name([$:|T] = Str, #xmerl_scanner{namespace_conformant = NSC} = S0) ->
    case NSC of
        false ->
            ?bump_col(1),
            scan_nmtoken(T, S, [$:], NSC);
        _ ->
            %% In order to conform with the "Namespaces in XML" spec,
            %% we cannot allow names to begin with ":". This covers
            %% both `true' and `no_colons'.
            ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0)
    end;
scan_name([$_|T], #xmerl_scanner{namespace_conformant = NSC} = S0) ->
    ?bump_col(1),
    scan_nmtoken(T, S, [$_], NSC);
scan_name([$%|_T], #xmerl_scanner{environment=prolog} = S) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_name([$%|T], #xmerl_scanner{environment={external,_}} = S0) ->
    %% parameter entity that expands to a name
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(Expansion ++ T1, S1),
    scan_name(T2, S2);
scan_name(Str, #xmerl_scanner{namespace_conformant = NSC} = S0) ->
    Encoding = S0#xmerl_scanner.encoding,
    {Ch, T} = to_ucs(Encoding, Str),
    case xmerl_lib:is_letter(Ch) of
        false ->
            ?fatal({invalid_name, lists:sublist(Str, 1, 6)}, S0);
        true ->
            ?bump_col(1),
            scan_nmtoken(T, S, [Ch], NSC)
    end;
scan_name(Str, S) ->
    ?fatal({invalid_name, Str}, S).
3012
3013
3014
3015
3016
3017
%% scan_nmtoken/4: continues scanning a name whose first character(s)
%% are already in Acc. Starts scan_nmtoken/7 with an empty prefix, Acc
%% as the local part, and the Latin-1 flag computed from the head of
%% Acc.
scan_nmtoken(Str, S, Acc, NSC) ->
    NoPrefix = [],
    Latin1 = isLatin1(hd(Acc), true),
    scan_nmtoken(Str, S, Acc, NoPrefix, Acc, NSC, Latin1).
3020
3021%% scan_nmtoken/2
3022%% [7] NmToken ::= (NameChar)+
%% scan_nmtoken/2: scans a name token. The first character must satisfy
%% xmerl_lib:is_namechar/1; the rest is handled by scan_nmtoken/7 with
%% namespace conformance switched off. In an external environment a
%% leading "%" is expanded as a parameter-entity reference first.
scan_nmtoken([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_nmtoken([$%|T], #xmerl_scanner{environment={external,_}} = S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(Expansion ++ T1, S1),
    scan_nmtoken(T2, S2);
scan_nmtoken(Str, S) ->
    {Ch, T} = to_ucs(S#xmerl_scanner.encoding, Str),
    case xmerl_lib:is_namechar(Ch) of
        false ->
            ?fatal({invalid_nmtoken, lists:sublist(Str, 1, 6)}, S);
        true ->
            Advanced = S#xmerl_scanner{col = S#xmerl_scanner.col + 1},
            Latin1 = isLatin1(Ch, true),
            %% Arguments: Acc, Prefix, Local, NamespaceConformant, IsLatin1
            scan_nmtoken(T, Advanced, [Ch], [], [Ch], false, Latin1)
    end.
3044
3045
%% scan_nmtoken(Str, S, Acc, Prefix, Local, NSC, IsLatin1)
%%     -> {Name, NamespaceInfo, Rest, S1}
%%
%% Accumulates name characters. Acc is the whole name so far and Local
%% the part after the colon (or the whole name when no colon has been
%% seen), both in reverse order; Prefix is the part before the colon, in
%% document order, or [] when no colon has been seen. NamespaceInfo is
%% built by namespace_info/2. IsLatin1 stays true as long as no
%% character above 255 has been seen; the name is returned as an atom
%% when it is true and as a list of characters otherwise.
%%
%% End of input: ask the continuation fun for more. The second fun given
%% to it returns the name accumulated so far as an atom.
scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F},
	     Acc, Prefix, Local, NSC,IsLatin1) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes,S1,Acc,Prefix,Local,NSC,IsLatin1) end,
      fun(S1) -> {list_to_atom(lists:reverse(Acc)),
		  namespace_info(Prefix, Local),[],S1} end,
      S);
%% whitespace marks the end of a name
%% (this clause only applies while IsLatin1 is true; otherwise the
%% whitespace reaches the last clause)
scan_nmtoken(Str = [H|_], S, Acc, Prefix, Local, _NSC,true) when ?whitespace(H) ->
    %% we don't strip here because the occurrence of whitespace may be an error
    %% e.g. <!ELEMENT spec (front, body, back ?)>
    NmString = lists:reverse(Acc),
    {list_to_atom(NmString), namespace_info(Prefix, Local), Str, S};
%% A colon is never allowed when colons are forbidden (no_colons).
scan_nmtoken(Str = [$:|_], S, Acc, [], _Local, no_colons,_IsLatin1) ->
    ?fatal({invalid_NCName,
	    lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);
%% First colon: what has been scanned so far becomes the prefix and the
%% local part starts over.
scan_nmtoken([$:|T], S0, Acc, [], Local, NSC, IsLatin1) ->
    ?bump_col(1),
    scan_nmtoken(T, S, [$:|Acc], lists:reverse(Local), [], NSC,IsLatin1);
scan_nmtoken(Str = [$:|_T], S, Acc, _Prefix, _Local, _NSC = true,_IsLatin1) ->
    %% non-empty Prefix means that we've encountered a ":" already.
    %% Conformity with "Namespaces in XML" requires
    %% at most one colon in a name
    ?fatal({invalid_NCName,
	    lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);

%% non-namechar also marks the end of a name
%% NOTE(review): the column is bumped before the character is
%% classified, so the state returned when the name ends here also
%% carries that bump although the character is not consumed - confirm
%% that this is intended.
scan_nmtoken(Str, S0, Acc, Prefix, Local, NSC,IsLatin1) ->
    ?bump_col(1),
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    case {xmerl_lib:is_namechar(Ch),IsLatin1} of
	{true,_} ->
	    scan_nmtoken(T, S, [Ch|Acc], Prefix, [Ch|Local], NSC,isLatin1(Ch,IsLatin1));
	{_,true} ->
	    NmStr = lists:reverse(Acc),
	    {list_to_atom(NmStr), namespace_info(Prefix, Local), Str, S};
	_ ->
	    {lists:reverse(Acc), namespace_info(Prefix, Local), Str, S}
    end.
3085
%% namespace_info/2: builds the namespace part of a scanned name. With
%% an empty prefix the result is []; otherwise it is {Prefix, Local},
%% where Local (accumulated in reverse) is put back in document order.
namespace_info(Prefix, Local) ->
    case Prefix of
        [] ->
            [];
        _ ->
            {Prefix, lists:reverse(Local)}
    end.
3090
%% isLatin1/2: running "all characters so far are at most 255" flag.
%% Once the flag is false it stays false; otherwise it is false exactly
%% when Ch compares greater than 255.
isLatin1(Ch, SoFar) ->
    SoFar =/= false andalso Ch =< 255.
3097
3098%%%%%%% [11] SystemLiteral
3099
%% scan_system_literal/2: expects an opening double or single quote and
%% scans the literal up to the matching quote with
%% scan_system_literal/4.
scan_system_literal([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_system_literal(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_system_literal([Quote|T], S) when Quote =:= $"; Quote =:= $' ->
    scan_system_literal(T, S, Quote, []).
3109
3110
%% scan_system_literal/4: accumulates characters until the closing
%% Delimiter and returns {Literal, Rest, S1}. A "#" inside the literal
%% is reported as fragment_identifier_in_system_literal.
scan_system_literal([], #xmerl_scanner{continuation_fun = ContF} = S,
                    Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_system_literal(MoreBytes, S1, Delimiter, Acc)
             end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_system_literal([Delimiter|T], S, Delimiter, Acc) ->
    %% Closing quote: the column moves past it.
    NextCol = S#xmerl_scanner.col + 1,
    {lists:reverse(Acc), T, S#xmerl_scanner{col = NextCol}};
scan_system_literal([$#|_R], S, _Delimiter, _Acc) ->
    %% actually not a fatal error
    ?fatal(fragment_identifier_in_system_literal,S);
scan_system_literal(Str, S, Delimiter, Acc) ->
    {Ch, T} = to_ucs(S#xmerl_scanner.encoding, Str),
    Advanced = S#xmerl_scanner{col = S#xmerl_scanner.col + 1},
    scan_system_literal(T, Advanced, Delimiter, [Ch|Acc]).
3126
3127
3128%%%%%%% [12] PubidLiteral
3129
%% scan_pubid_literal/2: expects an opening double or single quote and
%% scans the public identifier up to the matching quote with
%% scan_pubid_literal/4. Any other first character is reported as
%% invalid_pubid_char.
scan_pubid_literal([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_pubid_literal([Quote|T], S) when Quote == $"; Quote == $' ->
    Advanced = S#xmerl_scanner{col = S#xmerl_scanner.col + 1},
    scan_pubid_literal(T, Advanced, Quote, []);
scan_pubid_literal([Other|_T], S) ->
    ?fatal({invalid_pubid_char, Other}, S).
3139
3140
%% scan_pubid_literal/4: accumulates the characters of a public
%% identifier until the closing Delimiter and returns
%% {Literal, Rest, S1}. A run of whitespace is handed to pub_id_strip/2
%% and recorded as a single space (16#20); every other character must
%% satisfy is_pubid_char/1.
scan_pubid_literal([], #xmerl_scanner{continuation_fun = ContF} = S,
                   Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) ->
                     scan_pubid_literal(MoreBytes, S1, Delimiter, Acc)
             end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_pubid_literal([Delimiter|T], S, Delimiter, Acc) ->
    NextCol = S#xmerl_scanner.col + 1,
    {lists:reverse(Acc), T, S#xmerl_scanner{col = NextCol}};
scan_pubid_literal([H|_] = Str, S, Delimiter, Acc) when ?whitespace(H) ->
    %% Before matching public identifiers, all whitespace must be normalized,
    %% so we do that here
    {_, Rest, S1} = pub_id_strip(Str, S),
    scan_pubid_literal(Rest, S1, Delimiter, [16#20|Acc]);
scan_pubid_literal([H|T], S, Delimiter, Acc) ->
    case is_pubid_char(H) of
        false ->
            ?fatal({invalid_pubid_char, H}, S);
        true ->
            Advanced = S#xmerl_scanner{col = S#xmerl_scanner.col + 1},
            scan_pubid_literal(T, Advanced, Delimiter, [H|Acc])
    end.
3163
3164%% We do not match whitespace here, even though they're allowed in public
3165%% identifiers. This is because we normalize this whitespace as we scan
3166%% (see above in scan_pubid_literal())
3167%%
%% is_pubid_char/1: true for ASCII letters, decimal digits and the
%% punctuation characters listed below; false otherwise. Whitespace is
%% deliberately not accepted here.
is_pubid_char(X) ->
    IsLower = X >= $a andalso X =< $z,
    IsUpper = X >= $A andalso X =< $Z,
    IsDigit = X >= $0 andalso X =< $9,
    IsLower orelse IsUpper orelse IsDigit
        orelse lists:member(X, "-'()+,./:=?;!*#@$_%").
3173
3174
3175%%%%%%% [46] contentspec
3176
%% scan_contentspec/2: scans the content specification of an element
%% declaration. Returns {empty, Rest, S} for "EMPTY", {any, Rest, S} for
%% "ANY", and the result of scan_elem_content/2 for a parenthesized
%% model. A "%" is fatal in the prolog environment and is expanded as a
%% parameter-entity reference otherwise. Anything else is reported as
%% unexpected_character.
scan_contentspec([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_contentspec(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_contentspec("EMPTY" ++ T, S0) ->
    ?bump_col(5),
    {empty, T, S};
scan_contentspec("ANY" ++ T, S0) ->
    ?bump_col(3),
    {any, T, S};
scan_contentspec([$%|_T], #xmerl_scanner{environment=prolog} = S) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_contentspec([$%|T], S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(Expansion ++ T1, S1),
    scan_contentspec(T2, S2);
scan_contentspec([$(|T], S0) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content(T1, S1);
scan_contentspec(_Str, S) ->
    ?fatal(unexpected_character,S).
3202
3203
3204%%%%%%% [47] children
3205%%%%%%% [51] Mixed
3206
%% scan_elem_content/2: starts scanning a content model with context
%% `children', mode `unknown' and an empty accumulator.
scan_elem_content(T, S) ->
    Context = children,
    Mode = unknown,
    EmptyAcc = [],
    scan_elem_content(T, S, Context, Mode, EmptyAcc).
3209
%% scan_elem_content(Str, S, Context, Mode, Acc) -> {Model, Rest, S1}
%%
%% Scans one parenthesized group of a content model; the opening "(" has
%% been consumed by the caller. Context is `children' at the top level,
%% becomes `mixed' once "#PCDATA" has been seen, and is `not_mixed' for
%% nested groups. Mode is `unknown' until the first "," (seq) or "|"
%% (choice) is seen. Acc holds the group members in reverse order.
%% The returned Model is format_elem_content({Occurrence, {Mode, Members}}).
scan_elem_content([], S=#xmerl_scanner{continuation_fun = F},
		  Context, Mode, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes,S1) -> scan_elem_content(MoreBytes,S1,Context,Mode,Acc) end,
      fatal_fun(unexpected_end),
      S);
%% End of the group. A single member with unknown mode is treated as a
%% sequence; the case below has no clause for unknown mode with any
%% other number of members, so such input (e.g. an empty group) raises
%% case_clause. The occurrence indicator after ")" is checked against
%% the rules for mixed content.
scan_elem_content(")" ++ T, S0, Context, Mode0, Acc0) ->
    ?bump_col(1),
    {Mode, Acc} = case {Mode0, Acc0} of
		      {unknown, [_X]} ->
			  {seq, Acc0};
		      {M, _L} when M == seq; M == choice ->
			  {Mode0, lists:reverse(Acc0)}
		  end,
    {Occurrence, T1, S1} = scan_occurrence(T, S),
    vc_No_Duplicate_Types(S,Context,Acc),
    case {Occurrence, Context,Acc} of
	{once, mixed,['#PCDATA']} -> ok; % It is not ok when there are
                                         % more names than '#PCDATA'
                                         % and no '*'.
	{'*', mixed,_} -> ok;
	{Other, mixed,_} ->
	    ?fatal({illegal_for_mixed_content, Other}, S1);
	_ ->
	    ok
    end,
    ?strip2,
    {format_elem_content({Occurrence, {Mode, Acc}}), T2, S2};
%% "#PCDATA" is not allowed inside a nested group ...
scan_elem_content("#PCDATA" ++ _T, S, not_mixed, _Mode, _Acc) ->
    ?fatal({error,{extra_set_of_parenthesis}},S);
%% ... and must be the first member of its group.
scan_elem_content("#PCDATA" ++ _T, S, _Cont, Mode, Acc)
  when Mode==choice;Mode==seq;Acc/=[] ->
    ?fatal({error,{invalid_format_of_mixed_content}},S);
%% Leading "#PCDATA": the group becomes mixed content.
scan_elem_content("#PCDATA" ++ T, S0, _Context, Mode, Acc) ->
    ?bump_col(7),
    ?strip1,
    scan_elem_content(T1, S1, mixed, Mode, ['#PCDATA'|Acc]);
%% Separators: "," selects seq and "|" selects choice; using both in the
%% same group is fatal. The member after the separator is scanned by
%% scan_elem_content2/5.
scan_elem_content("," ++ _T, S, _Context, choice, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("," ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, seq, Acc);
scan_elem_content("|" ++ _T, S, _Context, seq, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("|" ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, choice, Acc);
%% Anything else is a group member.
scan_elem_content(T, S, Context, Mode, Acc) ->
    scan_elem_content2(T, S, Context, Mode, Acc).
3261
%% scan_elem_content2/5: scans one member of a content-model group: a
%% nested group, a parameter-entity reference, or an element name with
%% an optional occurrence indicator. It then continues with
%% scan_elem_content/5. Nested groups and occurrence indicators on names
%% are rejected in mixed content, and "%" is rejected in the prolog
%% environment.
scan_elem_content2([$(|_T], S, mixed, _Mode, _Acc) ->
    ?fatal({error,
        {element_names_must_not_be_parenthesized_in_mixed_content}},S);
scan_elem_content2([$(|T], S0, Context, Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    {Inner, T2, S2} = scan_elem_content(T1, S1, not_mixed, unknown, []),
    scan_elem_content(T2, S2, Context, Mode, [Inner|Acc]);
scan_elem_content2([$%|_T], #xmerl_scanner{environment=prolog} = S,
                   _Context, _Mode, _Acc) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_elem_content2([$%|T], S0, Context, Mode, Acc) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1, as_PE),
    {_, T2, S2} = strip(Expansion ++ T1, S1),
    scan_elem_content(T2, S2, Context, Mode, Acc);
scan_elem_content2(T, S, Context, Mode, Acc) ->
    {Name, _NameStr, T1, S1} = scan_name(T, S),
    {Occurrence, T2, S2} = scan_occurrence(T1, S1),
    if
        Context =:= mixed, Occurrence =/= once ->
            ?fatal({illegal_for_mixed_content, Occurrence}, S1);
        true ->
            ok
    end,
    ?strip3,
    mandatory_delimeter_wfc(T3,S3),
    Member = format_elem_content({Occurrence, Name}),
    scan_elem_content(T3, S3, Context, Mode, [Member|Acc]).
3292
3293
%% format_elem_content/1: drops the occurrence wrapper from members that
%% occur exactly once; everything else is returned unchanged.
format_elem_content(Item) ->
    case Item of
        {once, What} -> What;
        _ -> Item
    end.
3296
3297
%% scan_occurrence/2: scans an optional occurrence indicator. Returns
%% {'?' | '+' | '*', Rest, S1} when one is present (the column advances
%% by one) and {once, Str, S} otherwise.
scan_occurrence([], #xmerl_scanner{continuation_fun = ContF} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_occurrence(MoreBytes, S1) end,
    ContF(Resume, fatal_fun(unexpected_end), S);
scan_occurrence([Op|T], S0) when Op =:= $?; Op =:= $+; Op =:= $* ->
    ?bump_col(1),
    Indicator = case Op of
                    $? -> '?';
                    $+ -> '+';
                    $* -> '*'
                end,
    {Indicator, T, S};
scan_occurrence(T, S) ->
    {once, T, S}.
3314
3315%%% Tests of Validity Constraints
3316
3317
3318%% first part of VC: Name Token
%% vc_Valid_Char/3: returns ok when C satisfies
%% xmerl_lib:is_namechar/1; otherwise reports
%% {error,{validity_constraint_Name_Token,C}} through ?fatal.
vc_Valid_Char(_AT, C, S) ->
    IsNameChar = xmerl_lib:is_namechar(C),
    if
        IsNameChar =:= true ->
            ok;
        true ->
            ?fatal({error,{validity_constraint_Name_Token,C}},S)
    end.
3326
3327
3328
%% vc_ID_Attribute_Default/2: when the scanner validates against a DTD,
%% an attribute of type 'ID' must have the default declaration
%% '#IMPLIED' or '#REQUIRED'. Nothing is checked when `validation' is
%% not `dtd'.
vc_ID_Attribute_Default(_AttDef, #xmerl_scanner{validation=Validation})
  when Validation /= dtd ->
    ok;
vc_ID_Attribute_Default({_,'ID',_,Def,_}, S) ->
    case Def of
        '#IMPLIED' ->
            ok;
        '#REQUIRED' ->
            ok;
        _ ->
            ?fatal({error,{validity_constraint_error_ID_Attribute_Default,Def}},S)
    end.
3337
%% vc_Enumeration/2: when the default value is a string, it must (as an
%% atom) be a member of the attribute's name list. Definitions whose
%% default value is not a list are accepted as they are.
vc_Enumeration({_Name,{_,NameList},DefaultVal,_,_}, S)
  when is_list(DefaultVal) ->
    Default = list_to_atom(DefaultVal),
    case lists:member(Default, NameList) of
        true ->
            ok;
        _ ->
            ?fatal({error,{vc_enumeration,Default,NameList}},S)
    end;
vc_Enumeration({_Name,{_,_NameList},_DefaultVal,_,_}, _S) ->
    ok.
3348
%% vc_Entity_Name(AttDef, S) -> ok
%%
%% Checks the default value of an attribute of type 'ENTITY' or
%% 'ENTITIES'. Each name in the default value must be found by the
%% rules_read_fun as an entity of the form {_,external,{_,{ndata,_}}};
%% anything else is reported as {error,{vc_Entity_Name,Name}}.
%% Definitions whose default value is not a string are accepted.
vc_Entity_Name({_Name,'ENTITY',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    Read = S#xmerl_scanner.rules_read_fun,
    case Read(entity,list_to_atom(DefaultVal),S) of
	{_,external,{_,{ndata,_}}} ->
	    ok;
	_ -> ?fatal({error,{vc_Entity_Name,list_to_atom(DefaultVal)}},S)
    end;
vc_Entity_Name({_Name,'ENTITY',_,_,_},_S) ->
    ok;
vc_Entity_Name({_,'ENTITIES',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    Read = S#xmerl_scanner.rules_read_fun,
    %% Splits the default value into its names. scan_name/2 returns
    %% {Name, NamespaceInfo, Rest, State}: the third element is the
    %% remaining input and the fourth the scanner state. The whitespace
    %% separating the names is skipped here, because scan_name/2 stops
    %% at it without consuming it.
    NameListFun = fun([],Acc,_St,_Fun) ->
			  lists:reverse(Acc);
		     ([H|T],Acc,St,Fun) when ?whitespace(H) ->
			  Fun(T,Acc,St,Fun);
		     (Str,Acc,St,Fun) ->
			  {N,_,Str2,St2} = scan_name(Str,St),
			  Fun(Str2,[N|Acc],St2,Fun)
		  end,
    NameList = NameListFun(DefaultVal,[],S,NameListFun),
    VcFun =
	fun(X) ->
		case Read(entity,X,S) of
		    {_,external,{_,{ndata,_}}} ->
			ok;
		    _ -> ?fatal({error,{vc_Entity_Name,X}},S)
		end
	end,
    lists:foreach(VcFun,NameList);
vc_Entity_Name({_,'ENTITIES',_,_,_},_S) ->
    ok.
3378
%% Validity constraint "No Duplicate Types" for mixed content.
%% Only active when validating against a DTD and the content kind is
%% `mixed': the first name that also occurs later in Acc is reported
%% through ?fatal. In every other case the result is ok.
vc_No_Duplicate_Types(#xmerl_scanner{validation = dtd} = S, mixed, Acc) ->
    vc_no_duplicate_types_scan(Acc, S);
vc_No_Duplicate_Types(_, _, _) ->
    ok.

%% Walks the list front to back, comparing each name with its tail.
vc_no_duplicate_types_scan([], _S) ->
    ok;
vc_no_duplicate_types_scan([Type|Rest], S) ->
    case lists:member(Type, Rest) of
        true ->
            ?fatal({no_duplicate_types_allowed,Type}, S);
        _ ->
            vc_no_duplicate_types_scan(Rest, S)
    end.
3392
3393
%%% Tests of Well-Formedness Constraints
3395
3396
%% Well-formedness check used between names in a content model: the
%% next character must be ',', '|', ')' or '%' (the start of a
%% parameter reference). Anything else, including end of input, is
%% reported through ?fatal.
mandatory_delimeter_wfc([C|_T], _S)
  when C =:= $,; C =:= $|; C =:= $); C =:= $% ->
    ok;
mandatory_delimeter_wfc(T, S) ->
    ?fatal({comma_or_vertical_bar_mandatory_between_names_in_content_model,T},S).
3408
3409
%% Well-formedness constraint "Unique Att Spec": no attribute name may
%% occur twice in the list. When the scanner is namespace conformant
%% the expanded names must be unique as well. Returns ok or reports
%% unique_att_spec_required through ?fatal.
wfc_unique_att_spec([], _S) ->
    ok;
wfc_unique_att_spec([#xmlAttribute{name = N, expanded_name = EN}|Rest], S) ->
    case lists:keymember(N, #xmlAttribute.name, Rest) of
        true ->
            ?fatal({error,{unique_att_spec_required,N}}, S);
        _ ->
            wfc_unique_expanded_name(EN, Rest, S)
    end.

%% Second half of the check: expanded names, only when the scanner is
%% namespace conformant. Continues with the remaining attributes.
wfc_unique_expanded_name(EN, Rest, S) ->
    NsConformant = S#xmerl_scanner.namespace_conformant,
    case NsConformant andalso
        lists:keymember(EN, #xmlAttribute.expanded_name, Rest) of
        true ->
            ?fatal({error,{unique_att_spec_required,EN}}, S);
        _ ->
            wfc_unique_att_spec(Rest, S)
    end.
3425
%% Well-formedness constraint "Legal Character".
%% Given a list, the first character is decoded with to_ucs/2 using the
%% scanner encoding and {Ch, Rest} is returned; given a single
%% character, {Ch, []} is returned. A character rejected by
%% xmerl_lib:is_char/1 is reported through ?fatal.
wfc_legal_char(Chars, S) when is_list(Chars) ->
    {Ch, Rest} = to_ucs(S#xmerl_scanner.encoding, Chars),
    wfc_legal_char_result(Ch, Rest, S);
wfc_legal_char(Ch, S) ->
    wfc_legal_char_result(Ch, [], S).

%% Shared legality test for both entry clauses.
wfc_legal_char_result(Ch, Rest, S) ->
    case xmerl_lib:is_char(Ch) of
        true ->
            {Ch, Rest};
        _ ->
            ?fatal({error,{wfc_Legal_Character,Ch}}, S)
    end.
3441
3442
%% After an attribute the input must continue with whitespace, "/" or
%% ">". The input is returned untouched as {Input, S}. On empty input
%% the continuation fun is asked for more bytes; any other character is
%% reported through ?fatal.
wfc_whitespace_betw_attrs([WS|_] = Input, S) when ?whitespace(WS) ->
    {Input, S};
wfc_whitespace_betw_attrs([C|_] = Input, S) when C =:= $/; C =:= $> ->
    {Input, S};
wfc_whitespace_betw_attrs([], #xmerl_scanner{continuation_fun = Cont} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> wfc_whitespace_betw_attrs(MoreBytes, S1) end,
    Cont(Resume, fatal_fun(unexpected_end), S);
wfc_whitespace_betw_attrs(_, S) ->
    ?fatal({whitespace_required_between_attributes},S).
3456
%% Well-formedness constraint "Entity Declared": a reference made from
%% an external environment is fatal when the document is declared
%% standalone="yes". Every other combination is accepted.
wfc_Entity_Declared({external,_}, #xmerl_scanner{standalone = yes} = S, Name) ->
    ?fatal({reference_to_externally_defed_entity_standalone_doc,Name},S);
wfc_Entity_Declared(_Env, _S, _Name) ->
    ok.
3463
%% WFC test that the replacement text of an internal entity matches
%% the production `content': the value is run through scan_content/8
%% with the environment set to internal_parsed_entity, and the result
%% of that scan is returned. Other entity kinds are accepted with ok.
wfc_Internal_parsed_entity(internal, Value, S) ->
    EntityState = S#xmerl_scanner{environment = internal_parsed_entity},
    Space = S#xmerl_scanner.space,
    Name = [],
    Attrs = [],
    Lang = [],
    Parents = [],
    scan_content(Value, EntityState, Name, Attrs, Space, Lang, Parents,
                 #xmlNamespace{});
wfc_Internal_parsed_entity(_, _, _) ->
    ok.
3471
%% Element validity check that first rejects the reserved "xmlns"
%% prefix when the scanner is namespace conformant, and otherwise
%% delegates to vc_Element_valid/2.
vc_Element_valid(Name, NSInfo, S) ->
    case {NSInfo, S} of
        {{"xmlns", _}, #xmerl_scanner{namespace_conformant = true}} ->
            ?fatal({error,{illegal_element_prefix,xmlns}},S);
        _ ->
            vc_Element_valid(Name, S)
    end.
3477
%% Validity constraint "Element Valid": when validating against a DTD
%% the element must have a declaration stored under elem_def. A missing
%% entry (undefined) or one marked `undeclared' is reported through
%% ?fatal. No check is made inside an internal parsed entity or when
%% DTD validation is off.
vc_Element_valid(_Name, #xmerl_scanner{environment = internal_parsed_entity}) ->
    ok;
vc_Element_valid(Name, #xmerl_scanner{rules_read_fun = Read,
                                      validation = dtd} = S) ->
    Missing =
        case Read(elem_def, Name, S) of
            #xmlElement{elementdef = undeclared} -> true;
            undefined -> true;
            _ -> false
        end,
    if
        Missing ->
            ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S);
        true ->
            ok
    end;
vc_Element_valid(_, _) ->
    ok.
3489
3490%%%%%%% [74] PEDef
3491
3492
%% Scans a parameter entity definition. A quoted literal is handed to
%% scan_entity_value/5 (namespace `parameter') with the opening quote
%% as delimiter; anything else is scanned as an external id. On empty
%% input the continuation fun is asked for more bytes.
scan_pe_def([], #xmerl_scanner{continuation_fun = Cont} = S, PEName) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_pe_def(MoreBytes, S1, PEName) end,
    Cont(Resume, fatal_fun(unexpected_end), S);
scan_pe_def([Quote|T], S0, PEName) when Quote =:= $'; Quote =:= $" ->
    ?bump_col(1),
    scan_entity_value(T, S, Quote, PEName, parameter);
scan_pe_def(Str, S, _PEName) ->
    scan_external_id(Str, S).
3506
3507
3508%%%%%%% [82] NotationDecl
3509
%% Scans the remainder of a notation declaration: name, mandatory
%% whitespace, the SYSTEM/PUBLIC definition and the closing ">".
%% The definition is stored under {notation, Name} via the rules write
%% fun; an existing entry that reads as `undeclared' is deleted first.
%% Returns {RestOfInput, State}.
%% NOTE(review): ?strip4 is assumed to bind T4/S4 from T3/S3 - confirm
%% against the macro definition.
scan_notation_decl(T, #xmerl_scanner{rules_write_fun = Write,
                                     rules_read_fun = Read,
                                     rules_delete_fun = Delete} = S) ->
    {Name, _NameStr, T1, S1} = scan_name_no_colons(T, S),
    {_, T2, S2} = mandatory_strip(T1, S1),
    {Def, T3, S3} = scan_notation_decl1(T2, S2),
    ?strip4,
    T5 = scan_mandatory(">", T4, 1, S4, expected_end_tag_notation_declaration),
    Existing = Read(notation, Name, S),
    if
        Existing =:= undeclared ->
            Delete(notation, Name, S4);
        true ->
            ok
    end,
    S5 = Write(notation, Name, Def, S4),
    {T5, S5}.
3525
%% Scans the identifier part of a notation declaration.
%% Returns {{system, SystemLiteral}, Rest, S},
%%         {{public, PubidLiteral}, Rest, S} when ">" follows directly, or
%%         {{public, PubidLiteral, SystemLiteral}, Rest, S}.
%% On empty input the continuation fun is asked for more bytes.
%% NOTE(review): ?strip3 is assumed to bind T3/S3 from T2/S2 - confirm
%% against the macro definition.
scan_notation_decl1([], #xmerl_scanner{continuation_fun = Cont} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_notation_decl1(MoreBytes, S1) end,
    Cont(Resume, fatal_fun(unexpected_end), S);
scan_notation_decl1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {SysLit, T2, S2} = scan_system_literal(T1, S1),
    {{system, SysLit}, T2, S2};
scan_notation_decl1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {PubLit, T2, S2} = scan_pubid_literal(T1, S1),
    ?strip3,
    case T3 of
        [$>|_] ->
            %% the ">" itself is left in the input; only col is advanced
            Col = S3#xmerl_scanner.col,
            {{public, PubLit}, T3, S3#xmerl_scanner{col = Col + 1}};
        _ ->
            {SysLit, T4, S4} = scan_system_literal(T3, S3),
            {{public, PubLit, SysLit}, T4, S4}
    end.
3549
3550%%%%%%% [75] ExternalID
3551
%% Scans an external id.
%% Returns {{system, SystemLiteral}, Rest, S} for "SYSTEM" and
%% {{public, PubidLiteral, SystemLiteral}, Rest, S} for "PUBLIC".
%% Whitespace is mandatory after the keyword and between the two
%% PUBLIC literals. On empty input the continuation fun is asked for
%% more bytes.
scan_external_id([], #xmerl_scanner{continuation_fun = Cont} = S) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_external_id(MoreBytes, S1) end,
    Cont(Resume, fatal_fun(unexpected_end), S);
scan_external_id("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {SysLit, T2, S2} = scan_system_literal(T1, S1),
    {{system, SysLit}, T2, S2};
scan_external_id("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_, T1, S1} = mandatory_strip(T, S),
    {PubLit, T2, S2} = scan_pubid_literal(T1, S1),
    {_, T3, S3} = mandatory_strip(T2, S2),
    {SysLit, T4, S4} = scan_system_literal(T3, S3),
    {{public, PubLit, SysLit}, T4, S4}.
3569
3570
3571%%%%%%% [9] EntityValue
3572
%% Note that there are two different scan functions for EntityValue.
%% This one differs in that it checks for recursive references to the
%% same parameter entity.
%% Entry point: starts scan_entity_value/7 with an empty accumulator
%% and an empty PE nesting stack.
scan_entity_value(Str, S, Delim, Name, Namespace) ->
    EmptyAcc = [],
    EmptyNesting = [],
    scan_entity_value(Str, S, Delim, EmptyAcc, Name, Namespace, EmptyNesting).
3579
3580
%% scan_entity_value(Str, S, Delim, Acc, PEName, Namespace, PENesting)
%%
%% Scans an entity value up to the closing delimiter.
%%   Delim     - the quote character that ends the value, or no_delim
%%               when scanning text that was included from a reference.
%%   Acc       - value collected so far, in reverse order.
%%   PEName    - name of the entity being defined.
%%   Namespace - `parameter' enables the PE nesting clauses below.
%%   PENesting - stack of open markup delimiters, maintained with
%%               pe_push/3 and pe_pop/3.
%% Returns {Value, Rest, S} where Value is the flattened accumulator, or
%% {{error,{failed_VC_Proper_Declaration_PE_Nesting,N,PEName}}, Rest, S}
%% when validating against a DTD and the nesting stack is not empty at
%% the end of the value. Other errors are reported through ?fatal.

%% End of input inside an external entity with nothing left open: done.
scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}},
		  _Delim,Acc,_,_,[]) ->
    {lists:flatten(lists:reverse(Acc)), [], S};
%% Same, but delimiters are still open while validating: nesting error 1.
scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}},
				       validation=dtd},
		  _Delim,_Acc,PEName,_,_) ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,1,PEName}},[],S};
%% End of included text (no_delim) with nothing left open: done.
scan_entity_value([],S,
		  no_delim,Acc,_,_,[]) ->
    {lists:flatten(lists:reverse(Acc)),[],S};
%% Same, but delimiters are still open while validating: nesting error 2.
scan_entity_value([],S=#xmerl_scanner{validation=dtd},
		  no_delim,_Acc,PEName,_,_PENesting) ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,2,PEName}},[],S};
%% Any other end of input: ask the continuation fun for more bytes.
scan_entity_value([], S=#xmerl_scanner{continuation_fun = F},
		  Delim, Acc, PEName,Namespace,PENesting) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
	      scan_entity_value(MoreBytes,S1,
				Delim,Acc,PEName,Namespace,PENesting)
      end,
      fatal_fun(unexpected_end),
      S);
%% Closing delimiter reached while delimiters are still open and
%% validating: nesting error 3.
scan_entity_value([Delim|T], S=#xmerl_scanner{validation=dtd},
		  Delim,_Acc,PEName,_NS,PENesting) when length(PENesting) /= 0 ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,3,PEName}},T,S};
%% Closing delimiter: the value is complete.
scan_entity_value([Delim|T], S0,
		  Delim, Acc, _PEName,_NS,_PENesting) ->
    ?bump_col(1),
    {lists:flatten(lists:reverse(Acc)), T, S};
%% A "%" while the environment is `prolog' is not allowed.
scan_entity_value("%" ++ _T,S=#xmerl_scanner{environment=prolog},_,_,_,_,_) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
% %% This is a PEdecl in an external entity
% scan_entity_value([$%,WS|T], S0, Delim, Acc, PEName,Namespace,PENesting)
%   when ?whitespace(WS) ->
%     ?bump_col(2),
%     scan_entity_value(T, S, Delim, [WS,$%|Acc], PEName,Namespace,PENesting);
%% Parameter entity reference: expand it and scan the expansion as
%% included text (no_delim), then continue with the rest of the value.
scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    %% a parameter entity must not refer to itself
    if PERefName == PEName,Namespace==parameter ->
	    ?fatal({illegal_recursion_in_PE, PEName}, S1);
       true ->
	    {ExpandedRef,S2} =
		case expand_pe_reference(PERefName, S1, in_literal) of
		    %% actually should pe ref be expanded as_PE but
		    %% handle whitespace explicitly in this case.
		    Tuple when is_tuple(Tuple) ->
			%% {system,URI} or {public,URI}
			%% Included in literal.
			{ExpRef,Sx}=fetch_not_parse(Tuple,S1),
			{EntV, _, S5} =
		 	    scan_entity_value(ExpRef, Sx, no_delim,[],
					      PERefName,parameter,[]),
			%% should do an update Write(parameter_entity)
			%% so next expand_pe_reference is faster
			{string_to_char_set(S5#xmerl_scanner.encoding, EntV), S5};
		     ExpRef ->
			{string_to_char_set(S1#xmerl_scanner.encoding, ExpRef) ,S1}
		end,
	    %% single or double quotes are not treated as delimiters
	    %% in passages "included in literal"
	    S3 = S2#xmerl_scanner{col=S2#xmerl_scanner.col+1},
	    {Acc2,_,S4} = scan_entity_value(ExpandedRef,S3,no_delim,Acc,
					    PEName,Namespace,[]),
% 	    {_,T2,S5} = strip(" "++T1,S4),
	    %% line and col are taken from S3, i.e. from before the
	    %% expansion was scanned; Acc2 is re-reversed to serve as the
	    %% accumulator again
	    scan_entity_value(T1,S4#xmerl_scanner{line=S3#xmerl_scanner.line,
						  col=S3#xmerl_scanner.col},
			      Delim,lists:reverse(Acc2),
 			      PEName,Namespace,PENesting)
% 	    scan_entity_value(T1,S4,Delim,lists:reverse(Acc2),
% 			      PEName,Namespace,PENesting)
    end;
scan_entity_value("&" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    %% This is either a character entity or a general entity (internal
    %% or external) reference. An internal general entity shall not be
    %% expanded in an entity def XML1.0 section 4.5.
    ?bump_col(1),
    case T of
	"#"++_T ->
	    {ExpRef, T1, S1} = scan_reference(T, S),
	    Tok = pe_nesting_token(ExpRef++T1,Namespace,S1#xmerl_scanner.validation),
	    case markup_delimeter(ExpRef) of
		true ->
		    %% keep the expanded character in the value as it is
		    scan_entity_value(T1, S1, Delim, [ExpRef|Acc], PEName,
				      Namespace,pe_push(Tok,PENesting,S1));
		_ ->
		    %% put the expansion back in front of the input
		    ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef),
		    scan_entity_value(ExpRef2 ++ T1, S1, Delim, Acc, PEName,
				      Namespace,pe_push(Tok,PENesting,S1))
	    end;
	_ -> %% General Entity is bypassed, though must check for
             %% recursion: save referenced name now and check for
             %% recursive reference after the whole entity definition is
             %% completed.
	    {Name, _NamespaceInfo, T1, S1} = scan_name(T,S),
	    T2=scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon),
	    S2=save_refed_entity_name(Name,PEName,S1),
	    scan_entity_value(T2,S2,Delim,[";",atom_to_list(Name),"&"|Acc],PEName,Namespace,PENesting)
    end;
%% The following clauses are for the PE Nesting VC constraint.
%% They only apply when Namespace is `parameter'. Longer delimiters
%% are listed before their prefixes so that they match first.
%% Start delimiter for ConditionalSection
scan_entity_value("<!["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["<!["|Acc],PEName,NS,
		      pe_push("<![",PENesting,S));
%% Start delimiter for ConditionalSection (2)
scan_entity_value("["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["["|Acc],PEName,NS,
		      pe_push("[",PENesting,S));
%% Start delimiter for comment
scan_entity_value("<!--"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(4),
    scan_entity_value(T,S,Delim,["<!--"|Acc],PEName,NS,
		      pe_push("<!--",PENesting,S));
%% Start delimiter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
scan_entity_value("<!"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["<!"|Acc],PEName,NS,
		      pe_push("<!",PENesting,S));
%% Start delimiter for PI
scan_entity_value("<?"++T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["<?"|Acc],PEName,NS,
		      pe_push("<?",PENesting,S));
%% Start delimiter for elements that matches the proper stop delimiter
%% for a markupdecl
scan_entity_value("</"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["</"|Acc],PEName,NS,
		      pe_push("</",PENesting,S));
scan_entity_value("<"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["<"|Acc],PEName,NS,
		      pe_push("<",PENesting,S));
%% Delimiter for contentspecs
scan_entity_value("("++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["("|Acc],PEName,NS,
		      pe_push("(",PENesting,S));
%% Stop delimiter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
scan_entity_value(">"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,[">"|Acc],PEName,NS,
		      pe_pop(">",PENesting,S));
%% Stop delimiter for PI
scan_entity_value("?>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["?>"|Acc],PEName,NS,
		      pe_pop("?>",PENesting,S));
%% Stop delimiter for comment
scan_entity_value("-->"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS,
		      pe_pop("-->",PENesting,S));
%% Stop delimiter for ConditionalSection
scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS,
		      pe_pop("]]>",PENesting,S));
%% Stop delimiter added to match a content start delimiter included
scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS,
		      pe_pop("/>",PENesting,S));
scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS,
		      pe_pop(")",PENesting,S));
%% Newline: counted in the line number; col is left as it is.
scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) ->
    scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1},
		      Delim, ["\n"|Acc], PEName,Namespace,PENesting);
%% Any other character: decode it with the scanner encoding and keep
%% it when xmerl_lib:is_char/1 accepts it.
scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_char(Ch) of
	true ->
	    ?bump_col(1),
	    scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting);
	false ->
	    ?fatal({unexpected_char,Ch}, S0)
    end.
3762
3763
3764
%% Records that entity PEName refers to entity Name, unless Name is one
%% of the predefined entities, in which case the state is returned
%% unchanged.
save_refed_entity_name(Name, PEName, S) ->
    IsPredefined = predefined_entity(Name),
    if
        IsPredefined =:= true ->
            S;
        true ->
            save_refed_entity_name1(Name, PEName, S)
    end.
3772
%% Adds Name to the list of entities referenced by PEName in the
%% entity_references field. A name already present is not added again;
%% a PEName without an entry gets a new {PEName, [Name]} entry at the
%% front of the list.
save_refed_entity_name1(Name, PEName,
                        #xmerl_scanner{entity_references = ERefs} = S) ->
    Updated =
        case lists:keyfind(PEName, 1, ERefs) of
            {_, Refs} ->
                NewRefs =
                    case lists:member(Name, Refs) of
                        true -> Refs;
                        _ -> [Name|Refs]
                    end,
                lists:keyreplace(PEName, 1, ERefs, {PEName, NewRefs});
            _ ->
                [{PEName, [Name]}|ERefs]
        end,
    S#xmerl_scanner{entity_references = Updated}.
3788
3789
3790
%% Pushes a markup delimiter on the PE nesting stack.
%% Start delimiters are always pushed. Stop delimiters are pushed only
%% when validating against a DTD. Any other token (for instance the
%% `false' returned by pe_nesting_token/3) leaves the stack unchanged.
pe_push(Tok, Stack, _S)
  when Tok == "<!" orelse Tok == "<?" orelse Tok == "<!--" orelse
       Tok == "<![" orelse Tok == "[" orelse Tok == "<" orelse
       Tok == "</" orelse Tok == "(" ->
    [Tok|Stack];
pe_push(Tok, Stack, #xmerl_scanner{validation = dtd})
  when Tok == ")" orelse Tok == ">" orelse Tok == "?>" orelse
       Tok == "]]>" orelse Tok == "-->" orelse Tok == "/>" ->
    [Tok|Stack];
pe_push(_Tok, Stack, _S) ->
    Stack.
3799
%% Pops the start delimiter that matches the given stop delimiter from
%% the PE nesting stack and returns the remaining stack. "]]>" removes
%% both "[" and "<![". When the top of the stack does not match, the
%% mismatch is reported through ?fatal if validating against a DTD;
%% otherwise the stack is returned unchanged.
pe_pop(Token, Stack, S) ->
    case {Token, Stack} of
        {">",   ["<!"|Rest]}      -> Rest;
        {"?>",  ["<?"|Rest]}      -> Rest;
        {"-->", ["<!--"|Rest]}    -> Rest;
        {"]]>", ["[","<!["|Rest]} -> Rest;
        {"/>",  ["<"|Rest]}       -> Rest;
        {">",   ["<"|Rest]}       -> Rest;
        {">",   ["</"|Rest]}      -> Rest;
        {")",   ["("|Rest]}       -> Rest;
        _ ->
            case S of
                #xmerl_scanner{validation = dtd} ->
                    ?fatal({error,{failed_VC_Proper_Declaration_PE_Nesting,5,Token}},S);
                _ ->
                    Stack
            end
    end.
3812
%% Returns the markup delimiter that the given text starts with, for
%% use with pe_push/3, or `false' when there is none. Delimiters are
%% only recognised for Namespace =:= parameter and Validation =:= dtd.
%%
%% The longer delimiters "<!--" and "<![" must be tested before their
%% common prefix "<!"; otherwise those two clauses can never match.
pe_nesting_token("<!--"++_T,parameter,dtd) -> "<!--";
pe_nesting_token("<!["++_T,parameter,dtd) ->  "<![";
pe_nesting_token("<!"++_T,parameter,dtd) ->   "<!";
pe_nesting_token("<?"++_T,parameter,dtd) ->   "<?";
pe_nesting_token("["++_T,parameter,dtd) ->    "[";
pe_nesting_token("("++_T,parameter,dtd) ->    "(";
pe_nesting_token(">"++_T,parameter,dtd) ->    ">";
pe_nesting_token("?>"++_T,parameter,dtd) ->   "?>";
pe_nesting_token("-->"++_T,parameter,dtd) ->  "-->";
pe_nesting_token("]]>"++_T,parameter,dtd) ->  "]]>";
pe_nesting_token(")"++_T,parameter,dtd) ->    ")";
pe_nesting_token("/>"++_T,parameter,dtd) ->   "/>";
pe_nesting_token(_,_,_) ->                     false.
3826
%% True for the five predefined entity names amp, lt, gt, apos and
%% quot; false for everything else.
predefined_entity(Name) ->
    lists:member(Name, [amp, lt, gt, apos, quot]).
3833
%% Checks that the recorded entity references contain no cycle.
%% The reference list is turned into a family and from that into an
%% acyclic digraph; sofs:family_to_digraph/2 raises `cyclic' when that
%% is impossible, which is reported through ?fatal as
%% illegal_recursion_in_Entity. The temporary digraph is deleted again
%% and ok is returned.
%% Only the `cyclic' error is handled here; any other failure
%% propagates unchanged instead of being passed on to digraph:delete/1.
check_entity_recursion(EName,
                       S=#xmerl_scanner{entity_references=EntityRefList}) ->
    Set = sofs:family(EntityRefList),
    try sofs:family_to_digraph(Set, [acyclic]) of
        DG ->
            digraph:delete(DG),
            ok
    catch
        error:cyclic ->
            ?fatal({illegal_recursion_in_Entity, EName}, S)
    end.
3844
3845
3846
3847
3848%%%%%%% [15] Comment
%% Scans a comment that has no position, parents or language context:
%% delegates to scan_comment/5 with pos `undefined' and empty lists.
scan_comment(Str, S) ->
    Pos = undefined,
    Parents = [],
    Lang = [],
    scan_comment(Str, S, Pos, Parents, Lang).
3851
%% Starts scanning a comment: builds the #xmlComment{} record (value
%% still undefined), sends a `started' event to the event fun and then
%% scans the comment text with scan_comment1/5. The event fun must
%% return an #xmerl_scanner{} record.
scan_comment(Str, #xmerl_scanner{line = Line, col = Col,
                                 event_fun = Event} = S,
             Pos, Parents, Lang) ->
    Comment = #xmlComment{pos = Pos,
                          parents = Parents,
                          language = Lang,
                          value = undefined},
    Started = #xmerl_event{event = started,
                           line = Line,
                           col = Col,
                           pos = Pos,
                           data = Comment},
    #xmerl_scanner{} = S1 = Event(Started, S),
    scan_comment1(Str, S1, Pos, Comment, []).
3864
%% Scans the text of a comment up to and including "-->".
%% Returns {Ret, Rest, S} where Ret is what the hook fun returned for
%% the finished #xmlComment{}; whitespace after the comment is
%% stripped. An `ended' event is sent before the hook fun is called.
%% "--" that is not followed by ">" is reported through ?fatal. Line
%% ends ("\n", "\r\n", "\r") are all stored as "\n".
%%
%% When the available input ends in "-" or "--" it cannot yet be
%% decided whether this is the comment terminator, so the continuation
%% fun is asked for more bytes first. This also avoids calling hd/1 on
%% an empty list when the input ends right after "--".
scan_comment1([], S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end,
      fatal_fun(unexpected_end),
      S);
scan_comment1(Pending, S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc)
  when Pending =:= "-"; Pending =:= "--" ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
              scan_comment1(Pending ++ MoreBytes, S1, Pos, Comment, Acc)
      end,
      fatal_fun(unexpected_end),
      S);
scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C,
                                              event_fun = Event,
                                              hook_fun = Hook},
              _Pos, Comment, Acc) ->
    ?bump_col(3),
    Comment1 = Comment#xmlComment{value = lists:reverse(Acc)},
    %% the event carries the column from before the "-->"
    S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended,
                                           line=S#xmerl_scanner.line,
                                           col = C,
                                           data = Comment1}, S),
    {Ret, S2} = Hook(Comment1, S1),
    {_,T3,S3}=strip(T,S2),
    {Ret,T3,S3};
scan_comment1("--" ++ [Next|_],S,_Pos,_Comment,_Acc) ->
    ?fatal({invalid_comment,"--"++[Next]}, S);
scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR followed by LF is read as a single LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR not followed by LF is read as a LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) ->
    {Ch,T} = wfc_legal_char(Str,S),
    scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]).
3897
3898%%%%%%%
3899
%% Makes sure the input continues with ">". The ">" is not consumed:
%% {Input, S} is returned with the ">" still first. A parameter entity
%% reference in front of it is expanded (as_PE), leading whitespace of
%% the result is stripped and the check is repeated. Anything else is
%% reported through ?fatal.
scan_markup_completion_gt([$>|_] = Input, S) ->
    {Input, S};
scan_markup_completion_gt([$%|T], S0) ->
    ?bump_col(1),
    {PEName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PEName, S1, as_PE),
    {_, T2, S2} = strip(Expansion ++ T1, S1),
    scan_markup_completion_gt(T2, S2);
scan_markup_completion_gt(Input, S) ->
    ?fatal({error,{malformed_syntax_entity_completion,Input}},S).
3910
3911
%% Requires the input T to start with Pattern. Returns T without its
%% first N characters; when the prefix is missing ErrorMsg is reported
%% through ?fatal. Callers pass N equal to the length of Pattern.
scan_mandatory(Pattern, T, N, S, ErrorMsg) ->
    HasPrefix = lists:prefix(Pattern, T),
    if
        HasPrefix =:= true ->
            lists:nthtail(N, T);
        true ->
            ?fatal(ErrorMsg, S)
    end.
3919
3920
%% Strips leading whitespace of every kind; see strip/3.
strip(Input, State) ->
    strip(Input, State, all).
3923
%% Strips leading whitespace while keeping line and col up to date.
%% Lim is `all' or `no_tab'; with `no_tab' a tab is reported through
%% ?fatal instead of being stripped.
%% Returns {[], Rest, S}. On empty input the continuation fun is asked
%% for more bytes; if there are none, {[], [], S} is returned.
strip([], S=#xmerl_scanner{continuation_fun = F}, Lim) ->
    ?dbg("cont()... stripping whitespace~n", []),
    %% resume with the same limit, so that `no_tab' is still enforced
    %% on the bytes delivered by the continuation
    F(fun(MoreBytes, S1) -> strip(MoreBytes, S1, Lim) end,
      fun(S1) -> {[], [], S1} end,
      S);
strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = C+1},Lim);
strip("\t" ++ _T, S ,no_tab) ->
    ?fatal({error,{no_tab_allowed}},S);
strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim);
strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR followed by LF is read as a single LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR not followed by LF is read as a LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip(Str, S,_Lim) ->
    {[], Str, S}.
3945
%% Demands whitespace at the start of the input and strips it. A
%% parameter entity reference ("%" not followed by whitespace) is also
%% accepted, since it expands with a whitespace on each side; the input
%% is then returned untouched as {[], Input, S}. "%" followed by
%% whitespace is a PE declaration, not a reference, and is reported
%% through ?fatal like every other missing whitespace (including empty
%% input).
mandatory_strip([WS|_] = Input, S) when ?whitespace(WS) ->
    strip(Input, S, all);
mandatory_strip([$%, WS|_], S) when ?whitespace(WS) ->
    ?fatal({error,{whitespace_was_expected}},S);
mandatory_strip([$%|_] = Input, S) ->
    {[], Input, S};
mandatory_strip(_Input, S) ->
    ?fatal({error,{whitespace_was_expected}},S).
3958
%% Strips leading whitespace, but a tab is not accepted (see strip/3
%% with the `no_tab' limit).
pub_id_strip(Input, State) ->
    strip(Input, State, no_tab).
3962
3963
%% Removes leading whitespace, including whitespace written as a
%% reference. Returns {Rest, S, IsNorm}; IsNorm is set to true as soon
%% as strip/2 changed the input or the state, otherwise the value
%% passed in is returned.
%% A leading "&" is scanned as a reference: when its expansion starts
%% with whitespace, the expansion replaces the reference and
%% normalisation goes on; otherwise the original input and state are
%% returned as they were.
normalize("&" ++ T, S, IsNorm) ->
    case scan_reference(T, S) of
        {[First|_] = ExpRef, T1, S1} when ?whitespace(First) ->
            Converted = string_to_char_set(S#xmerl_scanner.encoding, ExpRef),
            normalize(Converted ++ T1, S1, IsNorm);
        _ ->
            {"&" ++ T, S, IsNorm}
    end;
normalize(T, S, IsNorm) ->
    {_, T1, S1} = strip(T, S),
    if
        T1 =:= T, S1 =:= S ->
            %% nothing was stripped
            {T, S, IsNorm};
        true ->
            normalize(T1, S1, true)
    end.
3979
3980
%% Optimization:
%% - avoid building a list of spaces or tabs;
%% - avoid reverse;
%% - compact two common indentation patterns.
%% Note: only to be called when a \n was found.
%%
%% The first argument is the input after the "\n"; the third is the
%% input that is handed to accumulate_whitespace/3 when no fast path
%% applies. When "<" follows the newline directly, the third element
%% of the common_data tuple is returned as the whitespace in a
%% {done, {Ws, Rest, S}} tuple.
fast_accumulate_whitespace([$\s|T], S, _) ->
    fast_acc_spaces(T, S, 1);
fast_accumulate_whitespace([$\t|T], S, _) ->
    fast_acc_tabs(T, S, 1);
fast_accumulate_whitespace([$<|_] = Rest, S, _T) ->
    CD = S#xmerl_scanner.common_data,
    Line = S#xmerl_scanner.line,
    {done, {element(3, CD), Rest, S#xmerl_scanner{col = 1, line = Line + 1}}};
fast_accumulate_whitespace(_, S, T) ->
    accumulate_whitespace(T, S, []).
3995
%% Counts a run of spaces (N so far) and finishes with fast_acc_end/6,
%% passing N as the column and 1 as the common_data index.
fast_acc_spaces([$\s|T], S, N) ->
    fast_acc_spaces(T, S, N + 1);
fast_acc_spaces(T, S, N) ->
    Col = N,
    fast_acc_end(T, S, N, Col, $\s, 1).
4000
%% Counts a run of tabs (N so far) and finishes with fast_acc_end/6,
%% passing N * 8 + 1 as the column and 2 as the common_data index.
fast_acc_tabs([$\t|T], S, N) ->
    fast_acc_tabs(T, S, N + 1);
fast_acc_tabs(T, S, N) ->
    Col = N * 8 + 1,
    fast_acc_end(T, S, N, Col, $\t, 2).
4005
%% Finishes the fast whitespace path after a newline followed by N
%% copies of C. When "<" comes next and element N of element CD_I of
%% the common_data tuple can be fetched, that element is returned as
%% the whitespace in {done, {Ws, Rest, S}}. In every other case (other
%% next character, empty input, no such element) the whitespace is
%% built explicitly and accumulate_whitespace/5 takes over.
fast_acc_end(T, S, N, Col, C, CD_I) ->
    #xmerl_scanner{common_data = CD, line = Line0} = S,
    Line = Line0 + 1,
    SlowPath =
        fun() ->
                accumulate_whitespace(T, S, Line, Col,
                                      lists:duplicate(N, C) ++ "\n")
        end,
    try {hd(T), element(N, element(CD_I, CD))} of
        {$<, Shared} ->
            {done, {Shared, T, S#xmerl_scanner{col = Col, line = Line}}};
        _ ->
            SlowPath()
    catch
        _:_ ->
            %% deliberate fallback, not an error path
            SlowPath()
    end.
4016
4017
%%% @spec accumulate_whitespace(T::string(),S::global_state(),
%%%                             atom(),Acc::string()) -> {Acc, T1, S1}
%%%
%%% @doc Function to accumulate and normalize whitespace.
%%% With mode `preserve' the whitespace found is put on Acc as it is;
%%% with mode `normalize' the whole run is replaced by a single space.
accumulate_whitespace(T, S, preserve, Acc) ->
    accumulate_whitespace(T, S, Acc);
accumulate_whitespace(T, S, normalize, Acc) ->
    {_Skipped, Rest, S1} = accumulate_whitespace(T, S, []),
    Normalized = [$\s|Acc],
    {Normalized, Rest, S1}.
4027
%% Accumulates leading whitespace on Acc, starting from the line and
%% col stored in the scanner state.
accumulate_whitespace(T, S, Acc) ->
    Line = S#xmerl_scanner.line,
    Col = S#xmerl_scanner.col,
    accumulate_whitespace(T, S, Line, Col, Acc).
4031
%% Accumulates leading whitespace (in reverse order) on Acc while
%% tracking Line and Col separately from the state. "\r\n" and a lone
%% "\r" are stored as "\n". Returns {Acc, Rest, S} with line and col
%% written back to the state. On empty input the continuation fun is
%% asked for more bytes; if there are none, {Acc, [], S} is returned.
accumulate_whitespace([], S0, Line, Col, Acc) ->
    Cont = S0#xmerl_scanner.continuation_fun,
    S = S0#xmerl_scanner{line = Line, col = Col},
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end,
    NoMore = fun(S1) -> {Acc, [], S1} end,
    Cont(Resume, NoMore, S);
accumulate_whitespace([$\s|T], S, Line, Col, Acc) ->
    accumulate_whitespace(T, S, Line, Col + 1, [$\s|Acc]);
accumulate_whitespace([$\t|T], S, Line, Col, Acc) ->
    accumulate_whitespace(T, S, Line, expand_tab(Col), [$\t|Acc]);
accumulate_whitespace([$\r, $\n|T], S, Line, _Col, Acc) ->
    %% CR followed by LF is read as a single LF
    accumulate_whitespace(T, S, Line + 1, 1, [$\n|Acc]);
accumulate_whitespace([NL|T], S, Line, _Col, Acc)
  when NL =:= $\n; NL =:= $\r ->
    %% LF, or CR not followed by LF, which is read as a LF
    accumulate_whitespace(T, S, Line + 1, 1, [$\n|Acc]);
accumulate_whitespace(Rest, S, Line, Col, Acc) ->
    {Acc, Rest, S#xmerl_scanner{line = Line, col = Col}}.
4053
%% Returns the column of the next tab stop after Col. Tab stops are
%% the columns 1, 9, 17, ...
expand_tab(Col) ->
    TabsPassed = (Col - 1) div 8,
    (TabsPassed + 1) * 8 + 1.
4057
%% validation_mode(Validation)
%% Validation = off | dtd | schema | true | false
%% The obsolete values false and true are mapped to off and dtd; every
%% other value is returned as it is.
validation_mode(Validation) ->
    case Validation of
        false -> off;
        true -> dtd;
        _ -> Validation
    end.
4067
4068
%% Returns {ok, Locations} from the schemaLocation field of the scanner
%% state when that is a list starting with a pair; otherwise the
%% locations are looked up in the element itself (schemaLocations/1).
schemaLocations(_El, #xmerl_scanner{schemaLocation = [{_,_}|_] = SL}) ->
    {ok, SL};
schemaLocations(El, #xmerl_scanner{}) ->
    schemaLocations(El).
4078
%% Finds the first attribute named schemaLocation (with or without a
%% prefix) and splits its value on whitespace into namespace/location
%% pairs.
%% Returns {ok, [{Namespace, Location}]},
%%         {error,{missing_schemaLocation}} when there is no such
%%         attribute or its value holds no tokens, or
%%         {error,{schemaLocation_attribute,namespace_location_not_in_pair}}
%%         for an odd number of tokens.
schemaLocations(#xmlElement{attributes = Atts, xmlbase = _Base}) ->
    NotSchemaLocation =
        fun(#xmlAttribute{name = schemaLocation}) -> false;
           (#xmlAttribute{nsinfo = {_,"schemaLocation"}}) -> false;
           (_) -> true
        end,
    case lists:dropwhile(NotSchemaLocation, Atts) of
        [#xmlAttribute{value = Paths}|_] ->
            schema_location_pairs(string:tokens(Paths, " \n\t\r"));
        [] ->
            {error,{missing_schemaLocation}}
    end.

%% Turns the token list of a schemaLocation value into a result tuple.
schema_location_pairs([_|_] = Tokens) ->
    case length(Tokens) rem 2 of
        0 ->
            {ok, schema_location_pair_up(Tokens)};
        _ ->
            {error,{schemaLocation_attribute,namespace_location_not_in_pair}}
    end;
schema_location_pairs(_) ->
    {error,{missing_schemaLocation}}.

%% Groups an even-length token list two by two.
schema_location_pair_up([]) ->
    [];
schema_location_pair_up([Namespace, Location|Rest]) ->
    [{Namespace, Location}|schema_location_pair_up(Rest)].
4107
%% Options handed on from this scanner state: the xmlbase of the
%% scanner is passed as the xsdbase option.
inherit_options(S) ->
    Base = S#xmerl_scanner.xmlbase,
    [{xsdbase, Base}].
4111
%% Turns the result of a schema validation into {Element, State}; an
%% {error, Reason} result is reported through ?fatal as
%% failed_schema_validation.
handle_schema_result({#xmlElement{} = Validated, _}, State) ->
    {Validated, State};
handle_schema_result({error, Reason}, State) ->
    ?fatal({failed_schema_validation,Reason}, State).
4116
4117%%% Helper functions
4118
-compile({inline, [fatal_fun/1]}).

-spec fatal_fun(_) -> fun((_) -> no_return()).

%% Returns a fun that, given a scanner state, reports Reason through
%% ?fatal. Used as the "no more input" callback of continuation funs.
fatal_fun(Reason) ->
    fun(State) ->
            ?fatal(Reason, State)
    end.
4125
%% Terminates scanning: exits with
%% {fatal, {Reason, {file,File}, {line,Line}, {col,Col}}}, where the
%% position is taken from the scanner state.
fatal(Reason, S) ->
    File = S#xmerl_scanner.filename,
    Line = S#xmerl_scanner.line,
    Col = S#xmerl_scanner.col,
    exit({fatal, {Reason, {file,File}, {line,Line}, {col,Col}}}).
4131
%% preformat formats the tokens in L1 and L2 into one flat string.
%% The tokens of L1 are separated by a single space; every token of L2
%% is preceded by a space and the L2 tokens are separated by Sep.
%% Example: preformat(["a","b"], ["c","d"], ",") gives "a b c, d".
%% Either list may be empty; it then adds nothing to the result
%% (this used to fail in lists:duplicate/2 with a negative count).
preformat(L1,L2,Sep) ->
    Format1 = lists:join(" ", ["~s" || _ <- L1]),
    Format2 = lists:join(Sep, [" ~s" || _ <- L2]),
    Format = lists:flatten([Format1, Format2]),
    lists:flatten(io_lib:format(Format, L1 ++ L2)).
4140
4141
%% Default rules write fun: stores Value under {Context, Name} in the
%% rules table unless that key is already present, and returns the
%% scanner state unchanged.
%% NOTE(review): the original comment flagged a BUG here - with several
%% <!ATTLIST ..> declarations only the first write is kept, because an
%% existing entry is never overwritten. Confirm whether that is
%% intended.
rules_write(Context, Name, Value, #xmerl_scanner{rules = Tab} = S) ->
    ets:insert_new(Tab, {{Context, Name}, Value}),
    S.
4151
4152
%% Default rules read fun: returns the value stored under
%% {Context, Name} in the rules table, or `undefined' when there is no
%% such entry.
rules_read(Context, Name, #xmerl_scanner{rules = Tab}) ->
    Key = {Context, Name},
    case ets:lookup(Tab, Key) of
        [{_, Value}] ->
            Value;
        [] ->
            undefined
    end.
4160
%% Default rules delete fun: removes the entry stored under
%% {Context, Name} from the rules table.
rules_delete(Context, Name, #xmerl_scanner{rules = Tab}) ->
    Key = {Context, Name},
    ets:delete(Tab, Key).
4163
%% Takes the first character off Chars and returns {Char, Rest}.
%% For the encoding "utf-8", or when no encoding is set (undefined),
%% the character is decoded from UTF-8 bytes with utf8_2_ucs/1; for any
%% other encoding the first list element is taken as it is.
to_ucs("utf-8", Chars) ->
    utf8_2_ucs(Chars);
to_ucs(undefined, Chars) ->
    utf8_2_ucs(Chars);
to_ucs(_Encoding, [Char|Rest]) ->
    {Char, Rest}.
4168
%% Decodes one UTF-8 encoded character from the front of a byte list.
%% Returns {CodePoint, Rest}. For a sequence that encodes a value below
%% the minimum for its length (an overlong form)
%% {{error,{bad_character,Value}}, Rest} is returned with the whole
%% sequence consumed; for a byte that cannot start a sequence
%% {{error,{bad_character,Byte}}, Rest} is returned with that single
%% byte consumed.
utf8_2_ucs([A,B,C,D|Rest]) when A band 16#f8 =:= 16#f0,
                                B band 16#c0 =:= 16#80,
                                C band 16#c0 =:= 16#80,
                                D band 16#c0 =:= 16#80 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    Ch = ((A band 16#07) bsl 18) bor ((B band 16#3f) bsl 12) bor
         ((C band 16#3f) bsl 6) bor (D band 16#3f),
    utf8_2_ucs_result(Ch, 16#10000, Rest);
utf8_2_ucs([A,B,C|Rest]) when A band 16#f0 =:= 16#e0,
                              B band 16#c0 =:= 16#80,
                              C band 16#c0 =:= 16#80 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    Ch = ((A band 16#0f) bsl 12) bor ((B band 16#3f) bsl 6) bor
         (C band 16#3f),
    utf8_2_ucs_result(Ch, 16#800, Rest);
utf8_2_ucs([A,B|Rest]) when A band 16#e0 =:= 16#c0,
                            B band 16#c0 =:= 16#80 ->
    %% 110vvvvv 10vvvvvv
    Ch = ((A band 16#1f) bsl 6) bor (B band 16#3f),
    utf8_2_ucs_result(Ch, 16#80, Rest);
utf8_2_ucs([A|Rest]) when A < 16#80 ->
    %% 0vvvvvvv
    {A, Rest};
utf8_2_ucs([A|Rest]) ->
    {{error,{bad_character,A}}, Rest}.

%% Accepts the decoded value only when it needs a sequence of this
%% length, i.e. when it is not below Min.
utf8_2_ucs_result(Ch, Min, Rest) when Ch >= Min ->
    {Ch, Rest};
utf8_2_ucs_result(Ch, _Min, Rest) ->
    {{error,{bad_character,Ch}}, Rest}.
4205
4206%% to_char_set("iso-10646-utf-1",Ch) ->
4207%%     [Ch];
4208%% to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined ->
4209%%     ucs_2_utf8(Ch);
4210%% to_char_set(_,Ch) ->
4211%%     [Ch].
4212
%% Encodes one code point as a list of UTF-8 bytes (one to four
%% bytes). Values of 16#200000 and above match no clause.
ucs_2_utf8(Ch) when Ch < 128 ->
    %% 0vvvvvvv
    [Ch];
ucs_2_utf8(Ch) when Ch < 16#0800 ->
    %% 110vvvvv 10vvvvvv
    [16#c0 bor (Ch bsr 6),
     ucs_2_utf8_tail(Ch)];
ucs_2_utf8(Ch) when Ch < 16#10000 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    [16#e0 bor (Ch bsr 12),
     ucs_2_utf8_tail(Ch bsr 6),
     ucs_2_utf8_tail(Ch)];
ucs_2_utf8(Ch) when Ch < 16#200000 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    [16#f0 bor (Ch bsr 18),
     ucs_2_utf8_tail(Ch bsr 12),
     ucs_2_utf8_tail(Ch bsr 6),
     ucs_2_utf8_tail(Ch)].

%% Continuation byte 10vvvvvv built from the six lowest bits.
ucs_2_utf8_tail(Bits) ->
    16#80 bor (Bits band 16#3f).
4242
4243
%% Converts a list of code points to the character set of the given
%% encoding: for "utf-8", or when no encoding is set (undefined), every
%% code point is replaced by its UTF-8 bytes; for any other encoding
%% the string is returned unchanged.
string_to_char_set("utf-8", Str) ->
    lists:flatmap(fun ucs_2_utf8/1, Str);
string_to_char_set(undefined, Str) ->
    lists:flatmap(fun ucs_2_utf8/1, Str);
string_to_char_set(_Enc, Str) ->
    Str.
4248
4249%% diagnose(Line) ->
4250%%     Mem=erlang:memory(),
4251%%     {OldTot,OldLine} = get_total(),
4252%%     NewTot =
4253%%     case {lists:keysearch(total,1,Mem),OldTot*1.1} of
4254%% 	{{_,{_,Tot}},Tot110} when Tot > Tot110 ->
4255%% 	    ?dbg("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]),
4256%% 	    Tot;
4257%% 	{{_,{_,Tot}},_} ->
4258%% 	    Tot
4259%%     end,
4260%%     put_total({NewTot,Line}).
4261
4262%% get_total() ->
4263%%     case get(xmerl_mem) of
4264%% 	undefined ->
4265%% 	    put(xmerl_mem,{0,0}),
4266%% 	    {0,0};
4267%% 	M -> M
4268%%     end.
4269
4270%% put_total(M) ->
4271%%     put(xmerl_mem,M).
4272