1%%--------------------------------------------------------------------
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2008-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%----------------------------------------------------------------------
20%% File    : xmerl_sax_parser.erl
21%% Description : XML SAX parse API module.
22%%
23%% Created :  4 Jun 2008
24%%----------------------------------------------------------------------
25-module(xmerl_sax_parser).
26
27%%----------------------------------------------------------------------
28%% Include files
29%%----------------------------------------------------------------------
30-include("xmerl_sax_parser.hrl").
31
32%%----------------------------------------------------------------------
33%% External exports
34%%----------------------------------------------------------------------
35-export([file/2,
36	 stream/3,
37	 stream/2]).
38
39%%----------------------------------------------------------------------
40%% Internal exports
41%%----------------------------------------------------------------------
42-export([default_continuation_cb/1]).
43
44%%----------------------------------------------------------------------
45%% Macros
46%%----------------------------------------------------------------------
47
48%%----------------------------------------------------------------------
49%% Records
50%%----------------------------------------------------------------------
51
52%%======================================================================
53%% External functions
54%%======================================================================
%%----------------------------------------------------------------------
%% Function: file(Filename, Options) -> Result
%% Input:    Filename = string()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {error, {Filename, Description}} when the file
%%                    cannot be opened, otherwise the result of stream/3
%%           Description = string() (from file:format_error/1)
%% Description: Parse file containing an XML document.
%%              The file is opened raw/binary and handed to stream/3 with
%%              an empty initial buffer; data is pulled in through
%%              default_continuation_cb/1. The directory (absolute) and
%%              base name of the file are passed as current_location and
%%              entity. The caller's Options are placed after these
%%              defaults, so they are processed later by parse_options/2
%%              and take precedence.
%%----------------------------------------------------------------------
file(Name, Options) ->
    case file:open(Name, [raw, read_ahead, read, binary]) of
        {error, Reason} ->
            {error, {Name, file:format_error(Reason)}};
        {ok, FD} ->
            CL = filename:absname(filename:dirname(Name)),
            File = filename:basename(Name),
            %% The 'after' section guarantees that the descriptor is
            %% released even when parsing raises (for instance an error
            %% inside a user supplied event fun), not only on normal return.
            try
                stream(<<>>,
                       [{continuation_fun, fun default_continuation_cb/1},
                        {continuation_state, FD},
                        {current_location, CL},
                        {entity, File}
                        | Options],
                       file)
            after
                ok = file:close(FD)
            end
    end.
85
%%----------------------------------------------------------------------
%% Function: stream(Xml, Options) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest}
%%           Rest = unicode_binary() | latin1_binary() | [unicode_char()]
%%           EventState = term()
%% Description: Parse a stream containing an XML document.
%%              Convenience entry point: forwards to stream/3 with the
%%              input type set to 'stream'.
%%----------------------------------------------------------------------
stream(Xml, Options) ->
    InputType = stream,
    stream(Xml, Options, InputType).
99
%%----------------------------------------------------------------------
%% Function: stream(Xml, Options, InputType) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [Option] (validated by parse_options/2)
%%           InputType = term(), stored in the parser state; this module
%%                       passes 'file' (from file/2) or 'stream'
%% Output:   Result = {error, Reason} when an option is rejected |
%%                    {fatal_error, {CurrentLocation, Entity, 1},
%%                     Reason, [], EventState} when a fatal error is
%%                    thrown while handling binary input |
%%                    the return value of the selected parser module
%% Description: Parses Xml with the parser matching its representation.
%%              List input goes to xmerl_sax_parser_list with the encoding
%%              forced to 'list'. Binary input first has its character set
%%              detected and is then dispatched through parse_binary/3.
%%              The file_type in the state selects between parsing a
%%              document (normal) and a DTD (dtd).
%%----------------------------------------------------------------------
stream(Xml, Options, InputType) when is_list(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            %% Report bad options the same way the binary clause does
            %% instead of failing on the record access below.
            {error, Reason};
        State ->
            ListState = State#xmerl_sax_parser_state{encoding = list,
                                                     input_type = InputType},
            case ListState#xmerl_sax_parser_state.file_type of
                dtd ->
                    xmerl_sax_parser_list:parse_dtd(Xml, ListState);
                normal ->
                    xmerl_sax_parser_list:parse(Xml, ListState)
            end
    end;
stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            {error, Reason};
        State ->
            ParseFunction =
                case State#xmerl_sax_parser_state.file_type of
                    dtd ->
                        parse_dtd;
                    normal ->
                        parse
                end,
            try
                {Xml1, State1} = detect_charset(Xml, State),
                parse_binary(Xml1,
                             State1#xmerl_sax_parser_state{input_type = InputType},
                             ParseFunction)
            catch
                %% Fatal errors carry the state they were raised with; it
                %% is used to build the error location. The line number is
                %% fixed to 1 here.
                throw:{fatal_error, {State2, Reason}} ->
                    {fatal_error,
                     {
                       State2#xmerl_sax_parser_state.current_location,
                       State2#xmerl_sax_parser_state.entity,
                       1
                     },
                     Reason, [],
                     State2#xmerl_sax_parser_state.event_state}
            end
    end.
140
%%----------------------------------------------------------------------
%% Function: parse_binary(Xml, State, F) -> Result
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%%           F = atom(), name of the function to call in the selected
%%               parser module (stream/3 passes parse or parse_dtd)
%% Output:   Result = return value of the selected parser module
%% Description: Chooses the correct parser depending on the encoding
%%              stored in State. An encoding without a parser module
%%              results in a fatal error.
%%----------------------------------------------------------------------
parse_binary(Xml, #xmerl_sax_parser_state{encoding=utf8}=State, F) ->
    xmerl_sax_parser_utf8:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,little}}=State, F) ->
    xmerl_sax_parser_utf16le:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) ->
    xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) ->
    xmerl_sax_parser_latin1:F(Xml, State);
parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}=State, _F) ->
    %% The parser state (not the function name) must accompany the fatal
    %% error: the catch in stream/3 reads record fields from it.
    ?fatal_error(State, lists:flatten(io_lib:format("Character set ~p not supported", [Enc]))).
162
%%----------------------------------------------------------------------
%% Function: initial_state/0
%% Input:    -
%% Output:   #xmerl_sax_parser_state{}
%% Description: Builds the parser state used before any option has been
%%              applied: default_event_cb/3 as event fun, the predefined
%%              "xml" namespace prefix, "." as current location and an
%%              empty entity name. All other fields keep their record
%%              defaults.
%%----------------------------------------------------------------------
initial_state() ->
    XmlNamespace = {"xml", "http://www.w3.org/XML/1998/namespace"},
    #xmerl_sax_parser_state{
       entity = "",
       current_location = ".",
       ns = [XmlNamespace],
       event_fun = fun default_event_cb/3
      }.
176
%%----------------------------------------------------------------------
%% Function: parse_options(Options, State)
%% Input:    Options = [Option]
%%           Option = {event_state, term()} | {event_fun, fun()} |
%%                    {continuation_state, term()} | {continuation_fun, fun()} |
%%                    {encoding, Encoding} | {file_type, FT} |
%%                    {current_location, term()} | {entity, term()} |
%%                    skip_external_dtd
%%           FT = normal | dtd
%%           Encoding = utf8 | utf16 | {utf16,big} | {utf16,little} |
%%                      latin1 | list
%%           State = #xmerl_sax_parser_state{}
%% Output:   #xmerl_sax_parser_state{} | {error, Reason}
%% Description: Applies the options to the state one at a time, in list
%%              order, so a later option overrides an earlier one.
%%              Processing stops at the first option that is rejected.
%%----------------------------------------------------------------------
parse_options([], State) ->
    State;
parse_options([Option | Remaining], State) ->
    case parse_option(Option, State) of
        {error, Reason} ->
            {error, Reason};
        UpdatedState ->
            parse_options(Remaining, UpdatedState)
    end.

%% Applies one option to the state; returns the updated state, or
%% {error, Reason} when the option or its value is not supported.
parse_option({event_state, CbState}, State) ->
    State#xmerl_sax_parser_state{event_state = CbState};
parse_option({event_fun, CbF}, State) ->
    State#xmerl_sax_parser_state{event_fun = CbF};
parse_option({continuation_state, CState}, State) ->
    State#xmerl_sax_parser_state{continuation_state = CState};
parse_option({continuation_fun, CF}, State) ->
    State#xmerl_sax_parser_state{continuation_fun = CF};
parse_option({file_type, FT}, State) when FT==normal; FT==dtd ->
    State#xmerl_sax_parser_state{file_type = FT};
parse_option({encoding, E}, State) ->
    case check_encoding_option(E) of
        {error, Reason} ->
            {error, Reason};
        Enc ->
            State#xmerl_sax_parser_state{encoding = Enc}
    end;
parse_option({current_location, CL}, State) ->
    State#xmerl_sax_parser_state{current_location = CL};
parse_option({entity, Entity}, State) ->
    State#xmerl_sax_parser_state{entity = Entity};
parse_option(skip_external_dtd, State) ->
    State#xmerl_sax_parser_state{skip_external_dtd = true};
parse_option(Unsupported, _State) ->
    {error, lists:flatten(io_lib:format("Option: ~p not supported", [Unsupported]))}.
216
217
%%----------------------------------------------------------------------
%% Function: check_encoding_option(Encoding) -> Result
%% Input:    Encoding = term()
%% Output:   Result = utf8 | {utf16,little} | {utf16,big} | latin1 | list |
%%                    {error, Reason}
%%           Reason = string()
%% Description: Validates the value of the encoding option. The shorthand
%%              utf16 is mapped to {utf16,big}. The error reason is a
%%              flat string, like the other option errors in this module.
%%----------------------------------------------------------------------
check_encoding_option(E) when E =:= utf8; E =:= {utf16,little}; E =:= {utf16,big};
                              E =:= latin1; E =:= list ->
    E;
check_encoding_option(utf16) ->
    {utf16,big};
check_encoding_option(E) ->
    {error, lists:flatten(io_lib:format("Character set ~p not supported", [E]))}.
225
%%----------------------------------------------------------------------
%% Function: detect_charset(Xml, State) -> {Xml1, State1}
%% Input:  Xml = binary()
%%         State = #xmerl_sax_parser_state{}
%% Output: Xml1 = binary(), the input with any byte order mark removed
%%         State1 = #xmerl_sax_parser_state{}
%% Description: Detects which character set is used in a binary stream.
%%              An empty buffer is refilled through the continuation
%%              function (fatal error if none is defined). If the data
%%              starts with a byte order mark, the encoding reported by
%%              unicode:bom_to_encoding/1 is stored in the state;
%%              otherwise the decision is left to detect_charset_1/2.
%%----------------------------------------------------------------------
detect_charset(<<>>, State) ->
    case State of
        #xmerl_sax_parser_state{continuation_fun = undefined} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        _ ->
            cf(<<>>, State, fun detect_charset/2)
    end;
detect_charset(Bytes, State) ->
    case unicode:bom_to_encoding(Bytes) of
        {latin1, 0} ->
            %% No byte order mark present: inspect the first characters.
            detect_charset_1(Bytes, State);
        {BomEncoding, BomLength} ->
            {_Bom, WithoutBom} = split_binary(Bytes, BomLength),
            {WithoutBom, State#xmerl_sax_parser_state{encoding = BomEncoding}}
    end.
245
%%----------------------------------------------------------------------
%% Function: detect_charset_1(Xml, State) -> {Xml1, State1}
%% Input:  Xml = binary(), data without a byte order mark
%%         State = #xmerl_sax_parser_state{}
%% Output: Xml1 = binary()
%%         State1 = #xmerl_sax_parser_state{}
%% Description: Guesses the encoding from the first bytes of the data:
%%                00 3C 00 3F  ("<?" as big endian 16 bit units)
%%                             -> {utf16, big}
%%                3C 00 3F 00  ("<?" as little endian 16 bit units)
%%                             -> {utf16, little}
%%                3C 3F 78 6D 6C ("<?xml") -> the encoding attribute of
%%                             the xml directive, if there is one
%%              When the buffer holds only a proper prefix of one of these
%%              sequences, more data is requested through cf/3 and the
%%              detection is retried. Anything else is returned unchanged
%%              together with the unmodified state.
%%----------------------------------------------------------------------
%% Possible start of big endian UTF-16
detect_charset_1(<<16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}};
%% A single "<" is a prefix of both the little endian UTF-16 sequence and
%% of "<?xml", so this one clause serves both cases.
detect_charset_1(<<16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
%% Possible start of little endian UTF-16
detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}};
%% Possible start of "<?xml" in an 8 bit encoding
detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
    %% Make sure the whole directive is buffered before parsing it.
    {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
    %% State1 holds the current continuation state, so it is the state to
    %% attach to any fatal error raised while reading the directive.
    AttrList = parse_xml_directive(Xml3, State1),
    Xml4 = <<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>,
    case lists:keyfind("encoding", 1, AttrList) of
        {_, E} ->
            Enc = convert_encoding(E, State1),
            {Xml4, State1#xmerl_sax_parser_state{encoding=Enc}};
        false ->
            {Xml4, State1}
    end;
detect_charset_1(Xml, State) ->
    {Xml, State}.
283
%%----------------------------------------------------------------------
%% Function: convert_encoding(Enc, State) -> Encoding
%% Input:  Enc = string(), encoding name taken from the xml directive
%%         State = #xmerl_sax_parser_state{}
%% Output: Encoding = utf8 | latin1
%% Description: Converts the name of a 7 or 8 bit encoding, or of utf-8,
%%              to the internal format. The comparison is case
%%              insensitive. "us-ascii" is handled as utf8, and
%%              "iso-8859-1" up to "iso-8859-9" are all handled as latin1.
%%              Any other name results in a fatal error.
%%----------------------------------------------------------------------
convert_encoding(Enc, State) ->
    case string:to_lower(Enc) of
        Name when Name =:= "utf-8"; Name =:= "us-ascii" ->
            utf8;
        "latin1" ->
            latin1;
        "iso-8859-" ++ [Part] when Part >= $1, Part =< $9 ->
            latin1;
        _ ->
            ?fatal_error(State, "Unknown encoding: " ++ Enc)
    end.
306
%%----------------------------------------------------------------------
%% Function: parse_xml_directive(Xml, State) -> [{Name, Value}]
%% Input:  Xml = binary(), the data following "<?xml"
%%         State = #xmerl_sax_parser_state{}
%% Output: Name = string()
%%         Value = string()
%% Description: Parses the attributes of the xml declaration. The data
%%              has to start with a whitespace character, otherwise a
%%              fatal error is raised.
%%----------------------------------------------------------------------
parse_xml_directive(Xml, State) ->
    case Xml of
        <<C, AfterSpace/binary>> when ?is_whitespace(C) ->
            parse_xml_directive_1(AfterSpace, [], State);
        _ ->
            ?fatal_error(State, "Expected whitespace in directive")
    end.
318
319
%%----------------------------------------------------------------------
%% Function: parse_xml_directive_1(Xml, Acc, State) -> [{Name, Value}]
%% Input:  Xml = binary()
%%         Acc = [{Name, Value}]
%%         Name = string()
%%         Value = string()
%%         State = #xmerl_sax_parser_state{}
%% Output: see above
%% Description: Collects name = value pairs until "?>" is found. Leading
%%              whitespace is skipped and a name has to start with a
%%              lower case letter (a-z). Each pair is prepended to the
%%              accumulator, so the last attribute comes first in the
%%              result.
%%----------------------------------------------------------------------
parse_xml_directive_1(<<Space, Tail/binary>>, Attrs, State) when ?is_whitespace(Space) ->
    parse_xml_directive_1(Tail, Attrs, State);
parse_xml_directive_1(<<$?, $>, _/binary>>, Attrs, _State) ->
    Attrs;
parse_xml_directive_1(<<First, Tail/binary>>, Attrs, State) when First >= $a, First =< $z ->
    {Name, AfterName} = parse_name(Tail, [First]),
    AfterEq = parse_eq(AfterName, State),
    {Value, AfterValue} = parse_value(AfterEq, State),
    parse_xml_directive_1(AfterValue, [{Name, Value} | Attrs], State);
parse_xml_directive_1(_, _, State) ->
    ?fatal_error(State, "Unknown attribute in xml directive").
340
%%----------------------------------------------------------------------
%% Function: parse_name(Xml, Acc) -> {Name, Rest}
%% Input:   Xml = binary()
%%          Acc = string(), the characters read so far in reverse order
%% Output:  Name = string()
%%          Rest = binary(), the data following the name
%% Description: Reads lower case letters (a-z) from the stream and stops
%%              at the first other byte or at the end of the data.
%%----------------------------------------------------------------------
parse_name(<<Char, Tail/binary>>, RevName) when Char >= $a, Char =< $z ->
    parse_name(Tail, [Char | RevName]);
parse_name(Remaining, RevName) ->
    {lists:reverse(RevName), Remaining}.
352
%%----------------------------------------------------------------------
%% Function: parse_eq(Xml, State) -> Rest
%% Input:  Xml = binary()
%%         State = #xmerl_sax_parser_state{}
%% Output: Rest = binary(), the data following the '='
%% Description: Skips whitespace and reads an '=' from the stream. Any
%%              other input results in a fatal error.
%%----------------------------------------------------------------------
parse_eq(Xml, State) ->
    case Xml of
        <<Space, Tail/binary>> when ?is_whitespace(Space) ->
            parse_eq(Tail, State);
        <<$=, AfterEq/binary>> ->
            AfterEq;
        _ ->
            ?fatal_error(State, "expecting = or whitespace")
    end.
365
%%----------------------------------------------------------------------
%% Function: parse_value(Xml, State) -> {Value, Rest}
%% Input:   Xml = binary()
%%          State = #xmerl_sax_parser_state{}
%% Output:  Value = string()
%%          Rest = binary()
%% Description: Skips whitespace up to the opening quote of an attribute
%%              value and reads the value with parse_value_1/4, using the
%%              same quote character as terminator. Any other input
%%              results in a fatal error.
%%----------------------------------------------------------------------
parse_value(<<Space, Tail/binary>>, State) when ?is_whitespace(Space) ->
    parse_value(Tail, State);
parse_value(<<$', Tail/binary>>, State) ->
    parse_value_1(Tail, $', [], State);
parse_value(<<$", Tail/binary>>, State) ->
    parse_value_1(Tail, $", [], State);
parse_value(_, State) ->
    ?fatal_error(State, "\', \" or whitespace expected").
379
%%----------------------------------------------------------------------
%% Function: parse_value_1(Xml, Stop, Acc, State) -> {Value, Rest}
%% Input:   Xml = binary()
%%          Stop = $' | $"
%%          Acc = list(), the bytes read so far in reverse order
%%          State = #xmerl_sax_parser_state{}
%% Output:  Value = string()
%%          Rest = binary(), the data following the closing quote
%% Description: Reads bytes up to the closing quote Stop. Running out of
%%              data before the quote is found results in a fatal error.
%%----------------------------------------------------------------------
parse_value_1(<<Char, Tail/binary>>, Stop, RevValue, State) ->
    case Char of
        Stop ->
            {lists:reverse(RevValue), Tail};
        _ ->
            parse_value_1(Tail, Stop, [Char | RevValue], State)
    end;
parse_value_1(_, _Stop, _RevValue, State) ->
    ?fatal_error(State, "end of input and no \' or \" found").
395
396%%======================================================================
397%% Default functions
398%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> State
%% Input:   Event = term()
%%          LineNo = term()
%%          State = term()
%% Output:  State, unchanged
%% Description: Default event callback (installed by initial_state/0).
%%              Ignores the event and the location and hands back the
%%              event state as it was given.
%%----------------------------------------------------------------------
default_event_cb(_Event, _LineNo, EventState) ->
    EventState.
409
%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%%          IoDevice = iodevice()
%% Output:  Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks of at most
%%              1024 bytes. End of file is reported as an empty binary.
%%              A read error is thrown as a descriptive string; the
%%              callers of the continuation fun in this module (cf/2,
%%              cf/3) catch throws and turn them into fatal errors.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
    case file:read(IoDevice, 1024) of
        eof ->
            {<<>>, IoDevice};
        {ok, FileBin} ->
            {FileBin, IoDevice};
        {error, Reason} ->
            throw(file:format_error(Reason))
    end.
423
%%----------------------------------------------------------------------
%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
%%          Rest = binary()
%%          State = #xmerl_sax_parser_state{}
%% Output:  Result = {binary(), State}
%% Description: Reads more data through cf/2 until the buffer contains
%%              "?>", then returns the whole buffer and the updated
%%              state. Running out of input needs no handling here:
%%              cf/2 raises a fatal error when the continuation function
%%              is undefined or delivers no more data.
%%----------------------------------------------------------------------
read_until_end_of_xml_directive(Rest, State) ->
    case binary:match(Rest, <<"?>">>) of
        nomatch ->
            {NewBytes, NewState} = cf(Rest, State),
            read_until_end_of_xml_directive(NewBytes, NewState);
        _ ->
            {Rest, State}
    end.
442
443
%%----------------------------------------------------------------------
%% Function  : cf(Rest, State) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%% Result    : {Rest1, State1}
%%             Rest1 = binary(), Rest followed by the newly read data
%%             State1 = State with the new continuation state
%% Description: Uses the continuation function in State to read another
%%              chunk from the input stream. A missing continuation
%%              function, a throw or exit from it, and an empty chunk
%%              all result in a fatal error.
%%----------------------------------------------------------------------
cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Continuation function undefined");
cf(Rest, #xmerl_sax_parser_state{continuation_fun = ReadMore,
                                 continuation_state = ReadState} = State) ->
    Chunk =
        try
            ReadMore(ReadState)
        catch
            throw:Thrown ->
                ?fatal_error(State, Thrown);
            exit:ExitReason ->
                ?fatal_error(State, {'EXIT', ExitReason})
        end,
    case Chunk of
        {<<>>, _} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        {Data, NextReadState} ->
            Buffer = <<Rest/binary, Data/binary>>,
            {Buffer, State#xmerl_sax_parser_state{continuation_state = NextReadState}}
    end.
472
%%----------------------------------------------------------------------
%% Function  : cf(Rest, State, NextCall) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%%             NextCall = fun((binary(), State) -> Result)
%% Result    : the return value of NextCall
%% Description: Function that uses provided fun to read another chunk from
%%              input stream and calls the fun in NextCall with the
%%              extended buffer and the updated state. Reading, including
%%              all fatal error cases, is delegated to cf/2.
%%----------------------------------------------------------------------
cf(Rest, State, NextCall) ->
    {NewRest, NewState} = cf(Rest, State),
    NextCall(NewRest, NewState).
502