1%%--------------------------------------------------------------------
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2008-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%----------------------------------------------------------------------
20%% File    : xmerl_sax_parser.erl
21%% Description : XML SAX parse API module.
22%%
23%% Created :  4 Jun 2008
24%%----------------------------------------------------------------------
25-module(xmerl_sax_parser).
26
27%%----------------------------------------------------------------------
28%% Include files
29%%----------------------------------------------------------------------
30-include("xmerl_sax_parser.hrl").
31
32%%----------------------------------------------------------------------
33%% External exports
34%%----------------------------------------------------------------------
35-export([file/2,
36	 stream/3,
37	 stream/2]).
38
39%%----------------------------------------------------------------------
40%% Internal exports
41%%----------------------------------------------------------------------
42-export([default_continuation_cb/1]).
43
44%%----------------------------------------------------------------------
45%% Macros
46%%----------------------------------------------------------------------
47
48%%----------------------------------------------------------------------
49%% Records
50%%----------------------------------------------------------------------
51
52%%======================================================================
53%% External functions
54%%======================================================================
%%----------------------------------------------------------------------
%% Function: file(Filename, Options) -> Result
%% Input:    Filename = string()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest} |
%%                    {error, {Filename, Reason}} |
%%                    any other result of stream/3
%%           Rest = unicode_binary() | latin1_binary()
%%           EventState = term()
%%           Reason = string(), as given by file:format_error/1
%% Description: Parse file containing an XML document. The file is read
%%              through default_continuation_cb/1 and is closed again
%%              before returning, also when the parsing raises an
%%              exception.
%%----------------------------------------------------------------------
file(Name, Options) ->
    case file:open(Name, [raw, read_ahead, read, binary]) of
        {error, Reason} ->
            {error, {Name, file:format_error(Reason)}};
        {ok, FD} ->
            try
                CL = filename:absname(filename:dirname(Name)),
                File = filename:basename(Name),
                %% The caller's options are placed last; parse_options/2
                %% handles the list in order, so they override the
                %% defaults set up here.
                stream(<<>>,
                       [{continuation_fun, fun default_continuation_cb/1},
                        {continuation_state, FD},
                        {current_location, CL},
                        {entity, File}
                        | Options],
                       file)
            after
                %% Closing in 'after' keeps the descriptor from leaking
                %% when stream/3 (or a user callback) raises.
                ok = file:close(FD)
            end
    end.
85
%%----------------------------------------------------------------------
%% Function: stream(Xml, Options) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest}
%%           Rest = unicode_binary() | latin1_binary() | [unicode_char()]
%%           EventState = term()
%% Description: Parse a stream containing an XML document. This is
%%              stream/3 with the input type fixed to 'stream'.
%%----------------------------------------------------------------------
stream(Input, ParserOptions) ->
    stream(Input, ParserOptions, stream).
99
%%----------------------------------------------------------------------
%% Function: stream(Xml, Options, InputType) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [Option], see parse_options/2
%%           InputType = atom(), 'file' when called from file/2 and
%%                       'stream' when called from stream/2
%% Output:   Result = the result of the selected parser module |
%%                    {error, Reason} |
%%                    {fatal_error, {CurrentLocation, Entity, 1},
%%                     Reason, [], EventState}
%% Description: Parses an XML document or, with {file_type, dtd}, a DTD.
%%              List input goes to xmerl_sax_parser_list; for binary
%%              input the character set is detected first and the
%%              matching parser module is selected by parse_binary/3.
%%              Invalid options give {error, Reason} for both kinds of
%%              input.
%%----------------------------------------------------------------------
stream(Xml, Options, InputType) when is_list(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            %% Same result as for binary input instead of a badrecord crash.
            {error, Reason};
        State ->
            ListState = State#xmerl_sax_parser_state{encoding = list,
                                                     input_type = InputType},
            case ListState#xmerl_sax_parser_state.file_type of
                dtd ->
                    xmerl_sax_parser_list:parse_dtd(Xml, ListState);
                normal ->
                    xmerl_sax_parser_list:parse(Xml, ListState)
            end
    end;
stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            {error, Reason};
        State ->
            ParseFunction =
                case State#xmerl_sax_parser_state.file_type of
                    dtd ->
                        parse_dtd;
                    normal ->
                        parse
                end,
            try
                {Xml1, State1} = detect_charset(Xml, State),
                parse_binary(Xml1,
                             State1#xmerl_sax_parser_state{input_type = InputType},
                             ParseFunction)
            catch
                %% Fatal errors are thrown together with the state they
                %% occurred in; the line number is reported as 1.
                throw:{fatal_error, {ErrorState, FatalReason}} ->
                    {fatal_error,
                     {ErrorState#xmerl_sax_parser_state.current_location,
                      ErrorState#xmerl_sax_parser_state.entity,
                      1},
                     FatalReason, [],
                     ErrorState#xmerl_sax_parser_state.event_state}
            end
    end.
140
%%----------------------------------------------------------------------
%% Function: parse_binary(Xml, State, F) -> Result
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%%           F = atom(), name of the parser function to call
%%               (stream/3 passes parse or parse_dtd)
%% Output:   Result = the result of F(Xml, State) in the parser module
%%                    selected by the encoding in State
%% Description: Chooses the correct parser depending on the encoding.
%%              An encoding without a parser module gives a fatal error.
%%----------------------------------------------------------------------
parse_binary(Xml, #xmerl_sax_parser_state{encoding=utf8}=State, F) ->
    xmerl_sax_parser_utf8:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,little}}=State, F) ->
    xmerl_sax_parser_utf16le:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) ->
    xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) ->
    xmerl_sax_parser_latin1:F(Xml, State);
parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}=State, _F) ->
    %% The state record (not the function name) must go into the fatal
    %% error, since stream/3 reads location fields from it.
    ?fatal_error(State, lists:flatten(io_lib:format("Character set ~p not supported", [Enc]))).
162
%%----------------------------------------------------------------------
%% Function: initial_state() -> State
%% Input:    -
%% Output:   State = #xmerl_sax_parser_state{}
%% Description: Builds the state record the option parsing starts from:
%%              default event callback, the predefined "xml" namespace
%%              prefix, "." as current location and an empty entity
%%              name. All other fields keep their record defaults.
%%----------------------------------------------------------------------
initial_state() ->
    XmlNamespace = {"xml", "http://www.w3.org/XML/1998/namespace"},
    #xmerl_sax_parser_state{
       entity = "",
       current_location = ".",
       ns = [XmlNamespace],
       event_fun = fun default_event_cb/3
      }.
176
%%----------------------------------------------------------------------
%% Function: parse_options(Options, State) -> NewState | {error, Reason}
%% Input:    Options = [Option]
%%           Option = {event_state, term()} | {event_fun, fun()} |
%%                    {continuation_state, term()} | {continuation_fun, fun()} |
%%                    {encoding, Encoding} | {file_type, FT} |
%%                    {current_location, term()} | {entity, term()} |
%%                    skip_external_dtd
%%           FT = normal | dtd
%%           Encoding = utf8 | utf16 | {utf16,little} | {utf16,big} |
%%                      latin1 | list
%%           State = #xmerl_sax_parser_state{}
%% Output:   NewState = #xmerl_sax_parser_state{}
%%           Reason = description of the first option that was rejected
%% Description: Checks the parser options and stores them in the state.
%%              Options are handled from left to right, so a later
%%              option overrides an earlier one with the same tag.
%%----------------------------------------------------------------------
parse_options([], State) ->
    State;
parse_options([Option | Remaining], State) ->
    case apply_option(Option, State) of
        {error, _} = Error ->
            Error;
        UpdatedState ->
            parse_options(Remaining, UpdatedState)
    end.

%% Stores one option in the state or returns {error, Reason} for it.
apply_option({event_state, CbState}, State) ->
    State#xmerl_sax_parser_state{event_state = CbState};
apply_option({event_fun, CbF}, State) ->
    State#xmerl_sax_parser_state{event_fun = CbF};
apply_option({continuation_state, CState}, State) ->
    State#xmerl_sax_parser_state{continuation_state = CState};
apply_option({continuation_fun, CF}, State) ->
    State#xmerl_sax_parser_state{continuation_fun = CF};
apply_option({file_type, FT}, State) when FT==normal; FT==dtd ->
    State#xmerl_sax_parser_state{file_type = FT};
apply_option({encoding, E}, State) ->
    case check_encoding_option(E) of
        {error, Reason} ->
            {error, Reason};
        Enc ->
            State#xmerl_sax_parser_state{encoding = Enc}
    end;
apply_option({current_location, CL}, State) ->
    State#xmerl_sax_parser_state{current_location = CL};
apply_option({entity, Entity}, State) ->
    State#xmerl_sax_parser_state{entity = Entity};
apply_option(skip_external_dtd, State) ->
    State#xmerl_sax_parser_state{skip_external_dtd = true};
apply_option(Unsupported, _State) ->
    {error, lists:flatten(io_lib:format("Option: ~p not supported", [Unsupported]))}.
216
217
%%----------------------------------------------------------------------
%% Function: check_encoding_option(Encoding) -> Enc | {error, Reason}
%% Input:    Encoding = term(), value of the encoding option
%% Output:   Enc = utf8 | {utf16,little} | {utf16,big} | latin1 | list
%%           Reason = string()
%% Description: Validates the encoding option. Plain utf16 is taken as
%%              {utf16,big}. The error reason is a flat string, like
%%              the other error reasons produced by the option parsing.
%%----------------------------------------------------------------------
check_encoding_option(E) when E =:= utf8; E =:= {utf16,little}; E =:= {utf16,big};
                              E =:= latin1; E =:= list ->
    E;
check_encoding_option(utf16) ->
    {utf16,big};
check_encoding_option(E) ->
    {error, lists:flatten(io_lib:format("Character set ~p not supported", [E]))}.
225
%%----------------------------------------------------------------------
%% Function: detect_charset(Xml, State) -> {Xml1, State1}
%% Input:  Xml = binary()
%%         State = #xmerl_sax_parser_state{}
%% Output: Xml1 = binary(), the input with a leading byte order mark
%%                removed
%%         State1 = #xmerl_sax_parser_state{}, with the encoding field
%%                  set when an encoding was detected
%% Description: Detects which character set is used in a binary stream.
%%              A byte order mark decides the encoding directly;
%%              without one the start of the document is examined by
%%              detect_charset_1/2. Empty input is refilled through the
%%              continuation fun, or is a fatal error when there is
%%              none.
%%----------------------------------------------------------------------
detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Can't detect character encoding due to lack of indata");
detect_charset(<<>>, State) ->
    cf(<<>>, State, fun detect_charset/2);
detect_charset(Input, State) ->
    case unicode:bom_to_encoding(Input) of
        {latin1, 0} ->
            %% No byte order mark found.
            detect_charset_1(Input, State);
        {Encoding, BomSize} ->
            <<_Bom:BomSize/binary, Payload/binary>> = Input,
            {Payload, State#xmerl_sax_parser_state{encoding = Encoding}}
    end.
245
%%----------------------------------------------------------------------
%% Function: detect_charset_1(Xml, State) -> {Xml1, State1}
%% Input:  Xml = binary(), input without byte order mark
%%         State = #xmerl_sax_parser_state{}
%% Output: Xml1 = binary()
%%         State1 = #xmerl_sax_parser_state{}
%% Description: Detects the character set from the first bytes of the
%%              document. "<?" coded as 00 3C 00 3F selects
%%              {utf16,big} and as 3C 00 3F 00 {utf16,little}. When the
%%              input starts with "<?xml" the directive is read up to
%%              "?>" and its encoding attribute, if present, selects
%%              the encoding. As long as the available bytes are a
%%              proper prefix of one of these patterns, more input is
%%              fetched with the continuation fun. Any other input is
%%              returned with the state unchanged.
%%----------------------------------------------------------------------
%% Possibly UTF-16 big endian: 00 3C 00 3F
detect_charset_1(<<16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}};
%% A single 3C ("<") can start either UTF-16 little endian or "<?xml"
detect_charset_1(<<16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
%% Possibly UTF-16 little endian: 3C 00 3F 00
detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}};
%% Possibly "<?xml" in an 8 bit encoding: 3C 3F 78 6D 6C
detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
    %% State1 holds the continuation state after any extra reads and is
    %% the state used from here on.
    {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
    AttrList = parse_xml_directive(Xml3, State1),
    %% The directive is only inspected; all of it is handed on to the
    %% parser.
    Xml4 = <<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>,
    case lists:keyfind("encoding", 1, AttrList) of
        {_, E} ->
            Enc = convert_encoding(E, State1),
            {Xml4, State1#xmerl_sax_parser_state{encoding=Enc}};
        false ->
            {Xml4, State1}
    end;
detect_charset_1(Xml, State) ->
    {Xml, State}.
283
%%----------------------------------------------------------------------
%% Function: convert_encoding(Enc, State) -> Encoding
%% Input:  Enc = string(), encoding name from the xml directive
%%         State = #xmerl_sax_parser_state{}
%% Output: Encoding = utf8 | latin1
%% Description: Converting 7,8 bit and utf8 encoding strings to internal
%%              format. The name is compared case insensitively; a name
%%              that is not recognized is a fatal error.
%%----------------------------------------------------------------------
convert_encoding(Enc, State) -> %% Just for 7,8 bit + utf8
    case internal_encoding(string:to_lower(Enc)) of
        unknown ->
            ?fatal_error(State, "Unknown encoding: " ++ Enc);
        Internal ->
            Internal
    end.

%% Maps a lower case encoding name to the internal encoding, or to
%% 'unknown' when the name is not recognized.
internal_encoding("utf-8") -> utf8;
internal_encoding("us-ascii") -> utf8;
internal_encoding("latin1") -> latin1;
%% iso-8859-1 up to iso-8859-9 are all handled as latin1
internal_encoding("iso-8859-" ++ [Part]) when Part >= $1, Part =< $9 -> latin1;
internal_encoding(_) -> unknown.
306
%%----------------------------------------------------------------------
%% Function: parse_xml_directive(Xml, State) -> [{Name, Value}]
%% Input:  Xml = binary(), the input following "<?xml"
%%         State = #xmerl_sax_parser_state{}
%% Output: Name = string()
%%         Value = string()
%% Description: Parsing the xml declaration from the input stream.
%%              "<?xml" has to be followed by whitespace; anything else
%%              (for instance "<?xml?>" or "<?xml-stylesheet") is
%%              reported as a fatal error.
%%----------------------------------------------------------------------
parse_xml_directive(<<C, Rest/binary>>, State) when ?is_whitespace(C) ->
    parse_xml_directive_1(Rest, [], State);
parse_xml_directive(_, State) ->
    %% The input comes from the document, so this must not end up as a
    %% function_clause crash.
    ?fatal_error(State, "Expected whitespace in directive").
316
%%----------------------------------------------------------------------
%% Function: parse_xml_directive_1(Xml, Acc, State) -> [{Name, Value}]
%% Input:  Xml = binary()
%%         Acc = [{Name, Value}]
%%         State = #xmerl_sax_parser_state{}
%%         Name = string()
%%         Value = string()
%% Output: see above, the last parsed attribute comes first
%% Description: Parsing the xml declaration from the input stream.
%%              Attributes are collected until "?>" is found. An
%%              attribute name has to start with a lower case letter
%%              a-z, otherwise the directive is rejected with a fatal
%%              error.
%%----------------------------------------------------------------------
parse_xml_directive_1(<<Blank, Tail/binary>>, Attrs, State) when ?is_whitespace(Blank) ->
    parse_xml_directive_1(Tail, Attrs, State);
parse_xml_directive_1(<<"?>", _/binary>>, Attrs, _State) ->
    Attrs;
parse_xml_directive_1(<<First, Tail/binary>>, Attrs, State) when First >= $a, First =< $z ->
    {Name, AfterName} = parse_name(Tail, [First]),
    AfterEq = parse_eq(AfterName, State),
    {Value, AfterValue} = parse_value(AfterEq, State),
    parse_xml_directive_1(AfterValue, [{Name, Value} | Attrs], State);
parse_xml_directive_1(_, _, State) ->
    ?fatal_error(State, "Unknown attribute in xml directive").
337
%%----------------------------------------------------------------------
%% Function: parse_name(Xml, Acc) -> {Name, Rest}
%% Input:   Xml = binary()
%%          Acc = string(), the characters read so far in reverse order
%% Output:  Name = string()
%%          Rest = binary(), the input from the first byte that is not
%%                 a lower case letter a-z
%% Description: Parsing an attribute name from the stream.
%%----------------------------------------------------------------------
parse_name(<<Char, Tail/binary>>, Reversed) when Char >= $a, Char =< $z ->
    parse_name(Tail, [Char | Reversed]);
parse_name(Remaining, Reversed) ->
    {lists:reverse(Reversed), Remaining}.
349
%%----------------------------------------------------------------------
%% Function: parse_eq(Xml, State) -> Rest
%% Input:   Xml = binary()
%%          State = #xmerl_sax_parser_state{}
%% Output:  Rest = binary(), the input following the '='
%% Description: Reads an '=' from the stream. Whitespace before it is
%%              skipped; any other byte, or end of input, is a fatal
%%              error.
%%----------------------------------------------------------------------
parse_eq(<<Blank, Tail/binary>>, State) when ?is_whitespace(Blank) ->
    parse_eq(Tail, State);
parse_eq(<<$=, AfterEq/binary>>, _State) ->
    AfterEq;
parse_eq(_, State) ->
    ?fatal_error(State, "expecting = or whitespace").
362
%%----------------------------------------------------------------------
%% Function: parse_value(Xml, State) -> {Value, Rest}
%% Input:   Xml = binary()
%%          State = #xmerl_sax_parser_state{}
%% Output:  Value = string()
%%          Rest = binary()
%% Description: Parsing an attribute value from the stream. Leading
%%              whitespace is skipped and the value has to start with
%%              ' or "; the same character ends it.
%%----------------------------------------------------------------------
parse_value(<<Blank, Tail/binary>>, State) when ?is_whitespace(Blank) ->
    parse_value(Tail, State);
parse_value(<<$', Tail/binary>>, State) ->
    parse_value_1(Tail, $', [], State);
parse_value(<<$", Tail/binary>>, State) ->
    parse_value_1(Tail, $", [], State);
parse_value(_, State) ->
    ?fatal_error(State, "\', \" or whitespace expected").
376
%%----------------------------------------------------------------------
%% Function: parse_value_1(Xml, Stop, Acc, State) -> {Value, Rest}
%% Input:   Xml = binary()
%%          Stop = $' | $"
%%          Acc = list(), the bytes read so far in reverse order
%%          State = #xmerl_sax_parser_state{}
%% Output:  Value = string()
%%          Rest = binary(), the input following the closing quote
%% Description: Parsing an attribute value from the stream. Bytes are
%%              collected until the Stop character; running out of
%%              input before that is a fatal error.
%%----------------------------------------------------------------------
parse_value_1(Input, Stop, Reversed, State) ->
    case Input of
        <<Stop, AfterQuote/binary>> ->
            {lists:reverse(Reversed), AfterQuote};
        <<Byte, Tail/binary>> ->
            parse_value_1(Tail, Stop, [Byte | Reversed], State);
        _ ->
            ?fatal_error(State, "end of input and no \' or \" found")
    end.
392
393%%======================================================================
394%% Default functions
395%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> State
%% Input:   Event = tuple()
%%          LineNo = integer()
%%          State = term()
%% Output:  State, unchanged
%% Description: Default event callback. Ignores the event and hands
%%              back the event state as it is.
%%----------------------------------------------------------------------
default_event_cb(_IgnoredEvent, _IgnoredLineNo, EventState) ->
    EventState.
406
%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%%          IoDevice = iodevice()
%% Output:  Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks of up to
%%              1024 bytes. End of file gives an empty binary. A read
%%              error is thrown as a string; cf/2 and cf/3 in this
%%              module turn a throw from the continuation fun into a
%%              fatal error.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
    case file:read(IoDevice, 1024) of
        {ok, FileBin} ->
            {FileBin, IoDevice};
        eof ->
            {<<>>, IoDevice};
        {error, Reason} ->
            %% Previously unhandled (case_clause crash).
            throw(lists:flatten(["Can't read from input file: ",
                                 file:format_error(Reason)]))
    end.
420
%%----------------------------------------------------------------------
%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
%%          Rest = binary()
%%          State = #xmerl_sax_parser_state{}
%% Output:  Result = {binary(), State}
%% Description: Fetches more input with the continuation fun until the
%%              collected bytes contain "?>". Returns all bytes
%%              collected so far together with the updated state.
%%----------------------------------------------------------------------
read_until_end_of_xml_directive(Bytes, State) ->
    read_directive_step(binary:match(Bytes, <<"?>">>), Bytes, State).

%% Acts on the outcome of the search for "?>": fetch more input and try
%% again, or return what has been collected.
read_directive_step(nomatch, Bytes, State) ->
    case cf(Bytes, State) of
        {<<>>, _} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        {MoreBytes, NextState} ->
            read_until_end_of_xml_directive(MoreBytes, NextState)
    end;
read_directive_step(_Found, Bytes, State) ->
    {Bytes, State}.
439
440
%%----------------------------------------------------------------------
%% Function  : cf(Rest, State) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%% Result    : {NewRest, NewState}
%% Description: Function that uses the continuation fun to read another
%%              chunk from the input stream. The chunk is appended to
%%              Rest and the new continuation state is stored in the
%%              state. A missing continuation fun, a throw or exit from
%%              it, or an empty chunk is a fatal error.
%%----------------------------------------------------------------------
cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Continuation function undefined");
cf(Rest, #xmerl_sax_parser_state{} = State) ->
    case call_continuation_fun(State) of
        {<<>>, _} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        {Chunk, ContState} ->
            {<<Rest/binary, Chunk/binary>>,
             State#xmerl_sax_parser_state{continuation_state = ContState}}
    end.

%% Calls the continuation fun with its state. A throw or an exit from
%% the fun is converted to a fatal error.
call_continuation_fun(#xmerl_sax_parser_state{continuation_fun = CFun,
                                              continuation_state = CState} = State) ->
    try
        CFun(CState)
    catch
        throw:ErrorTerm ->
            ?fatal_error(State, ErrorTerm);
        exit:Reason ->
            ?fatal_error(State, {'EXIT', Reason})
    end.
469
%%----------------------------------------------------------------------
%% Function  : cf(Rest, State, NextCall) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%%             NextCall = fun((binary(), State) -> Result)
%% Result    : the result of NextCall
%% Description: Function that uses provided fun to read another chunk from
%%              input stream and calls the fun in NextCall with the
%%              extended input and the updated state. The reading and
%%              its error handling is done by cf/2.
%%----------------------------------------------------------------------
cf(Rest, #xmerl_sax_parser_state{} = State, NextCall) ->
    {Extended, NewState} = cf(Rest, State),
    NextCall(Extended, NewState).
499