1%%-------------------------------------------------------------------- 2%% %CopyrightBegin% 3%% 4%% Copyright Ericsson AB 2008-2018. All Rights Reserved. 5%% 6%% Licensed under the Apache License, Version 2.0 (the "License"); 7%% you may not use this file except in compliance with the License. 8%% You may obtain a copy of the License at 9%% 10%% http://www.apache.org/licenses/LICENSE-2.0 11%% 12%% Unless required by applicable law or agreed to in writing, software 13%% distributed under the License is distributed on an "AS IS" BASIS, 14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15%% See the License for the specific language governing permissions and 16%% limitations under the License. 17%% 18%% %CopyrightEnd% 19%%---------------------------------------------------------------------- 20%% File : xmerl_sax_parser.erl 21%% Description : XML SAX parse API module. 22%% 23%% Created : 4 Jun 2008 24%%---------------------------------------------------------------------- 25-module(xmerl_sax_parser). 26 27%%---------------------------------------------------------------------- 28%% Include files 29%%---------------------------------------------------------------------- 30-include("xmerl_sax_parser.hrl"). 31 32%%---------------------------------------------------------------------- 33%% External exports 34%%---------------------------------------------------------------------- 35-export([file/2, 36 stream/3, 37 stream/2]). 38 39%%---------------------------------------------------------------------- 40%% Internal exports 41%%---------------------------------------------------------------------- 42-export([default_continuation_cb/1]). 
%%----------------------------------------------------------------------
%% Macros
%%----------------------------------------------------------------------

%%----------------------------------------------------------------------
%% Records
%%----------------------------------------------------------------------

%%======================================================================
%% External functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: file(Filename, Options) -> Result
%% Input:    Filename = string()
%%           Options  = [{OptTag, term()}]
%%           OptTag   = event_state | event_fun | continuation_state |
%%                      continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest} |
%%                    {error, {Filename, Reason}} when the file cannot
%%                    be opened (Reason from file:format_error/1)
%%           Rest   = unicode_binary() | latin1_binary()
%%           EventState = term()
%% Description: Parse file containing an XML document.
%%              The file is read through default_continuation_cb/1 and
%%              is closed again even if the parsing raises an exception.
%%----------------------------------------------------------------------
file(Name, Options) ->
    case file:open(Name, [raw, read_ahead, read, binary]) of
        {error, Reason} ->
            {error, {Name, file:format_error(Reason)}};
        {ok, FD} ->
            Dir = filename:dirname(Name),
            CL = filename:absname(Dir),
            File = filename:basename(Name),
            ContinuationFun = fun default_continuation_cb/1,
            %% The defaults come first so that entries in Options, which
            %% are processed later by parse_options/2, override them.
            StreamOptions = [{continuation_fun, ContinuationFun},
                             {continuation_state, FD},
                             {current_location, CL},
                             {entity, File}
                             | Options],
            try
                stream(<<>>, StreamOptions, file)
            after
                %% Runs on both the normal and the exceptional path so the
                %% descriptor is not leaked when a callback throws.
                ok = file:close(FD)
            end
    end.

%%----------------------------------------------------------------------
%% Function: stream(Xml, Options) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest}
%%           Rest   = unicode_binary() | latin1_binary() | [unicode_char()]
%%           EventState = term()
%% Description: Parse a stream containing an XML document.
%%----------------------------------------------------------------------
stream(Xml, Options) ->
    stream(Xml, Options, stream).

%%----------------------------------------------------------------------
%% Function: stream(Xml, Options, InputType) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [{OptTag, term()}]
%%           InputType = file | stream (file/2 passes file, stream/2
%%                       passes stream)
%% Output:   Result = the value returned by the selected parser module |
%%                    {error, Reason} when an option is rejected by
%%                    parse_options/2 |
%%                    {fatal_error, {CurrentLocation, Entity, 1}, Reason,
%%                     [], EventState} when a fatal_error is thrown out
%%                    of the binary branch
%% Description: Parses list input with xmerl_sax_parser_list. For binary
%%              input the character set is detected first and the parser
%%              is chosen from the detected encoding.
%%----------------------------------------------------------------------
stream(Xml, Options, InputType) when is_list(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            %% Same handling as the binary clause; without it the record
            %% access below would fail with badrecord.
            {error, Reason};
        State ->
            ListState = State#xmerl_sax_parser_state{encoding = list,
                                                     input_type = InputType},
            case ListState#xmerl_sax_parser_state.file_type of
                dtd ->
                    xmerl_sax_parser_list:parse_dtd(Xml, ListState);
                normal ->
                    xmerl_sax_parser_list:parse(Xml, ListState)
            end
    end;
stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            {error, Reason};
        State ->
            ParseFunction =
                case State#xmerl_sax_parser_state.file_type of
                    dtd ->
                        parse_dtd;
                    normal ->
                        parse
                end,
            try
                {Xml1, State1} = detect_charset(Xml, State),
                parse_binary(Xml1,
                             State1#xmerl_sax_parser_state{input_type = InputType},
                             ParseFunction)
            catch
                throw:{fatal_error, {State2, Reason}} ->
                    %% Translate the thrown state into the fatal_error
                    %% result tuple; the line number is fixed to 1.
                    {fatal_error,
                     {
                      State2#xmerl_sax_parser_state.current_location,
                      State2#xmerl_sax_parser_state.entity,
                      1
                     },
                     Reason, [],
                     State2#xmerl_sax_parser_state.event_state}
            end
    end.

%%----------------------------------------------------------------------
%% Function: parse_binary(Xml, State, F) -> Result
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%%           F = atom(), name of the parser function to call
%% Output:   Result = the value returned by the selected parser module
%% Description: Chooses the correct parser depending on the encoding
%%              stored in State.
%%----------------------------------------------------------------------
parse_binary(Xml, #xmerl_sax_parser_state{encoding = utf8} = State, F) ->
    xmerl_sax_parser_utf8:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding = {utf16, little}} = State, F) ->
    xmerl_sax_parser_utf16le:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding = {utf16, big}} = State, F) ->
    xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding = latin1} = State, F) ->
    xmerl_sax_parser_latin1:F(Xml, State);
parse_binary(_Xml, #xmerl_sax_parser_state{encoding = Enc} = State, _F) ->
    %% The parser state record must be thrown here, not the function name:
    %% the catch in stream/3 reads record fields from the thrown term.
    ?fatal_error(State, lists:flatten(io_lib:format("Character set ~p not supported", [Enc]))).

%%----------------------------------------------------------------------
%% Function: initial_state/0
%% Input:    -
%% Output:   #xmerl_sax_parser_state{}
%% Description: Creates the initial state record with the default event
%%              callback, the predefined "xml" namespace binding, "." as
%%              current location and an empty entity name.
%%----------------------------------------------------------------------
initial_state() ->
    #xmerl_sax_parser_state{
       event_fun = fun default_event_cb/3,
       ns = [{"xml", "http://www.w3.org/XML/1998/namespace"}],
       current_location = ".",
       entity = ""
      }.

%%----------------------------------------------------------------------
%% Function: parse_options(Options, State)
%% Input:    Options = [Option]
%%           Option = {event_state, term()} | {event_fun, fun()} |
%%                    {continuation_state, term()} | {continuation_fun, fun()} |
%%                    {encoding, Encoding} | {file_type, FT} |
%%                    {current_location, term()} | {entity, term()} |
%%                    skip_external_dtd
%%           FT = normal | dtd
%%           Encoding = utf8 | utf16 | {utf16,little} | {utf16,big} |
%%                      latin1 | list
%%           State = #xmerl_sax_parser_state{}
%% Output:   #xmerl_sax_parser_state{} | {error, Reason}
%% Description: Checks the parser options.
%%----------------------------------------------------------------------
parse_options([], State) ->
    State;
parse_options([{event_state, CbState} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{event_state = CbState});
parse_options([{event_fun, CbF} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{event_fun = CbF});
parse_options([{continuation_state, CState} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{continuation_state = CState});
parse_options([{continuation_fun, CF} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{continuation_fun = CF});
parse_options([{file_type, FT} | Options], State) when FT == normal; FT == dtd ->
    parse_options(Options, State#xmerl_sax_parser_state{file_type = FT});
parse_options([{encoding, E} | Options], State) ->
    case check_encoding_option(E) of
        {error, _Reason} = Error ->
            %% Stop at the first rejected option.
            Error;
        Enc ->
            parse_options(Options, State#xmerl_sax_parser_state{encoding = Enc})
    end;
parse_options([{current_location, CL} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{current_location = CL});
parse_options([{entity, Entity} | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{entity = Entity});
parse_options([skip_external_dtd | Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{skip_external_dtd = true});
parse_options([O | _], _State) ->
    %% Anything else, including a file_type other than normal | dtd.
    {error, lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.

%%----------------------------------------------------------------------
%% Function: check_encoding_option(Encoding) -> Result
%% Input:    Encoding = term()
%% Output:   Result = utf8 | {utf16,little} | {utf16,big} | latin1 | list |
%%                    {error, Reason}
%%           Reason = string()
%% Description: Validates the value of the encoding option. The atom
%%              utf16 is mapped to {utf16,big}. The error reason is
%%              flattened so that it is a plain string, like the reason
%%              for an unsupported option in parse_options/2.
%%----------------------------------------------------------------------
check_encoding_option(E) when E == utf8; E == {utf16, little}; E == {utf16, big};
                              E == latin1; E == list ->
    E;
check_encoding_option(utf16) ->
    {utf16, big};
check_encoding_option(E) ->
    {error, lists:flatten(io_lib:format("Character set ~p not supported", [E]))}.
%%----------------------------------------------------------------------
%% Function: detect_charset(Xml, State) -> {Xml1, State1}
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Xml1 = binary(), the input without a leading byte order
%%                  mark, possibly extended with data read through the
%%                  continuation function
%%           State1 = #xmerl_sax_parser_state{}, with the encoding field
%%                    updated when an encoding was detected
%% Description: Detects which character set is used in a binary stream.
%%              Throws a fatal error if there is no data and no
%%              continuation function.
%%----------------------------------------------------------------------
detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Can't detect character encoding due to lack of indata");
detect_charset(<<>>, State) ->
    cf(<<>>, State, fun detect_charset/2);
detect_charset(Bytes, State) ->
    case unicode:bom_to_encoding(Bytes) of
        {latin1, 0} ->
            %% No byte order mark recognised; look at the first bytes.
            detect_charset_1(Bytes, State);
        {Enc, Length} ->
            <<_:Length/binary, RealBytes/binary>> = Bytes,
            {RealBytes, State#xmerl_sax_parser_state{encoding = Enc}}
    end.

%%----------------------------------------------------------------------
%% Function: detect_charset_1(Xml, State) -> {Xml1, State1}
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   see detect_charset/2
%% Description: Detects the encoding from the first bytes of the stream.
%%              While the data is only a proper prefix of one of the
%%              recognised byte sequences, more data is fetched through
%%              the continuation function. Input matching none of the
%%              sequences is returned with the state unchanged.
%%----------------------------------------------------------------------
%% 00 3C 00 3F: "<?" with two bytes per character, high byte first
detect_charset_1(<<16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding = {utf16, big}}};
%% 3C 00 3F 00: "<?" with two bytes per character, low byte first.
%% A lone 3C is also a prefix of "<?xml" below, so one clause serves both.
detect_charset_1(<<16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding = {utf16, little}}};
%% 3C 3F 78 6D 6C: "<?xml" with one byte per character
detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
    {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
    %% Use State1 from here on: reading may have advanced the
    %% continuation state.
    AttrList = parse_xml_directive(Xml3, State1),
    %% The declaration is only inspected, so the complete data is handed
    %% back to the caller with the "<?xml" prefix restored.
    Document = <<"<?xml", Xml3/binary>>,
    case lists:keyfind("encoding", 1, AttrList) of
        {_, E} ->
            Enc = convert_encoding(E, State1),
            {Document, State1#xmerl_sax_parser_state{encoding = Enc}};
        false ->
            {Document, State1}
    end;
detect_charset_1(Xml, State) ->
    {Xml, State}.

%%----------------------------------------------------------------------
%% Function: convert_encoding(Enc, State) -> Encoding
%% Input:    Enc = string()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Encoding = utf8 | latin1
%% Description: Converting 7,8 bit and utf8 encoding strings to internal
%%              format. The comparison is case insensitive. Throws a
%%              fatal error for any other encoding name.
%%----------------------------------------------------------------------
convert_encoding(Enc, State) -> %% Just for 7,8 bit + utf8
    case string:to_lower(Enc) of
        "utf-8" -> utf8;
        "us-ascii" -> utf8;
        "latin1" -> latin1;
        %% Handle iso-8859-1 up to iso-8859-9 as latin1
        "iso-8859-" ++ [Part] when Part >= $1, Part =< $9 -> latin1;
        _ -> ?fatal_error(State, "Unknown encoding: " ++ Enc)
    end.

%%----------------------------------------------------------------------
%% Function: parse_xml_directive(Xml, State) -> [{Name, Value}]
%% Input:    Xml = binary(), the data following "<?xml"
%%           State = #xmerl_sax_parser_state{}
%% Output:   see parse_xml_directive_1/3
%% Description: Parsing the xml declaration from the input stream.
%%----------------------------------------------------------------------
parse_xml_directive(<<Char, Tail/binary>>, State) when ?is_whitespace(Char) ->
    parse_xml_directive_1(Tail, [], State);
parse_xml_directive(_NoWhitespace, State) ->
    ?fatal_error(State, "Expected whitespace in directive").


%%----------------------------------------------------------------------
%% Function: parse_xml_directive_1(Xml, Acc, State) -> [{Name, Value}]
%% Input:    Xml = binary()
%%           Acc = [{Name, Value}]
%%           State = #xmerl_sax_parser_state{}
%%           Name = string()
%%           Value = string()
%% Output:   The attributes collected up to "?>", last one read first.
%% Description: Reads name="value" pairs from the xml declaration until
%%              "?>" is found. Whitespace between the pairs is skipped
%%              and an attribute has to start with a character in a-z.
%%----------------------------------------------------------------------
parse_xml_directive_1(<<Char, Tail/binary>>, Attrs, State) when ?is_whitespace(Char) ->
    parse_xml_directive_1(Tail, Attrs, State);
parse_xml_directive_1(<<"?>", _/binary>>, Attrs, _State) ->
    Attrs;
parse_xml_directive_1(<<First, Tail/binary>>, Attrs, State) when First >= $a, First =< $z ->
    {AttrName, AfterName} = parse_name(Tail, [First]),
    AfterEq = parse_eq(AfterName, State),
    {AttrValue, AfterValue} = parse_value(AfterEq, State),
    parse_xml_directive_1(AfterValue, [{AttrName, AttrValue} | Attrs], State);
parse_xml_directive_1(_Other, _Attrs, State) ->
    ?fatal_error(State, "Unknown attribute in xml directive").

%%----------------------------------------------------------------------
%% Function: parse_name(Xml, Acc) -> {Name, Rest}
%% Input:    Xml = binary()
%%           Acc = string(), the characters read so far in reverse order
%% Output:   Name = string()
%%           Rest = binary()
%% Description: Collects characters in a-z from the stream and returns
%%              them as the attribute name together with the remaining
%%              data.
%%----------------------------------------------------------------------
parse_name(<<Char, Tail/binary>>, Reversed) when Char >= $a, Char =< $z ->
    parse_name(Tail, [Char | Reversed]);
parse_name(Remaining, Reversed) ->
    {lists:reverse(Reversed), Remaining}.
%%----------------------------------------------------------------------
%% Function: parse_eq(Xml, State) -> Rest
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Rest = binary(), the data following the '='
%% Description: Skips leading whitespace and reads an '=' from the
%%              stream. Anything else gives a fatal error.
%%----------------------------------------------------------------------
parse_eq(<<Char, Tail/binary>>, State) when ?is_whitespace(Char) ->
    parse_eq(Tail, State);
parse_eq(<<$=, Tail/binary>>, _State) ->
    Tail;
parse_eq(_Other, State) ->
    ?fatal_error(State, "expecting = or whitespace").

%%----------------------------------------------------------------------
%% Function: parse_value(Xml, State) -> {Value, Rest}
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Value = string()
%%           Rest = binary()
%% Description: Skips leading whitespace and reads a quoted attribute
%%              value from the stream. The value has to be closed by
%%              the same quote character that opened it.
%%----------------------------------------------------------------------
parse_value(<<Char, Tail/binary>>, State) when ?is_whitespace(Char) ->
    parse_value(Tail, State);
parse_value(<<Quote, Tail/binary>>, State) when Quote =:= $'; Quote =:= $" ->
    parse_value_1(Tail, Quote, [], State);
parse_value(_Other, State) ->
    ?fatal_error(State, "\', \" or whitespace expected").

%%----------------------------------------------------------------------
%% Function: parse_value_1(Xml, Stop, Acc, State) -> {Value, Rest}
%% Input:    Xml = binary()
%%           Stop = $' | $"
%%           Acc = list(), the characters read so far in reverse order
%%           State = #xmerl_sax_parser_state{}
%% Output:   Value = string()
%%           Rest = binary(), the data following the closing quote
%% Description: Collects the bytes of an attribute value until the
%%              closing quote character Stop is found.
%%----------------------------------------------------------------------
parse_value_1(<<Stop, Tail/binary>>, Stop, Reversed, _State) ->
    {lists:reverse(Reversed), Tail};
parse_value_1(<<Char, Tail/binary>>, Stop, Reversed, State) ->
    parse_value_1(Tail, Stop, [Char | Reversed], State);
parse_value_1(_Empty, _Stop, _Reversed, State) ->
    ?fatal_error(State, "end of input and no \' or \" found").
%%======================================================================
%% Default functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> State
%% Input:    Event = tuple()
%%           LineNo = integer()
%%           State = term()
%% Output:   State, unchanged
%% Description: Default event callback; ignores the event and returns
%%              the state it was given.
%%----------------------------------------------------------------------
default_event_cb(_Event, _LineNo, State) ->
    State.

%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%%           IoDevice = iodevice()
%% Output:   Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks of at most
%%              1024 bytes. An empty binary is returned at end of file.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
    case file:read(IoDevice, 1024) of
        eof ->
            {<<>>, IoDevice};
        {ok, FileBin} ->
            {FileBin, IoDevice}
    end.

%%----------------------------------------------------------------------
%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
%%           Rest = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Result = {binary(), State}
%% Description: Reads a utf8 or latin1 until it finds '?>'
%%              More data is appended through cf/2 as long as '?>' is
%%              missing. cf/2 throws a fatal error when no more data is
%%              delivered, so it never returns an empty binary here.
%%----------------------------------------------------------------------
read_until_end_of_xml_directive(Rest, State) ->
    case binary:match(Rest, <<"?>">>) of
        nomatch ->
            {NewBytes, NewState} = cf(Rest, State),
            read_until_end_of_xml_directive(NewBytes, NewState);
        _Found ->
            {Rest, State}
    end.
%%----------------------------------------------------------------------
%% Function  : cf(Rest, State) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%% Result    : {NewRest, NewState}
%% Description: Function that uses the continuation fun in State to read
%%              another chunk from the input stream. The chunk is
%%              appended to Rest and the new continuation state is
%%              stored in the returned state. A missing continuation
%%              fun or an empty chunk gives a fatal error.
%%----------------------------------------------------------------------
cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Continuation function undefined");
cf(Rest, #xmerl_sax_parser_state{} = State) ->
    case run_continuation(State) of
        {<<>>, _} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        {Chunk, NextContState} ->
            {<<Rest/binary, Chunk/binary>>,
             State#xmerl_sax_parser_state{continuation_state = NextContState}}
    end.

%% Calls the continuation fun with the continuation state. A throw or an
%% exit from the fun is turned into a fatal error.
run_continuation(#xmerl_sax_parser_state{continuation_fun = CFun,
                                         continuation_state = CState} = State) ->
    try
        CFun(CState)
    catch
        throw:ErrorTerm ->
            ?fatal_error(State, ErrorTerm);
        exit:Reason ->
            ?fatal_error(State, {'EXIT', Reason})
    end.

%%----------------------------------------------------------------------
%% Function  : cf(Rest, State, NextCall) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%%             NextCall = fun()
%% Result    : the value returned by NextCall
%% Description: Function that uses provided fun to read another chunk from
%%              input stream and calls the fun in NextCall.
%%----------------------------------------------------------------------
cf(Rest, #xmerl_sax_parser_state{} = State, NextCall) ->
    %% cf/2 does the reading and raises the fatal errors for a missing
    %% continuation fun, a failing continuation fun and an empty chunk.
    {Bytes, NewState} = cf(Rest, State),
    NextCall(Bytes, NewState).