1%%-------------------------------------------------------------------- 2%% %CopyrightBegin% 3%% 4%% Copyright Ericsson AB 2008-2018. All Rights Reserved. 5%% 6%% Licensed under the Apache License, Version 2.0 (the "License"); 7%% you may not use this file except in compliance with the License. 8%% You may obtain a copy of the License at 9%% 10%% http://www.apache.org/licenses/LICENSE-2.0 11%% 12%% Unless required by applicable law or agreed to in writing, software 13%% distributed under the License is distributed on an "AS IS" BASIS, 14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15%% See the License for the specific language governing permissions and 16%% limitations under the License. 17%% 18%% %CopyrightEnd% 19%%---------------------------------------------------------------------- 20%% File : xmerl_sax_parser.erl 21%% Description : XML SAX parse API module. 22%% 23%% Created : 4 Jun 2008 24%%---------------------------------------------------------------------- 25-module(xmerl_sax_parser). 26 27%%---------------------------------------------------------------------- 28%% Include files 29%%---------------------------------------------------------------------- 30-include("xmerl_sax_parser.hrl"). 31 32%%---------------------------------------------------------------------- 33%% External exports 34%%---------------------------------------------------------------------- 35-export([file/2, 36 stream/3, 37 stream/2]). 38 39%%---------------------------------------------------------------------- 40%% Internal exports 41%%---------------------------------------------------------------------- 42-export([default_continuation_cb/1]). 

%%----------------------------------------------------------------------
%% Macros
%%----------------------------------------------------------------------

%%----------------------------------------------------------------------
%% Records
%%----------------------------------------------------------------------

%%======================================================================
%% External functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: file(Filename, Options) -> Result
%% Input:    Filename = string()
%%           Options  = [{OptTag, term()}]
%%           OptTag   = event_state | event_fun | continuation_state |
%%                      continuation_fun | ....
%% Output:   Result   = {ok, EventState, Rest}
%%                    | {error, {Filename, Reason}} when the file cannot
%%                      be opened (Reason is the file:format_error/1 text)
%%                    | any other result returned by stream/3
%%           Rest     = unicode_binary() | latin1_binary()
%%           EventState = term()
%% Description: Parse file containing an XML document.
%%              The file is read in chunks through default_continuation_cb/1
%%              and is closed again even if the parse raises an exception.
%%----------------------------------------------------------------------
file(Name, Options) ->
    case file:open(Name, [raw, read_ahead, read, binary]) of
        {error, Reason} ->
            {error, {Name, file:format_error(Reason)}};
        {ok, FD} ->
            try
                Dir = filename:dirname(Name),
                CL = filename:absname(Dir),
                File = filename:basename(Name),
                ContinuationFun = fun default_continuation_cb/1,
                %% The caller's options are appended last; parse_options/2
                %% applies options in list order, so they override the
                %% defaults set up here.
                stream(<<>>,
                       [{continuation_fun, ContinuationFun},
                        {continuation_state, FD},
                        {current_location, CL},
                        {entity, File}
                        | Options],
                       file)
            after
                %% Runs on normal return as well as on throw/error/exit, so
                %% the descriptor is never leaked by a failing callback.
                ok = file:close(FD)
            end
    end.

%%----------------------------------------------------------------------
%% Function: stream(Xml, Options) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [{OptTag, term()}]
%%           OptTag = event_state | event_fun | continuation_state |
%%                    continuation_fun | ....
%% Output:   Result = {ok, EventState, Rest}
%%           Rest = unicode_binary() | latin1_binary() | [unicode_char()]
%%           EventState = term()
%% Description: Parse a stream containing an XML document.
%%----------------------------------------------------------------------
stream(Xml, Options) ->
    stream(Xml, Options, stream).

%%----------------------------------------------------------------------
%% Function: stream(Xml, Options, InputType) -> Result
%% Input:    Xml = string() | binary()
%%           Options = [Option], see parse_options/2
%%           InputType = term(), stored in the parser state (this module
%%                       passes the atoms 'stream' and 'file')
%% Output:   Result = {error, Reason} when an option is rejected
%%                  | {fatal_error, {CurrentLocation, Entity, 1}, Reason,
%%                     [], EventState} when charset detection or parser
%%                     selection fails on binary input
%%                  | the result of the selected parser module
%% Description: List input is handed to xmerl_sax_parser_list. For binary
%%              input the character set is detected first and the
%%              matching parser module is chosen by parse_binary/3.
%%----------------------------------------------------------------------
stream(Xml, Options, InputType) when is_list(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            %% Same result as the binary clause below; without this branch
            %% the record access would fail with badrecord.
            {error, Reason};
        State ->
            ListState = State#xmerl_sax_parser_state{encoding = list,
                                                     input_type = InputType},
            case State#xmerl_sax_parser_state.file_type of
                dtd ->
                    xmerl_sax_parser_list:parse_dtd(Xml, ListState);
                normal ->
                    xmerl_sax_parser_list:parse(Xml, ListState)
            end
    end;
stream(Xml, Options, InputType) when is_binary(Xml), is_list(Options) ->
    case parse_options(Options, initial_state()) of
        {error, Reason} ->
            {error, Reason};
        State ->
            %% Name of the function to call in the encoding specific module.
            ParseFunction =
                case State#xmerl_sax_parser_state.file_type of
                    dtd ->
                        parse_dtd;
                    normal ->
                        parse
                end,
            try
                {Xml1, State1} = detect_charset(Xml, State),
                parse_binary(Xml1,
                             State1#xmerl_sax_parser_state{input_type = InputType},
                             ParseFunction)
            catch
                %% Fatal errors are thrown as {fatal_error, {State, Reason}};
                %% the line number is reported as 1.
                throw:{fatal_error, {State2, Reason}} ->
                    {fatal_error,
                     {
                       State2#xmerl_sax_parser_state.current_location,
                       State2#xmerl_sax_parser_state.entity,
                       1
                     },
                     Reason, [],
                     State2#xmerl_sax_parser_state.event_state}
            end
    end.

%%----------------------------------------------------------------------
%% Function: parse_binary(Xml, State, F) -> Result
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state
%%           F = atom()
%% Output:   Result = {ok, Rest, EventState}
%%           Rest = list() | binary()
%%           EventState = term()
%% Description: Chooses the correct parser depending on the encoding.
%%----------------------------------------------------------------------
%% F is the name of the function (parse or parse_dtd, as passed by
%% stream/3) to call in the parser module that matches State's encoding.
parse_binary(Xml, #xmerl_sax_parser_state{encoding=utf8}=State, F) ->
    xmerl_sax_parser_utf8:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,little}}=State, F) ->
    xmerl_sax_parser_utf16le:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) ->
    xmerl_sax_parser_utf16be:F(Xml, State);
parse_binary(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) ->
    xmerl_sax_parser_latin1:F(Xml, State);
parse_binary(_, #xmerl_sax_parser_state{encoding=Enc}=State, _F) ->
    %% The state record (not the function name in the third argument) must
    %% be thrown: the catch in stream/3 reads record fields from it.
    ?fatal_error(State, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))).

%%----------------------------------------------------------------------
%% Function: initial_state/0
%% Input:    -
%% Output:   #xmerl_sax_parser_state{}
%% Description: Creates the initial state record.
%%----------------------------------------------------------------------
initial_state() ->
    #xmerl_sax_parser_state{
       event_fun = fun default_event_cb/3,
       ns = [{"xml", "http://www.w3.org/XML/1998/namespace"}],
       current_location = ".",
       entity = ""
      }.

%%----------------------------------------------------------------------
%% Function: parse_options(Options, State)
%% Input:    Options = [Option]
%%           Option = {event_state, term()} | {event_fun, fun()} |
%%                    {continuation_state, term()} | {continuation_fun, fun()} |
%%                    {encoding, Encoding} | {file_type, FT} |
%%                    {current_location, term()} | {entity, term()} |
%%                    skip_external_dtd
%%           FT = normal | dtd
%%           Encoding = utf8 | utf16 | {utf16,big} | {utf16,little} |
%%                      latin1 | list
%%           State = #xmerl_sax_parser_state{}
%% Output:   #xmerl_sax_parser_state{} | {error, Reason}
%% Description: Checks the parser options.
%%----------------------------------------------------------------------
%% Options are applied in list order, so a later occurrence of an option
%% overrides an earlier one. The first unsupported option stops the
%% traversal and {error, Reason} is returned instead of a state record.
parse_options([], State) ->
    State;
parse_options([{event_state, CbState} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{event_state = CbState});
parse_options([{event_fun, CbF} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{event_fun = CbF});
parse_options([{continuation_state, CState} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{continuation_state = CState});
parse_options([{continuation_fun, CF} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{continuation_fun = CF});
parse_options([{file_type, FT} |Options], State) when FT =:= normal; FT =:= dtd ->
    parse_options(Options, State#xmerl_sax_parser_state{file_type = FT});
parse_options([{encoding, E} |Options], State) ->
    case check_encoding_option(E) of
        {error, Reason} ->
            {error, Reason};
        Enc ->
            parse_options(Options, State#xmerl_sax_parser_state{encoding = Enc})
    end;
parse_options([{current_location, CL} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{current_location = CL});
parse_options([{entity, Entity} |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{entity = Entity});
parse_options([skip_external_dtd |Options], State) ->
    parse_options(Options, State#xmerl_sax_parser_state{skip_external_dtd = true});
parse_options([O |_], _State) ->
    {error, lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.


%%----------------------------------------------------------------------
%% Function: check_encoding_option(Encoding) -> Enc | {error, Reason}
%% Input:    Encoding = term()
%% Output:   Enc = utf8 | {utf16,little} | {utf16,big} | latin1 | list
%%           Reason = string()
%% Description: Validates the value of the encoding option. The bare atom
%%              utf16 is mapped to {utf16,big}.
%%----------------------------------------------------------------------
check_encoding_option(E) when E =:= utf8; E =:= {utf16,little}; E =:= {utf16,big};
                              E =:= latin1; E =:= list ->
    E;
check_encoding_option(utf16) ->
    {utf16,big};
check_encoding_option(E) ->
    %% Flattened so that the reason is a plain string, like the reason
    %% produced for unsupported options in parse_options/2.
    {error, lists:flatten(io_lib:format("Character set ~p not supported", [E]))}.

%%----------------------------------------------------------------------
%% Function: detect_charset(Xml, State)
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   {Xml, State}
%% Description: Detects which character set is used in a binary stream.
%%              A byte order mark, when present, is removed from the
%%              returned binary and decides the encoding stored in State.
%%              Otherwise the first bytes and the XML declaration are
%%              inspected by detect_charset_1/2.
%%----------------------------------------------------------------------
detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Can't detect character encoding due to lack of indata");
detect_charset(<<>>, State) ->
    cf(<<>>, State, fun detect_charset/2);
detect_charset(Bytes, State) ->
    case unicode:bom_to_encoding(Bytes) of
        {latin1, 0} ->
            %% No byte order mark found.
            detect_charset_1(Bytes, State);
        {Enc, Length} ->
            <<_:Length/binary, RealBytes/binary>> = Bytes,
            {RealBytes, State#xmerl_sax_parser_state{encoding=Enc}}
    end.

%% Looks for the start of an XML declaration ("<?" in UTF-16 or "<?xml" in
%% an 8-bit encoding). When the input is only a prefix of such a pattern,
%% more data is fetched through cf/3 and the detection is retried. Input
%% that matches none of the patterns is returned with State unchanged.

%% UTF-16 big endian: 00 3C 00 3F
detect_charset_1(<<16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}};
%% A lone "<" is a prefix of both the UTF-16 little endian pattern and
%% "<?xml"; this single clause covers both.
detect_charset_1(<<16#3C>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
%% UTF-16 little endian: 3C 00 3F 00
detect_charset_1(<<16#3C, 16#00>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) ->
    {Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}};
%% 8-bit encodings: "<?xml" is 3C 3F 78 6D 6C
detect_charset_1(<<16#3C, 16#3F>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D>> = Xml, State) ->
    cf(Xml, State, fun detect_charset_1/2);
detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>>, State) ->
    {Xml3, State1} = read_until_end_of_xml_directive(Xml2, State),
    %% The declaration is only inspected here; the complete input,
    %% including "<?xml", is handed on to the parser.
    Xml4 = <<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml3/binary>>,
    AttrList = parse_xml_directive(Xml3, State1),
    case lists:keyfind("encoding", 1, AttrList) of
        {_, E} ->
            Enc = convert_encoding(E, State1),
            {Xml4, State1#xmerl_sax_parser_state{encoding=Enc}};
        false ->
            {Xml4, State1}
    end;
detect_charset_1(Xml, State) ->
    {Xml, State}.

%%----------------------------------------------------------------------
%% Function: convert_encoding(Enc, State)
%% Input:    Enc = string()
%%           State = #xmerl_sax_parser_state{}
%% Output:   utf8 | latin1
%% Description: Converting 7,8 bit and utf8 encoding strings to internal format.
%%              The comparison is case insensitive. An unknown name is a
%%              fatal error.
%%----------------------------------------------------------------------
convert_encoding(Enc, State) -> %% Just for 7,8 bit + utf8
    case string:to_lower(Enc) of
        "utf-8" -> utf8;
        "us-ascii" -> utf8;
        "latin1" -> latin1;
        %% Handle iso-8859-1 to iso-8859-9 as latin1
        "iso-8859-" ++ [N] when $1 =< N, N =< $9 -> latin1;
        _ -> ?fatal_error(State, "Unknown encoding: " ++ Enc)
    end.

%%----------------------------------------------------------------------
%% Function: parse_xml_directive(Xml, State)
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   [{Name, Value}]
%% Description: Parsing the xml declaration from the input stream.
%%----------------------------------------------------------------------
%% Xml is the input that follows "<?xml". It has to start with whitespace;
%% anything else (for example "<?xmlx" or "<?xml-stylesheet") is reported
%% as a fatal error instead of crashing with function_clause.
parse_xml_directive(<<C, Rest/binary>>, State) when ?is_whitespace(C) ->
    parse_xml_directive_1(Rest, [], State);
parse_xml_directive(_, State) ->
    ?fatal_error(State, "Expected whitespace in directive").

%%----------------------------------------------------------------------
%% Function: parse_xml_directive_1(Xml, Acc, State) -> [{Name, Value}]
%% Input:    Xml = binary()
%%           Acc = [{Name, Value}]
%%           Name = string()
%%           Value = string()
%%           State = #xmerl_sax_parser_state{}
%% Output:   see above
%% Description: Parsing the xml declaration from the input stream.
%%              Attributes are collected until "?>" is reached; the most
%%              recently parsed attribute is first in the result.
%%----------------------------------------------------------------------
parse_xml_directive_1(<<C, Rest/binary>>, Acc, State) when ?is_whitespace(C) ->
    parse_xml_directive_1(Rest, Acc, State);
parse_xml_directive_1(<<"?>", _/binary>>, Acc, _State) ->
    Acc;
parse_xml_directive_1(<<C, Rest/binary>>, Acc, State) when $a =< C, C =< $z ->
    {Name, Rest1} = parse_name(Rest, [C]),
    Rest2 = parse_eq(Rest1, State),
    {Value, Rest3} = parse_value(Rest2, State),
    parse_xml_directive_1(Rest3, [{Name, Value} |Acc], State);
parse_xml_directive_1(_, _, State) ->
    ?fatal_error(State, "Unknown attribute in xml directive").

%%----------------------------------------------------------------------
%% Function: parse_name(Xml, Acc) -> {Name, Rest}
%% Input:    Xml = binary()
%%           Acc = string(), characters read so far in reverse order
%% Output:   Name = string()
%%           Rest = binary()
%% Description: Parsing an attribute name from the stream. The name ends
%%              at the first byte that is not a lower case letter a-z.
%%----------------------------------------------------------------------
parse_name(<<C, Rest/binary>>, Acc) when $a =< C, C =< $z ->
    parse_name(Rest, [C |Acc]);
parse_name(Rest, Acc) ->
    {lists:reverse(Acc), Rest}.

%%----------------------------------------------------------------------
%% Function: parse_eq(Xml, State) -> Rest
%% Input:    Xml = binary()
%% Output:   Rest = binary()
%% Description: Reads an '=' from the stream.
%%----------------------------------------------------------------------
%% Leading whitespace is skipped; the input after the '=' is returned.
%% Anything other than whitespace or '=' is a fatal error.
parse_eq(<<Ch, Tail/binary>>, State) when ?is_whitespace(Ch) ->
    parse_eq(Tail, State);
parse_eq(<<$=, Tail/binary>>, _State) ->
    Tail;
parse_eq(_Other, State) ->
    ?fatal_error(State, "expecting = or whitespace").

%%----------------------------------------------------------------------
%% Function: parse_value(Xml, State) -> {Value, Rest}
%% Input:    Xml = binary()
%%           State = #xmerl_sax_parser_state{}
%% Output:   Value = string()
%%           Rest = binary()
%% Description: Skips whitespace up to the opening quote (' or ") and
%%              reads the quoted attribute value that follows it.
%%----------------------------------------------------------------------
parse_value(<<Ch, Tail/binary>>, State) when ?is_whitespace(Ch) ->
    parse_value(Tail, State);
parse_value(<<Quote, Tail/binary>>, State) when Quote == $'; Quote == $" ->
    parse_value_1(Tail, Quote, [], State);
parse_value(_Other, State) ->
    ?fatal_error(State, "\', \" or whitespace expected").

%%----------------------------------------------------------------------
%% Function: parse_value_1(Xml, Stop, Acc, State) -> {Value, Rest}
%% Input:    Xml = binary()
%%           Stop = $' | $"
%%           Acc = list(), bytes already read in reverse order
%%           State = #xmerl_sax_parser_state{}
%% Output:   Value = string()
%%           Rest = binary()
%% Description: Reads bytes up to the first occurrence of Stop. The bytes
%%              before Stop form the value and the input after Stop is
%%              returned as Rest. Running out of input before Stop is
%%              found is a fatal error.
%%----------------------------------------------------------------------
parse_value_1(Xml, Stop, Acc, State) when is_binary(Xml) ->
    case binary:split(Xml, <<Stop>>) of
        [ValueBin, Rest] ->
            {lists:reverse(Acc, binary_to_list(ValueBin)), Rest};
        [_NoStop] ->
            ?fatal_error(State, "end of input and no \' or \" found")
    end;
parse_value_1(_Other, _Stop, _Acc, State) ->
    ?fatal_error(State, "end of input and no \' or \" found").

%%======================================================================
%% Default functions
%%======================================================================
%%----------------------------------------------------------------------
%% Function: default_event_cb(Event, LineNo, State) -> State
%% Input:    Event = tuple()
%%           LineNo = integer()
%%           State = term()
%% Output:   State, unchanged
%% Description: Default event callback. The event and the line number
%%              are ignored.
%%----------------------------------------------------------------------
default_event_cb(_Event, _LineNo, State) ->
    State.

%%----------------------------------------------------------------------
%% Function: default_continuation_cb(IoDevice) -> Result
%%           IoDevice = iodevice()
%% Output:   Result = {binary(), IoDevice}
%% Description: Default continuation callback reading blocks of at most
%%              1024 bytes. End of file is reported as an empty binary.
%%----------------------------------------------------------------------
default_continuation_cb(IoDevice) ->
    Chunk =
        case file:read(IoDevice, 1024) of
            {ok, FileBin} ->
                FileBin;
            eof ->
                <<>>
        end,
    {Chunk, IoDevice}.

%%----------------------------------------------------------------------
%% Function: read_until_end_of_xml_directive(Rest, State) -> Result
%%           Rest = binary()
%% Output:   Result = {binary(), State}
%% Description: Extends Rest with chunks fetched through cf/2 until it
%%              contains '?>'. Returns the complete binary read so far
%%              together with the updated state.
%%----------------------------------------------------------------------
read_until_end_of_xml_directive(Rest, State) ->
    case binary:match(Rest, <<"?>">>) of
        nomatch ->
            %% End of the declaration not seen yet: fetch another chunk.
            case cf(Rest, State) of
                {<<>>, _} ->
                    ?fatal_error(State, "Can't detect character encoding due to lack of indata");
                {Extended, State1} ->
                    read_until_end_of_xml_directive(Extended, State1)
            end;
        _Found ->
            {Rest, State}
    end.


%%----------------------------------------------------------------------
%% Function  : cf(Rest, State) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%% Result    : {Rest, State}
%% Description: Calls the continuation fun stored in State to read another
%%      chunk from the input stream. The chunk is appended to Rest and the
%%      new continuation state is stored in the returned State.
%%      A missing continuation fun, an empty chunk, and a throw or exit
%%      from the continuation fun are all reported as fatal errors.
%%----------------------------------------------------------------------
cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State) ->
    ?fatal_error(State, "Continuation function undefined");
cf(Rest, #xmerl_sax_parser_state{continuation_fun = ReadFun,
                                 continuation_state = ReadState} = State) ->
    Answer =
        try
            ReadFun(ReadState)
        catch
            throw:Thrown ->
                ?fatal_error(State, Thrown);
            exit:ExitReason ->
                ?fatal_error(State, {'EXIT', ExitReason})
        end,
    case Answer of
        {<<>>, _} ->
            ?fatal_error(State, "Can't detect character encoding due to lack of indata");
        {Chunk, NextReadState} ->
            {<<Rest/binary, Chunk/binary>>,
             State#xmerl_sax_parser_state{continuation_state = NextReadState}}
    end.

%%----------------------------------------------------------------------
%% Function  : cf(Rest, State, NextCall) -> Result
%% Parameters: Rest = binary()
%%             State = #xmerl_sax_parser_state{}
%%             NextCall = fun()
%% Result    : {Rest, State}
%% Description: Function that uses provided fun to read another chunk from
%%      input stream and calls the fun in NextCall.
%%----------------------------------------------------------------------
%% Reading the next chunk and all error handling is done by cf/2; the
%% extended binary and the updated state are then passed on to NextCall.
cf(Rest, State, NextCall) ->
    {Extended, State1} = cf(Rest, State),
    NextCall(Extended, State1).
