1 // Copyright (c) 2014-2020 Thomas Fussell 2 // Copyright (c) 2010-2015 openpyxl 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE 21 // 22 // @license: http://www.opensource.org/licenses/mit-license.php 23 // @author: see AUTHORS file 24 25 #pragma once 26 27 #include <cstdint> 28 #include <functional> 29 #include <iostream> 30 #include <memory> 31 #include <string> 32 #include <unordered_map> 33 #include <vector> 34 35 #include <detail/external/include_libstudxml.hpp> 36 #include <detail/serialization/zstream.hpp> 37 #include <xlnt/utils/numeric.hpp> 38 39 namespace xlnt { 40 41 class cell; 42 class color; 43 class rich_text; 44 class manifest; 45 template<typename T> 46 class optional; 47 class path; 48 class relationship; 49 class streaming_workbook_reader; 50 class variant; 51 class workbook; 52 class worksheet; 53 54 namespace detail { 55 56 class izstream; 57 struct cell_impl; 58 struct worksheet_impl; 59 
60 /// <summary> 61 /// Handles writing a workbook into an XLSX file. 62 /// </summary> 63 class xlsx_consumer 64 { 65 public: 66 xlsx_consumer(workbook &destination); 67 68 ~xlsx_consumer(); 69 70 void read(std::istream &source); 71 72 void read(std::istream &source, const std::string &password); 73 74 private: 75 friend class xlnt::streaming_workbook_reader; 76 77 void open(std::istream &source); 78 79 bool has_cell(); 80 81 /// <summary> 82 /// Reads the next cell in the current worksheet and optionally returns it if 83 /// the last cell in the sheet has not yet been read. An exception will be thrown 84 /// if this is not open as a streaming consumer. 85 /// </summary> 86 cell read_cell(); 87 88 /// <summary> 89 /// Read all the files needed from the XLSX archive and initialize all of 90 /// the data in the workbook to match. 91 /// </summary> 92 void populate_workbook(bool streaming); 93 94 /// <summary> 95 /// 96 /// </summary> 97 void read_content_types(); 98 99 // Metadata Property Readers 100 101 /// <summary> 102 /// Parse the core properties about the current package. 103 /// </summary> 104 void read_core_properties(); 105 106 /// <summary> 107 /// Parse the core properties about the current package. 108 /// </summary> 109 void read_extended_properties(); 110 111 /// <summary> 112 /// Parse the core properties about the current package. 113 /// </summary> 114 void read_custom_properties(); 115 116 // SpreadsheetML-Specific Package Part Readers 117 118 /// <summary> 119 /// Parse the main XML document about the workbook and then all child relationships 120 /// of the workbook (e.g. worksheets). 
121 /// </summary> 122 void read_office_document(const std::string &content_type); 123 124 // Workbook Relationship Target Parts 125 126 /// <summary> 127 /// xl/calcChain.xml 128 /// </summary> 129 void read_calculation_chain(); 130 131 /// <summary> 132 /// 133 /// </summary> 134 void read_connections(); 135 136 /// <summary> 137 /// 138 /// </summary> 139 void read_custom_property(); 140 141 /// <summary> 142 /// 143 /// </summary> 144 void read_custom_xml_mappings(); 145 146 /// <summary> 147 /// 148 /// </summary> 149 void read_external_workbook_references(); 150 151 /// <summary> 152 /// 153 /// </summary> 154 void read_pivot_table(); 155 156 /// <summary> 157 /// xl/sharedStrings.xml 158 /// </summary> 159 void read_shared_string_table(); 160 161 /// <summary> 162 /// 163 /// </summary> 164 void read_shared_workbook_revision_headers(); 165 166 /// <summary> 167 /// 168 /// </summary> 169 void read_shared_workbook(); 170 171 /// <summary> 172 /// 173 /// </summary> 174 void read_shared_workbook_user_data(); 175 176 /// <summary> 177 /// xl/styles.xml 178 /// </summary> 179 void read_stylesheet(); 180 181 /// <summary> 182 /// xl/theme/theme1.xml 183 /// </summary> 184 void read_theme(); 185 186 /// <summary> 187 /// 188 /// </summary> 189 void read_volatile_dependencies(); 190 191 /// <summary> 192 /// xl/sheets/*.xml 193 /// </summary> 194 void read_chartsheet(const std::string &rel_id); 195 196 /// <summary> 197 /// xl/sheets/*.xml 198 /// </summary> 199 void read_dialogsheet(const std::string &rel_id); 200 201 /// <summary> 202 /// xl/sheets/*.xml 203 /// </summary> 204 void read_worksheet(const std::string &rel_id); 205 206 /// <summary> 207 /// xl/sheets/*.xml 208 /// </summary> 209 std::string read_worksheet_begin(const std::string &rel_id); 210 211 /// <summary> 212 /// xl/sheets/*.xml 213 /// </summary> 214 void read_worksheet_sheetdata(); 215 216 /// <summary> 217 /// xl/sheets/*.xml 218 /// </summary> 219 worksheet read_worksheet_end(const 
std::string &rel_id); 220 221 // Sheet Relationship Target Parts 222 223 /// <summary> 224 /// 225 /// </summary> 226 void read_comments(worksheet ws); 227 228 /// <summary> 229 /// 230 /// </summary> 231 void read_vml_drawings(worksheet ws); 232 233 /// <summary> 234 /// 235 /// </summary> 236 void read_drawings(worksheet ws, const path &part); 237 238 // Unknown Parts 239 240 /// <summary> 241 /// 242 /// </summary> 243 void read_unknown_parts(); 244 245 /// <summary> 246 /// 247 /// </summary> 248 void read_unknown_relationships(); 249 250 /// <summary> 251 /// 252 /// </summary> 253 void read_image(const path &part); 254 255 // Common Section Readers 256 257 /// <summary> 258 /// Read part from the archive and return a vector of relationships 259 /// based on the content of that part. 260 /// </summary> 261 std::vector<relationship> read_relationships(const path &part); 262 263 /// <summary> 264 /// Read a CT_Color from the document currently being parsed. 265 /// </summary> 266 color read_color(); 267 268 /// <summary> 269 /// Read a rich text CT_RElt from the document currently being parsed. 270 /// </summary> 271 rich_text read_rich_text(const xml::qname &parent); 272 273 /// <summary> 274 /// Returns true if the givent document type represents an XLSX file. 275 /// </summary> 276 bool document_type_is_xlsx(const std::string &document_content_type); 277 278 // SAX Parsing Helpers 279 280 /// <summary> 281 /// In mixed content XML elements, whitespace before and after is not ignored. 282 /// Additionally, if PCDATA spans the boundary of the XML read buffer, it will 283 /// be parsed as two separate strings instead of on longer string. This method 284 /// will read character data until non-character data is peek()ed from the parser 285 /// and returns the combined strings. This should be used when parsing mixed 286 /// content to ignore whitespace and whenever character data is expected between 287 /// tags. 
288 /// </summary> 289 std::string read_text(); 290 291 variant read_variant(); 292 293 /// <summary> 294 /// Read the part from the archive and parse it as XML. After this is called, 295 /// xlsx_consumer::parser() will return a reference to the parser that reads 296 /// this part. 297 /// </summary> 298 void read_part(const std::vector<relationship> &rel_chain); 299 300 /// <summary> 301 /// libstudxml will throw an exception if all attributes on an element are not 302 /// read with xml::parser::attribute(const std::string &). This should therefore 303 /// be called if every remaining attribute should be ignored on an element. 304 /// </summary> 305 void skip_attributes(); 306 307 /// <summary> 308 /// Skip attribute name if it exists on the currently parsed element in the XML 309 /// parser. 310 /// </summary> 311 void skip_attribute(const std::string &name); 312 313 /// <summary> 314 /// Skip attribute name if it exists on the currently parsed element in the XML 315 /// parser. 316 /// </summary> 317 void skip_attribute(const xml::qname &name); 318 319 /// <summary> 320 /// Call skip_attribute on every name in names. 321 /// </summary> 322 void skip_attributes(const std::vector<xml::qname> &names); 323 324 /// <summary> 325 /// Call skip_attribute on every name in names. 326 /// </summary> 327 void skip_attributes(const std::vector<std::string> &names); 328 329 /// <summary> 330 /// Read all content in name until the closing tag is reached. 331 /// The closing tag will not be handled after this is called. 332 /// </summary> 333 void skip_remaining_content(const xml::qname &name); 334 335 /// <summary> 336 /// Handles the next event in the XML parser and throws an exception 337 /// if it is not the start of an element. Additionally sets the content 338 /// type of the element to content. 
339 /// </summary> 340 xml::qname expect_start_element(xml::content content); 341 342 /// <summary> 343 /// Handles the next event in the XML parser and throws an exception 344 /// if the next element is not named name. Sets the content type of 345 /// the element to content. 346 /// </summary> 347 void expect_start_element(const xml::qname &name, xml::content content); 348 349 /// <summary> 350 /// Throws an exception if the next event in the XML parser is not 351 /// the end of element called name. 352 /// </summary> 353 void expect_end_element(const xml::qname &name); 354 355 /// <summary> 356 /// Returns true if the top of the parsing stack is called name and 357 /// the end of that element hasn't been reached in the XML document. 358 /// </summary> 359 bool in_element(const xml::qname &name); 360 361 /// <summary> 362 /// Throws an exception or skips remaining elements depending on 363 /// the value of THROW_ON_INVALID_XML. 364 /// </summary> 365 void unexpected_element(const xml::qname &name); 366 367 // Properties 368 369 /// <summary> 370 /// Convenience method to dereference the pointer to the current parser to avoid 371 /// having to use "parser_->" constantly. 372 /// </summary> 373 xml::parser &parser(); 374 375 /// <summary> 376 /// Convenience method to access the target workbook's manifest. 377 /// </summary> 378 class manifest &manifest(); 379 380 /// <summary> 381 /// The ZIP file containing the files that make up the OOXML package. 382 /// </summary> 383 std::unique_ptr<izstream> archive_; 384 385 /// <summary> 386 /// Map of sheet titles to relationship IDs. 387 /// </summary> 388 std::unordered_map<std::string, std::size_t> sheet_title_id_map_; 389 390 /// <summary> 391 /// Map of sheet titles to indices. Used to ensure sheets are maintained 392 /// in the correct order. 393 /// </summary> 394 std::unordered_map<std::string, std::size_t> sheet_title_index_map_; 395 396 /// <summary> 397 /// A reference to the workbook which is being read. 
398 /// </summary> 399 workbook &target_; 400 401 /// <summary> 402 /// This pointer is generally set by instantiating an xml::parser in a function 403 /// scope and then calling a read_*() method which uses xlsx_consumer::parser() 404 /// to access the object. 405 /// </summary> 406 xml::parser *parser_; 407 408 std::vector<xml::qname> stack_; 409 410 bool preserve_space_ = false; 411 412 bool streaming_ = false; 413 414 std::unique_ptr<detail::cell_impl> streaming_cell_; 415 416 detail::worksheet_impl *current_worksheet_; 417 number_serialiser converter_; 418 }; 419 420 } // namespace detail 421 } // namespace xlnt 422