1 // Copyright (c) 2014-2020 Thomas Fussell
2 // Copyright (c) 2010-2015 openpyxl
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE
21 //
22 // @license: http://www.opensource.org/licenses/mit-license.php
23 // @author: see AUTHORS file
24 
25 #pragma once
26 
27 #include <cstdint>
28 #include <functional>
29 #include <iostream>
30 #include <memory>
31 #include <string>
32 #include <unordered_map>
33 #include <vector>
34 
35 #include <detail/external/include_libstudxml.hpp>
36 #include <detail/serialization/zstream.hpp>
37 #include <xlnt/utils/numeric.hpp>
38 
39 namespace xlnt {
40 
41 class cell;
42 class color;
43 class rich_text;
44 class manifest;
45 template<typename T>
46 class optional;
47 class path;
48 class relationship;
49 class streaming_workbook_reader;
50 class variant;
51 class workbook;
52 class worksheet;
53 
54 namespace detail {
55 
56 class izstream;
57 struct cell_impl;
58 struct worksheet_impl;
59 
60 /// <summary>
61 /// Handles writing a workbook into an XLSX file.
62 /// </summary>
63 class xlsx_consumer
64 {
65 public:
66 	xlsx_consumer(workbook &destination);
67 
68 	~xlsx_consumer();
69 
70 	void read(std::istream &source);
71 
72 	void read(std::istream &source, const std::string &password);
73 
74 private:
75     friend class xlnt::streaming_workbook_reader;
76 
77     void open(std::istream &source);
78 
79     bool has_cell();
80 
81     /// <summary>
82     /// Reads the next cell in the current worksheet and optionally returns it if
83     /// the last cell in the sheet has not yet been read. An exception will be thrown
84     /// if this is not open as a streaming consumer.
85     /// </summary>
86     cell read_cell();
87 
88 	/// <summary>
89 	/// Read all the files needed from the XLSX archive and initialize all of
90 	/// the data in the workbook to match.
91 	/// </summary>
92 	void populate_workbook(bool streaming);
93 
94     /// <summary>
95     ///
96     /// </summary>
97     void read_content_types();
98 
99     // Metadata Property Readers
100 
101 	/// <summary>
102 	/// Parse the core properties about the current package.
103 	/// </summary>
104 	void read_core_properties();
105 
106     /// <summary>
107     /// Parse the core properties about the current package.
108     /// </summary>
109     void read_extended_properties();
110 
111     /// <summary>
112     /// Parse the core properties about the current package.
113     /// </summary>
114     void read_custom_properties();
115 
116 	// SpreadsheetML-Specific Package Part Readers
117 
118 	/// <summary>
119 	/// Parse the main XML document about the workbook and then all child relationships
120 	/// of the workbook (e.g. worksheets).
121 	/// </summary>
122 	void read_office_document(const std::string &content_type);
123 
124 	// Workbook Relationship Target Parts
125 
126 	/// <summary>
127 	/// xl/calcChain.xml
128 	/// </summary>
129 	void read_calculation_chain();
130 
131 	/// <summary>
132 	///
133 	/// </summary>
134 	void read_connections();
135 
136 	/// <summary>
137 	///
138 	/// </summary>
139 	void read_custom_property();
140 
141 	/// <summary>
142 	///
143 	/// </summary>
144 	void read_custom_xml_mappings();
145 
146 	/// <summary>
147 	///
148 	/// </summary>
149 	void read_external_workbook_references();
150 
151 	/// <summary>
152 	///
153 	/// </summary>
154 	void read_pivot_table();
155 
156 	/// <summary>
157 	/// xl/sharedStrings.xml
158 	/// </summary>
159 	void read_shared_string_table();
160 
161 	/// <summary>
162 	///
163 	/// </summary>
164 	void read_shared_workbook_revision_headers();
165 
166 	/// <summary>
167 	///
168 	/// </summary>
169 	void read_shared_workbook();
170 
171 	/// <summary>
172 	///
173 	/// </summary>
174 	void read_shared_workbook_user_data();
175 
176 	/// <summary>
177 	/// xl/styles.xml
178 	/// </summary>
179 	void read_stylesheet();
180 
181 	/// <summary>
182 	/// xl/theme/theme1.xml
183 	/// </summary>
184 	void read_theme();
185 
186 	/// <summary>
187 	///
188 	/// </summary>
189 	void read_volatile_dependencies();
190 
191 	/// <summary>
192 	/// xl/sheets/*.xml
193 	/// </summary>
194 	void read_chartsheet(const std::string &rel_id);
195 
196 	/// <summary>
197 	/// xl/sheets/*.xml
198 	/// </summary>
199 	void read_dialogsheet(const std::string &rel_id);
200 
201 	/// <summary>
202 	/// xl/sheets/*.xml
203 	/// </summary>
204 	void read_worksheet(const std::string &rel_id);
205 
206     /// <summary>
207     /// xl/sheets/*.xml
208     /// </summary>
209     std::string read_worksheet_begin(const std::string &rel_id);
210 
211     /// <summary>
212     /// xl/sheets/*.xml
213     /// </summary>
214     void read_worksheet_sheetdata();
215 
216     /// <summary>
217     /// xl/sheets/*.xml
218     /// </summary>
219     worksheet read_worksheet_end(const std::string &rel_id);
220 
221 	// Sheet Relationship Target Parts
222 
223 	/// <summary>
224 	///
225 	/// </summary>
226 	void read_comments(worksheet ws);
227 
228 	/// <summary>
229 	///
230 	/// </summary>
231 	void read_vml_drawings(worksheet ws);
232 
233 	/// <summary>
234 	///
235 	/// </summary>
236 	void read_drawings(worksheet ws, const path &part);
237 
238 	// Unknown Parts
239 
240 	/// <summary>
241 	///
242 	/// </summary>
243 	void read_unknown_parts();
244 
245 	/// <summary>
246 	///
247 	/// </summary>
248 	void read_unknown_relationships();
249 
250 	/// <summary>
251 	///
252 	/// </summary>
253 	void read_image(const path &part);
254 
255     // Common Section Readers
256 
257     /// <summary>
258     /// Read part from the archive and return a vector of relationships
259     /// based on the content of that part.
260     /// </summary>
261     std::vector<relationship> read_relationships(const path &part);
262 
263     /// <summary>
264     /// Read a CT_Color from the document currently being parsed.
265     /// </summary>
266     color read_color();
267 
268     /// <summary>
269     /// Read a rich text CT_RElt from the document currently being parsed.
270     /// </summary>
271     rich_text read_rich_text(const xml::qname &parent);
272 
273     /// <summary>
274     /// Returns true if the givent document type represents an XLSX file.
275     /// </summary>
276     bool document_type_is_xlsx(const std::string &document_content_type);
277 
278     // SAX Parsing Helpers
279 
280     /// <summary>
281     /// In mixed content XML elements, whitespace before and after is not ignored.
282     /// Additionally, if PCDATA spans the boundary of the XML read buffer, it will
283     /// be parsed as two separate strings instead of on longer string. This method
284     /// will read character data until non-character data is peek()ed from the parser
285     /// and returns the combined strings. This should be used when parsing mixed
286     /// content to ignore whitespace and whenever character data is expected between
287     /// tags.
288     /// </summary>
289     std::string read_text();
290 
291     variant read_variant();
292 
293     /// <summary>
294     /// Read the part from the archive and parse it as XML. After this is called,
295     /// xlsx_consumer::parser() will return a reference to the parser that reads
296     /// this part.
297     /// </summary>
298     void read_part(const std::vector<relationship> &rel_chain);
299 
300     /// <summary>
301     /// libstudxml will throw an exception if all attributes on an element are not
302     /// read with xml::parser::attribute(const std::string &). This should therefore
303     /// be called if every remaining attribute should be ignored on an element.
304     /// </summary>
305     void skip_attributes();
306 
307     /// <summary>
308     /// Skip attribute name if it exists on the currently parsed element in the XML
309     /// parser.
310     /// </summary>
311     void skip_attribute(const std::string &name);
312 
313     /// <summary>
314     /// Skip attribute name if it exists on the currently parsed element in the XML
315     /// parser.
316     /// </summary>
317     void skip_attribute(const xml::qname &name);
318 
319     /// <summary>
320     /// Call skip_attribute on every name in names.
321     /// </summary>
322     void skip_attributes(const std::vector<xml::qname> &names);
323 
324     /// <summary>
325     /// Call skip_attribute on every name in names.
326     /// </summary>
327     void skip_attributes(const std::vector<std::string> &names);
328 
329     /// <summary>
330     /// Read all content in name until the closing tag is reached.
331     /// The closing tag will not be handled after this is called.
332     /// </summary>
333     void skip_remaining_content(const xml::qname &name);
334 
335     /// <summary>
336     /// Handles the next event in the XML parser and throws an exception
337     /// if it is not the start of an element. Additionally sets the content
338     /// type of the element to content.
339     /// </summary>
340     xml::qname expect_start_element(xml::content content);
341 
342     /// <summary>
343     /// Handles the next event in the XML parser and throws an exception
344     /// if the next element is not named name. Sets the content type of
345     /// the element to content.
346     /// </summary>
347     void expect_start_element(const xml::qname &name, xml::content content);
348 
349     /// <summary>
350     /// Throws an exception if the next event in the XML parser is not
351     /// the end of element called name.
352     /// </summary>
353     void expect_end_element(const xml::qname &name);
354 
355     /// <summary>
356     /// Returns true if the top of the parsing stack is called name and
357     /// the end of that element hasn't been reached in the XML document.
358     /// </summary>
359     bool in_element(const xml::qname &name);
360 
361     /// <summary>
362     /// Throws an exception or skips remaining elements depending on
363     /// the value of THROW_ON_INVALID_XML.
364     /// </summary>
365     void unexpected_element(const xml::qname &name);
366 
367     // Properties
368 
369 	/// <summary>
370 	/// Convenience method to dereference the pointer to the current parser to avoid
371 	/// having to use "parser_->" constantly.
372 	/// </summary>
373 	xml::parser &parser();
374 
375     /// <summary>
376     /// Convenience method to access the target workbook's manifest.
377     /// </summary>
378     class manifest &manifest();
379 
380 	/// <summary>
381 	/// The ZIP file containing the files that make up the OOXML package.
382 	/// </summary>
383 	std::unique_ptr<izstream> archive_;
384 
385 	/// <summary>
386 	/// Map of sheet titles to relationship IDs.
387 	/// </summary>
388 	std::unordered_map<std::string, std::size_t> sheet_title_id_map_;
389 
390 	/// <summary>
391 	/// Map of sheet titles to indices. Used to ensure sheets are maintained
392 	/// in the correct order.
393 	/// </summary>
394 	std::unordered_map<std::string, std::size_t> sheet_title_index_map_;
395 
396 	/// <summary>
397 	/// A reference to the workbook which is being read.
398 	/// </summary>
399 	workbook &target_;
400 
401 	/// <summary>
402 	/// This pointer is generally set by instantiating an xml::parser in a function
403 	/// scope and then calling a read_*() method which uses xlsx_consumer::parser()
404 	/// to access the object.
405 	/// </summary>
406 	xml::parser *parser_;
407 
408     std::vector<xml::qname> stack_;
409 
410     bool preserve_space_ = false;
411 
412     bool streaming_ = false;
413 
414     std::unique_ptr<detail::cell_impl> streaming_cell_;
415 
416     detail::worksheet_impl *current_worksheet_;
417     number_serialiser converter_;
418 };
419 
420 } // namespace detail
421 } // namespace xlnt
422