/* This file is part of the wvWare 2 project
   Copyright (C) 2001-2003 Werner Trobin <trobin@kde.org>

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License version 2 as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public License
   along with this library; see the file COPYING.LIB.  If not, write to
   the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/
18 
19 #ifndef PARSER9X_H
20 #define PARSER9X_H
21 
22 #include "parser.h"
23 #include "word97_generated.h"
24 
25 #include <string>
26 #include <list>
27 #include <stack>
28 
29 namespace wvWare
30 {
31 
32     // Word97 so far. Is that different in Word95?
33     const unsigned char CELL_MARK = 7;
34     const unsigned char ROW_MARK = 7;
35     const unsigned char TAB = 9;
36     const unsigned char HARD_LINE_BREAK = 11;
37     const unsigned char PAGE_BREAK = 12;
38     const unsigned char SECTION_MARK = 12;
39     const unsigned char PARAGRAPH_MARK = 13;
40     const unsigned char COLUMN_BREAK = 14;
41     const unsigned char FIELD_BEGIN_MARK = 19;
42     const unsigned char FIELD_SEPARATOR = 20;
43     const unsigned char FIELD_END_MARK = 21;
44     const unsigned char NON_BREAKING_HYPHEN = 30;
45     const unsigned char NON_REQUIRED_HYPHEN = 31;
46     const unsigned char SPACE = 32;
47     const unsigned char BREAKING_HYPHEN = 45;
48     const unsigned char NON_BREAKING_SPACE = 160;
49     const unsigned char FIELD_ESCAPE_CHAR = '\\';
50     const unsigned char FORMULA_MARK = '\\';
51 
52     // Special chars (fSpec==1)
53     const unsigned char SPEC_CURRENT_PAGE_NUMBER = 0;
54     const unsigned char SPEC_PICTURE = 1;
55     const unsigned char SPEC_AUTONUM_FOOTNOTE_REF = 2;
56     const unsigned char SPEC_FOOTNOTE_SEPARATOR = 3;
57     const unsigned char SPEC_FOOTNOTE_CONTINUATION = 4;
58     const unsigned char SPEC_ANNOTATION_REF = 5;
59     const unsigned char SPEC_LINE_NUMBER = 6;
60     const unsigned char SPEC_HAND_ANNOTATION_PIC = 7;
61     const unsigned char SPEC_DRAWN_OBJECT = 8;
62     const unsigned char SPEC_ABBREV_DATE = 10;
63     const unsigned char SPEC_TIME_HMS = 11;
64     const unsigned char SPEC_CURRENT_SECTION_NUMBER = 12;
65     const unsigned char SPEC_ABBREV_WEEKDAY = 14;
66     const unsigned char SPEC_WEEKDAY = 15;
67     const unsigned char SPEC_DAY_SHORT = 16;
68     const unsigned char SPEC_CURRENT_HOUR = 22;
69     const unsigned char SPEC_CURRENT_HOUR_TWODIG = 23;
70     const unsigned char SPEC_CURRENT_MINUTE = 24;
71     const unsigned char SPEC_CURRENT_MINUTE_TWODIG = 25;
72     const unsigned char SPEC_CURRENT_SECONDS = 26;
73     const unsigned char SPEC_CURRENT_AMPM = 27;
74     const unsigned char SPEC_CURRENT_TIME_HMS = 28;
75     const unsigned char SPEC_DATE_M = 29;
76     const unsigned char SPEC_DATE_SHORT = 30;
77     const unsigned char SPEC_MONTH_SHORT = 33;
78     const unsigned char SPEC_YEAR_LONG = 34;
79     const unsigned char SPEC_YEAR_SHORT = 35;
80     const unsigned char SPEC_MONTH_ABBREV = 36;
81     const unsigned char SPEC_MONTH_LONG = 37;
82     const unsigned char SPEC_CURRENT_TIME_HM = 38;
83     const unsigned char SPEC_DATE_LONG = 39;
84     const unsigned char SPEC_MERGE_HELPER = 41;
85 
86 
87     class Properties97;
88     class ListInfoProvider;
89     class FontCollection;
90     class TextConverter;
91     class Fields;
92     class Headers;
93     class Footnotes97;
94     class Drawings;
95     template<class T> class PLCF;
96 
97     // Helper structures for the Functor-based approach
98     struct HeaderData;
99     struct FootnoteData;
100     struct TableRowData;
101     struct PictureData;
102 
103     /**
104      * This class should contain all the common functionality shared
105      * among the Word9[5|7] parsers.
106      */
107     class Parser9x : public Parser
108     {
109     public:
110         Parser9x( OLEStorage* storage, OLEStreamReader* wordDocument, const Word97::FIB& fib );
111         virtual ~Parser9x();
112 
113         /**
114          * The main parsing method
115          */
116         virtual bool parse();
117 
118         virtual const Word97::FIB& fib() const;
119         virtual const Word97::DOP& dop() const;
120 
121         /**
122          * Get the font family name structure for a given ftc.
123          */
124         virtual const Word97::FFN& font( S16 ftc ) const;
125 
126         /**
127          * Get the associated strings (author, title,...).
128          * Not cached.
129          */
130         virtual AssociatedStrings associatedStrings();
131 
132         virtual const StyleSheet& styleSheet() const;
133 
134         // This part of the public API is only visible to the Functor classes,
135         // as the "outside world" only sees the public API of Parser. The Functors
136         // allow to delay the parsing of certain text inside the file (e.g. headers)
137         // and trigger parsing at any point (as long as the parser exists).
138         //
139         // In case you want to add a new method here, please obey the following guidelines:
140         //    - Executing the method mustn't change the state of the parser (i.e. save and
141         //      restore the state!)
142         //    - Be very careful, these calls can possibly be triggered at any time
143         void parseHeaders( const HeaderData& data );
144         void parseFootnote( const FootnoteData& data );
145         void parseTableRow( const TableRowData& data );
146         void parsePicture( const PictureData& data );
147 
148     protected:
149         // First all variables which don't change their state during
150         // the parsing process. We don't have to save and restore those.
151         const Word97::FIB m_fib;
152 
153         OLEStreamReader* m_table; // table stream ('WordDocument' for Word 6+95 and
154                                   // the real table stream for Word 97+)
155         OLEStreamReader* m_data;  // data stream (if any, most of the time 0)
156 
157         Properties97* m_properties;
158         Headers* m_headers;
159 
160         // From here on we have all variables which change their state depending
161         // on the parsed content. These variables have to be saved and restored
162         // to make the parsing code reentrant.
163 
164     private:
165         // Don't copy or assign us
166         Parser9x( const Parser9x& rhs );
167         Parser9x& operator=( const Parser9x& rhs );
168 
169         // Uniquely represents a position inside a complex file. Used to map a CP to a Position
170         struct Position
171         {
172             // Start position
PositionPosition173             Position( U32 p, U32 o ) : piece( p ), offset( o ) {}
174             // Constructs a Position from a CP
175             Position( U32 cp, const PLCF<Word97::PCD>* plcfpcd );
176 
177             U32 piece;    // The piece number (0-based index)
178             U32 offset;   // The CP offset within the piece
179         };
180 
181         // Represents a chunk of text. This is a part of a (or a whole) paragraph
182         // contained in one text piece. A paragraph consists of at least one Chunk.
183         // We don't store the paragraph/section mark, and in case only the paragraph
184         // mark sits in a different piece than the rest of the paragraph we just store
185         // an empty string for this chunk.
186         struct Chunk
187         {
ChunkChunk188             Chunk( const UString& text, const Position& position, U32 startFC, bool isUnicode ) :
189                 m_text( text ), m_position( position ), m_startFC( startFC ), m_isUnicode( isUnicode ) {}
190 
191             UString m_text;
192             Position m_position;
193             U32 m_startFC;
194             bool m_isUnicode;
195         };
196         // Represents a paragraph consisting of at least one Chunk. Right now it's only
197         // a typedef, maybe we need more than that later on
198         typedef std::list<Chunk> Paragraph;
199 
200         // We have to keep track of the current parsing mode (e.g. are we skimming tables
201         // or are we parsing them?)
202         enum ParsingMode { Default, Table };
203 
204         // "Callbacks" for the 95/97 parsers
205         // ##### TODO
206 
207         // Private helper methods
208         std::string tableStream() const;
209         void init();
210         bool readPieceTable();
211         void fakePieceTable();
212 
213         bool parseBody();
214 
215         // Expects m_remainingChars to be set correctly, changes the state of m_wordDocument,...
216         void parseHelper( Position startPos );
217         template<typename String> void processPiece( String* string, U32 fc, U32 limit, const Position& position );
218         // These helper methods are a cheap trick to "configure" parts of the template code by
219         // plain old overloading. It's just a matter of compressed vs. real unicode (1 vs. 2 bytes)
220         UString processPieceStringHelper( XCHAR* string, unsigned int start, unsigned int index ) const;
221         UString processPieceStringHelper( U8* string, unsigned int start, unsigned int index ) const;
222         // Processes the current contents of the Paragraph structure and clears it when it's done
223         void processParagraph( U32 fc );
224         void processChunk( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
225                            U32 length, U32 index, U32 currentStart );
226         void processRun( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
227                          U32 length, U32 index, U32 currentStart );
228 
229         void processSpecialCharacter( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp );
230         void processFootnote( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp );
231 
232         // Helper methods to gather and emit the information needed for the functors
233         void emitHeaderData( SharedPtr<const Word97::SEP> sep );
234         void emitPictureData( SharedPtr<const Word97::CHP> chp );
235         void emitDrawnObject( SharedPtr<const Word97::CHP> chp );
236 
237         void parseHeader( const HeaderData& data, unsigned char mask );
238 
239         void parsePictureEscher( const PictureData& data, OLEStreamReader* stream,
240                 int totalPicfSize, int picfStartPos );
241         void parsePictureExternalHelper( const PictureData& data, OLEStreamReader* stream );
242         void parsePictureBitmapHelper( const PictureData& data, OLEStreamReader* stream );
243         void parsePictureWmfHelper( const PictureData& data, OLEStreamReader* stream );
244 
245         void saveState( U32 newRemainingChars, SubDocument newSubDocument, ParsingMode newParsingMode = Default );
246         void restoreState();
247 
248         // Maps the global CP (as found in the piece table) to the local CP
249         // coordinate space of the corresponding sub document
250         U32 toLocalCP( U32 globalCP ) const;
251         // Calculates the real FC and tells us whether it was unicode or not
252         inline void realFC( U32& fc, bool& unicode ) const;
253         // Helper method to use std::accumulate in the table handling code
254         static int accumulativeLength( int len, const Chunk& chunk );
255 
256         // Private variables, no access needed in 95/97 code
257         // First all variables which don't change their state during
258         // the parsing process. We don't have to save and restore those.
259         ListInfoProvider* m_lists;
260         TextConverter* m_textconverter;
261         Fields* m_fields;
262         Footnotes97* m_footnotes;
263         FontCollection* m_fonts;
264         Drawings* m_drawings;
265 
266         PLCF<Word97::PCD>* m_plcfpcd;     // piece table
267 
268         // From here on we have all variables which change their state depending
269         // on the parsed content. These variables have to be saved and restored
270         // to make the parsing code reentrant.
271         Position* m_tableRowStart;      // If != 0 this represents the start of a table row
272         U32 m_tableRowLength;           // Lenght of the table row (in characters). Only valid
273         bool m_cellMarkFound;           // if m_tableRowStart != 0
274         int m_remainingCells;           // The number of remaining cells for the processed row
275 
276         Paragraph* m_currentParagraph;
277 
278         U32 m_remainingChars;
279         U32 m_sectionNumber;
280 
281         // Keeps track of the current sub document
282         SubDocument m_subDocument;
283 
284         // We have to behave differently, depending whether we are parsing
285         // a table or the "main" text, as we skim the table first
286         ParsingMode m_parsingMode;
287 
288         // Needed to have reentrant parsing methods (to make the functor approach work)
289         struct ParsingState
290         {
ParsingStateParsingState291             ParsingState( Position* tableRowS, U32 tableRowL, bool cMarkFound,
292                           int remCells, Paragraph* parag, U32 remChars, U32 sectionNum,
293                           SubDocument subD, ParsingMode mode ) :
294                 tableRowStart( tableRowS ), tableRowLength( tableRowL ), cellMarkFound( cMarkFound),
295                 remainingCells( remCells ), paragraph( parag ), remainingChars( remChars ),
296                 sectionNumber( sectionNum ), subDocument( subD ), parsingMode( mode ) {}
297 
298             Position* tableRowStart;
299             U32 tableRowLength;
300             bool cellMarkFound;
301             int remainingCells;
302             Paragraph* paragraph;
303             U32 remainingChars;
304             U32 sectionNumber;   // not strictly necessary, but doesn't hurt
305             SubDocument subDocument;
306             ParsingMode parsingMode;
307         };
308 
309         std::stack<ParsingState> oldParsingStates;
310     };
311 
realFC(U32 & fc,bool & unicode)312     inline void Parser9x::realFC( U32& fc, bool& unicode ) const
313     {
314         if ( fc & 0x40000000 ) {
315             fc = ( fc & 0xbfffffff ) >> 1;
316             unicode = false;
317         }
318         else
319             unicode = m_fib.nFib >= Word8nFib;
320     }
321 
322 } // namespace wvWare
323 
324 #endif // PARSER9X_H
325