1 /* This file is part of the wvWare 2 project 2 Copyright (C) 2001-2003 Werner Trobin <trobin@kde.org> 3 4 This library is free software; you can redistribute it and/or 5 modify it under the terms of the GNU Library General Public 6 License version 2 as published by the Free Software Foundation. 7 8 This library is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 Library General Public License for more details. 12 13 You should have received a copy of the GNU Library General Public License 14 along with this library; see the file COPYING.LIB. If not, write to 15 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 Boston, MA 02111-1307, USA. 17 */ 18 19 #ifndef PARSER9X_H 20 #define PARSER9X_H 21 22 #include "parser.h" 23 #include "word97_generated.h" 24 25 #include <string> 26 #include <list> 27 #include <stack> 28 29 namespace wvWare 30 { 31 32 // Word97 so far. Is that different in Word95? 33 const unsigned char CELL_MARK = 7; 34 const unsigned char ROW_MARK = 7; 35 const unsigned char TAB = 9; 36 const unsigned char HARD_LINE_BREAK = 11; 37 const unsigned char PAGE_BREAK = 12; 38 const unsigned char SECTION_MARK = 12; 39 const unsigned char PARAGRAPH_MARK = 13; 40 const unsigned char COLUMN_BREAK = 14; 41 const unsigned char FIELD_BEGIN_MARK = 19; 42 const unsigned char FIELD_SEPARATOR = 20; 43 const unsigned char FIELD_END_MARK = 21; 44 const unsigned char NON_BREAKING_HYPHEN = 30; 45 const unsigned char NON_REQUIRED_HYPHEN = 31; 46 const unsigned char SPACE = 32; 47 const unsigned char BREAKING_HYPHEN = 45; 48 const unsigned char NON_BREAKING_SPACE = 160; 49 const unsigned char FIELD_ESCAPE_CHAR = '\\'; 50 const unsigned char FORMULA_MARK = '\\'; 51 52 // Special chars (fSpec==1) 53 const unsigned char SPEC_CURRENT_PAGE_NUMBER = 0; 54 const unsigned char SPEC_PICTURE = 1; 55 const unsigned char SPEC_AUTONUM_FOOTNOTE_REF = 2; 56 const unsigned char SPEC_FOOTNOTE_SEPARATOR = 3; 57 const unsigned char SPEC_FOOTNOTE_CONTINUATION = 4; 58 const unsigned char SPEC_ANNOTATION_REF = 5; 59 const unsigned char SPEC_LINE_NUMBER = 6; 60 const unsigned char SPEC_HAND_ANNOTATION_PIC = 7; 61 const unsigned char SPEC_DRAWN_OBJECT = 8; 62 const unsigned char SPEC_ABBREV_DATE = 10; 63 const unsigned char SPEC_TIME_HMS = 11; 64 const unsigned char SPEC_CURRENT_SECTION_NUMBER = 12; 65 const unsigned char SPEC_ABBREV_WEEKDAY = 14; 66 const unsigned char SPEC_WEEKDAY = 15; 67 const unsigned char SPEC_DAY_SHORT = 16; 68 const unsigned char SPEC_CURRENT_HOUR = 22; 69 const unsigned char SPEC_CURRENT_HOUR_TWODIG = 23; 70 const unsigned char SPEC_CURRENT_MINUTE = 24; 71 const unsigned char SPEC_CURRENT_MINUTE_TWODIG = 25; 72 const unsigned char SPEC_CURRENT_SECONDS = 26; 73 const unsigned char SPEC_CURRENT_AMPM = 27; 74 const unsigned char SPEC_CURRENT_TIME_HMS = 28; 75 const unsigned char SPEC_DATE_M = 29; 76 const unsigned char SPEC_DATE_SHORT = 30; 77 const unsigned char SPEC_MONTH_SHORT = 33; 78 const unsigned char SPEC_YEAR_LONG = 34; 79 const unsigned char SPEC_YEAR_SHORT = 35; 80 const unsigned char SPEC_MONTH_ABBREV = 36; 81 const unsigned char SPEC_MONTH_LONG = 37; 82 const unsigned char SPEC_CURRENT_TIME_HM = 38; 83 const unsigned char SPEC_DATE_LONG = 39; 84 const unsigned char SPEC_MERGE_HELPER = 41; 85 86 87 class Properties97; 88 class ListInfoProvider; 89 class FontCollection; 90 class TextConverter; 91 class Fields; 92 class Headers; 93 class Footnotes97; 94 class Drawings; 95 template<class T> class PLCF; 96 97 // Helper structures for the Functor-based approach 98 struct HeaderData; 99 struct FootnoteData; 100 struct TableRowData; 101 struct PictureData; 102 103 /** 104 * This class should contain all the common functionality shared 105 * among the Word9[5|7] parsers. 106 */ 107 class Parser9x : public Parser 108 { 109 public: 110 Parser9x( OLEStorage* storage, OLEStreamReader* wordDocument, const Word97::FIB& fib ); 111 virtual ~Parser9x(); 112 113 /** 114 * The main parsing method 115 */ 116 virtual bool parse(); 117 118 virtual const Word97::FIB& fib() const; 119 virtual const Word97::DOP& dop() const; 120 121 /** 122 * Get the font family name structure for a given ftc. 123 */ 124 virtual const Word97::FFN& font( S16 ftc ) const; 125 126 /** 127 * Get the associated strings (author, title,...). 128 * Not cached. 129 */ 130 virtual AssociatedStrings associatedStrings(); 131 132 virtual const StyleSheet& styleSheet() const; 133 134 // This part of the public API is only visible to the Functor classes, 135 // as the "outside world" only sees the public API of Parser. The Functors 136 // allow to delay the parsing of certain text inside the file (e.g. headers) 137 // and trigger parsing at any point (as long as the parser exists). 138 // 139 // In case you want to add a new method here, please obey the following guidelines: 140 // - Executing the method mustn't change the state of the parser (i.e. save and 141 // restore the state!) 142 // - Be very careful, these calls can possibly be triggered at any time 143 void parseHeaders( const HeaderData& data ); 144 void parseFootnote( const FootnoteData& data ); 145 void parseTableRow( const TableRowData& data ); 146 void parsePicture( const PictureData& data ); 147 148 protected: 149 // First all variables which don't change their state during 150 // the parsing process. We don't have to save and restore those. 151 const Word97::FIB m_fib; 152 153 OLEStreamReader* m_table; // table stream ('WordDocument' for Word 6+95 and 154 // the real table stream for Word 97+) 155 OLEStreamReader* m_data; // data stream (if any, most of the time 0) 156 157 Properties97* m_properties; 158 Headers* m_headers; 159 160 // From here on we have all variables which change their state depending 161 // on the parsed content. These variables have to be saved and restored 162 // to make the parsing code reentrant. 163 164 private: 165 // Don't copy or assign us 166 Parser9x( const Parser9x& rhs ); 167 Parser9x& operator=( const Parser9x& rhs ); 168 169 // Uniquely represents a position inside a complex file. Used to map a CP to a Position 170 struct Position 171 { 172 // Start position PositionPosition173 Position( U32 p, U32 o ) : piece( p ), offset( o ) {} 174 // Constructs a Position from a CP 175 Position( U32 cp, const PLCF<Word97::PCD>* plcfpcd ); 176 177 U32 piece; // The piece number (0-based index) 178 U32 offset; // The CP offset within the piece 179 }; 180 181 // Represents a chunk of text. This is a part of a (or a whole) paragraph 182 // contained in one text piece. A paragraph consists of at least one Chunk. 183 // We don't store the paragraph/section mark, and in case only the paragraph 184 // mark sits in a different piece than the rest of the paragraph we just store 185 // an empty string for this chunk. 186 struct Chunk 187 { ChunkChunk188 Chunk( const UString& text, const Position& position, U32 startFC, bool isUnicode ) : 189 m_text( text ), m_position( position ), m_startFC( startFC ), m_isUnicode( isUnicode ) {} 190 191 UString m_text; 192 Position m_position; 193 U32 m_startFC; 194 bool m_isUnicode; 195 }; 196 // Represents a paragraph consisting of at least one Chunk. Right now it's only 197 // a typedef, maybe we need more than that later on 198 typedef std::list<Chunk> Paragraph; 199 200 // We have to keep track of the current parsing mode (e.g. are we skimming tables 201 // or are we parsing them?) 202 enum ParsingMode { Default, Table }; 203 204 // "Callbacks" for the 95/97 parsers 205 // ##### TODO 206 207 // Private helper methods 208 std::string tableStream() const; 209 void init(); 210 bool readPieceTable(); 211 void fakePieceTable(); 212 213 bool parseBody(); 214 215 // Expects m_remainingChars to be set correctly, changes the state of m_wordDocument,... 216 void parseHelper( Position startPos ); 217 template<typename String> void processPiece( String* string, U32 fc, U32 limit, const Position& position ); 218 // These helper methods are a cheap trick to "configure" parts of the template code by 219 // plain old overloading. It's just a matter of compressed vs. real unicode (1 vs. 2 bytes) 220 UString processPieceStringHelper( XCHAR* string, unsigned int start, unsigned int index ) const; 221 UString processPieceStringHelper( U8* string, unsigned int start, unsigned int index ) const; 222 // Processes the current contents of the Paragraph structure and clears it when it's done 223 void processParagraph( U32 fc ); 224 void processChunk( const Chunk& chunk, SharedPtr<const Word97::CHP> chp, 225 U32 length, U32 index, U32 currentStart ); 226 void processRun( const Chunk& chunk, SharedPtr<const Word97::CHP> chp, 227 U32 length, U32 index, U32 currentStart ); 228 229 void processSpecialCharacter( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp ); 230 void processFootnote( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp ); 231 232 // Helper methods to gather and emit the information needed for the functors 233 void emitHeaderData( SharedPtr<const Word97::SEP> sep ); 234 void emitPictureData( SharedPtr<const Word97::CHP> chp ); 235 void emitDrawnObject( SharedPtr<const Word97::CHP> chp ); 236 237 void parseHeader( const HeaderData& data, unsigned char mask ); 238 239 void parsePictureEscher( const PictureData& data, OLEStreamReader* stream, 240 int totalPicfSize, int picfStartPos ); 241 void parsePictureExternalHelper( const PictureData& data, OLEStreamReader* stream ); 242 void parsePictureBitmapHelper( const PictureData& data, OLEStreamReader* stream ); 243 void parsePictureWmfHelper( const PictureData& data, OLEStreamReader* stream ); 244 245 void saveState( U32 newRemainingChars, SubDocument newSubDocument, ParsingMode newParsingMode = Default ); 246 void restoreState(); 247 248 // Maps the global CP (as found in the piece table) to the local CP 249 // coordinate space of the corresponding sub document 250 U32 toLocalCP( U32 globalCP ) const; 251 // Calculates the real FC and tells us whether it was unicode or not 252 inline void realFC( U32& fc, bool& unicode ) const; 253 // Helper method to use std::accumulate in the table handling code 254 static int accumulativeLength( int len, const Chunk& chunk ); 255 256 // Private variables, no access needed in 95/97 code 257 // First all variables which don't change their state during 258 // the parsing process. We don't have to save and restore those. 259 ListInfoProvider* m_lists; 260 TextConverter* m_textconverter; 261 Fields* m_fields; 262 Footnotes97* m_footnotes; 263 FontCollection* m_fonts; 264 Drawings* m_drawings; 265 266 PLCF<Word97::PCD>* m_plcfpcd; // piece table 267 268 // From here on we have all variables which change their state depending 269 // on the parsed content. These variables have to be saved and restored 270 // to make the parsing code reentrant. 271 Position* m_tableRowStart; // If != 0 this represents the start of a table row 272 U32 m_tableRowLength; // Lenght of the table row (in characters). Only valid 273 bool m_cellMarkFound; // if m_tableRowStart != 0 274 int m_remainingCells; // The number of remaining cells for the processed row 275 276 Paragraph* m_currentParagraph; 277 278 U32 m_remainingChars; 279 U32 m_sectionNumber; 280 281 // Keeps track of the current sub document 282 SubDocument m_subDocument; 283 284 // We have to behave differently, depending whether we are parsing 285 // a table or the "main" text, as we skim the table first 286 ParsingMode m_parsingMode; 287 288 // Needed to have reentrant parsing methods (to make the functor approach work) 289 struct ParsingState 290 { ParsingStateParsingState291 ParsingState( Position* tableRowS, U32 tableRowL, bool cMarkFound, 292 int remCells, Paragraph* parag, U32 remChars, U32 sectionNum, 293 SubDocument subD, ParsingMode mode ) : 294 tableRowStart( tableRowS ), tableRowLength( tableRowL ), cellMarkFound( cMarkFound), 295 remainingCells( remCells ), paragraph( parag ), remainingChars( remChars ), 296 sectionNumber( sectionNum ), subDocument( subD ), parsingMode( mode ) {} 297 298 Position* tableRowStart; 299 U32 tableRowLength; 300 bool cellMarkFound; 301 int remainingCells; 302 Paragraph* paragraph; 303 U32 remainingChars; 304 U32 sectionNumber; // not strictly necessary, but doesn't hurt 305 SubDocument subDocument; 306 ParsingMode parsingMode; 307 }; 308 309 std::stack<ParsingState> oldParsingStates; 310 }; 311 realFC(U32 & fc,bool & unicode)312 inline void Parser9x::realFC( U32& fc, bool& unicode ) const 313 { 314 if ( fc & 0x40000000 ) { 315 fc = ( fc & 0xbfffffff ) >> 1; 316 unicode = false; 317 } 318 else 319 unicode = m_fib.nFib >= Word8nFib; 320 } 321 322 } // namespace wvWare 323 324 #endif // PARSER9X_H 325