1 /* This file is part of the wvWare 2 project
2    Copyright (C) 2001-2003 Werner Trobin <trobin@kde.org>
3    Copyright (C) 2010, 2011 Matus Uzak <matus.uzak@ixonos.com>
4 
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the Library GNU General Public
7    version 2 of the License, or (at your option) version 3 or,
8    at the discretion of KDE e.V (which shall act as a proxy as in
9    section 14 of the GPLv3), any later version..
10 
11    This library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15 
16    You should have received a copy of the GNU Library General Public License
17    along with this library; see the file COPYING.LIB.  If not, write to
18    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19    Boston, MA 02110-1301, USA.
20 */
21 
22 #include "parser9x.h"
23 #include "properties97.h"
24 #include "styles.h"
25 #include "word97_helper.h"
26 #include "lists.h"
27 #include "handlers.h"
28 #include "footnotes97.h"
29 #include "annotations.h"
30 #include "bookmark.h"
31 #include "headers.h"
32 #include "fonts.h"
33 #include "textconverter.h"
34 #include "olestream.h"
35 #include "fields.h"
36 #include "graphics.h"
37 #include "associatedstrings.h"
38 #include "paragraphproperties.h"
39 #include "functor.h"
40 #include "functordata.h"
41 #include "word95_generated.h"
42 #include "convert.h"
43 #include "wvlog.h"
44 
45 #include <numeric>
46 #include <string.h>
47 
48 using namespace wvWare;
49 
Position(U32 cp,const PLCF<Word97::PCD> * plcfpcd)50 Parser9x::Position::Position( U32 cp, const PLCF<Word97::PCD>* plcfpcd ) :
51         piece( 0 ), offset( cp )
52 {
53     PLCFIterator<Word97::PCD> it( *plcfpcd );
54     for ( ; it.current(); ++it, ++piece ) {
55         if ( it.currentLim() > cp && it.currentStart() <= cp )
56             break;
57         offset -= it.currentRun();
58     }
59 }
60 
61 
Parser9x(OLEStorage * storage,OLEStreamReader * wordDocument,const Word97::FIB & fib)62 Parser9x::Parser9x( OLEStorage* storage, OLEStreamReader* wordDocument, const Word97::FIB& fib ) :
63         Parser( storage, wordDocument ), m_fib( fib ), m_table( 0 ), m_data( 0 ),
64         m_properties( 0 ), m_headers( 0 ), m_lists( 0 ), m_textconverter( 0 ), m_fields( 0 ),
65         m_footnotes( 0 ), m_annotations( 0 ), m_fonts( 0 ), m_drawings( 0 ), m_bookmarks(0),
66         m_plcfpcd( 0 ), m_tableRowStart( 0 ), m_tableRowLength( 0 ), m_cellMarkFound( false ),
67         m_remainingCells( 0 ), m_table_skimming( 0 ),
68         m_currentParagraph( new Paragraph ), m_remainingChars( 0 ),
69         m_sectionNumber( 0 ), m_subDocument( None ), m_parsingMode( Default )
70 {
71     if ( !isOk() )
72         return;
73 
74     m_table = storage->createStreamReader( tableStream() );
75     if ( !m_table || !m_table->isValid() ) {
76         wvlog << "Error: Couldn't open the table stream (i.e. [0|1]Table or WordDocument)" << endl;
77         m_okay = false;
78         return;
79     }
80 
81     m_data = storage->createStreamReader( "Data" );
82     if ( !m_data || !m_data->isValid() ) {
83         wvlog << "Information: Couldn't open the Data stream, no big deal" << endl;
84         delete m_data;
85         m_data = 0;
86     }
87 
88     //validate FIB, keep going even if it's not perfect
89     m_fib.valid();
90 
91 #ifdef WV2_DUMP_FIB
92     wvlog << "Dumping some parts of the FIB: " << endl;
93     wvlog << "   wIdent=" << m_fib.wIdent << endl;
94     wvlog << "   nFib=0x" << hex << m_fib.nFib << dec << endl;
95     wvlog << "   nFibBack=" << m_fib.nFibBack << endl;
96     wvlog << "   lid=0x" << hex << m_fib.lid << dec << endl;
97     wvlog << "   lidFE=0x" << hex << m_fib.lidFE << dec << endl;
98     wvlog << "   fEncrypted=" << m_fib.fEncrypted << endl;
99     wvlog << "   chs=" << m_fib.chs << endl;
100     wvlog << "   fcMin=" << m_fib.fcMin << endl;
101     wvlog << "   fcMac=" << m_fib.fcMac << endl;
102     wvlog << "   ccpText=" << m_fib.ccpText << endl;
103     wvlog << "   ccpFtn=" << m_fib.ccpFtn << endl;
104     wvlog << "   ccpHdd=" << m_fib.ccpHdd << endl;
105     wvlog << "   ccpMcr=" << m_fib.ccpMcr << endl;
106     wvlog << "   ccpAtn=" << m_fib.ccpAtn << endl;
107     wvlog << "   ccpEdn=" << m_fib.ccpEdn << endl;
108     wvlog << "   ccpTxbx=" << m_fib.ccpTxbx << endl;
109     wvlog << "   ccpHdrTxbx=" << m_fib.ccpHdrTxbx << endl;
110     wvlog << "   pnFbpChpFirst=" << m_fib.pnFbpChpFirst << endl;
111     wvlog << "   pnChpFirst=" << m_fib.pnChpFirst << endl;
112     wvlog << "   cpnBteChp=" << m_fib.cpnBteChp << endl;
113     wvlog << "   pnFbpPapFirst=" << m_fib.pnFbpPapFirst << endl;
114     wvlog << "   pnPapFirst=" << m_fib.pnPapFirst << endl;
115     wvlog << "   cpnBtePap=" << m_fib.cpnBtePap << endl;
116     wvlog << "   fcPlcfandRef=" << m_fib.fcPlcfandRef << endl;
117     wvlog << "   lcbPlcfandRef=" << m_fib.lcbPlcfandRef << endl;
118     wvlog << "   cswNew=" << hex << m_fib.cswNew << dec << endl;
119 #endif
120     // Initialize all the cached data structures like stylesheets, fonts,
121     // textconverter,...
122     init();
123 }
124 
~Parser9x()125 Parser9x::~Parser9x()
126 {
127     // Sanity check
128     if ( !oldParsingStates.empty() || m_subDocument != None ) {
129         wvlog << "Bug: Someone messed up the save/restore stack!" << endl;
130     }
131 
132     delete m_currentParagraph;
133     delete m_tableRowStart;
134     delete m_drawings;
135     delete m_fonts;
136     delete m_plcfpcd;
137     delete m_headers;
138     delete m_footnotes;
139     delete m_bookmarks;
140     delete m_annotations;
141     delete m_fields;
142     delete m_textconverter;
143     delete m_properties;
144     delete m_lists;
145     delete m_data;
146     delete m_table;
147 }
148 
parse()149 bool Parser9x::parse()
150 {
151     if ( !isOk() )
152         return false;
153 
154     if ( m_fib.fEncrypted ) {
155         // There is some code out there to break this "encryption", do we want
156         // to implement that?
157         // We could either ask for a password or cheat a bit :-)
158         wvlog << "Error: The document is encrypted." << endl;
159         return false;
160     }
161 
162     if ( m_fib.lcbClx == 0 )
163         fakePieceTable();
164     else {
165         // Get the piece table
166         if ( !readPieceTable() )
167             return false;
168     }
169 
170     //provide the headers mask to m_subDocumentHandler
171     if (m_headers) {
172         m_subDocumentHandler->headersMask(m_headers->headersMask());
173     }
174     // start parsing the body
175     if ( !parseBody() )
176         return false;
177     return true;
178 }
179 
fib() const180 const Word97::FIB& Parser9x::fib() const
181 {
182     return m_fib;
183 }
184 
dop() const185 const Word97::DOP& Parser9x::dop() const
186 {
187     return m_properties->dop();
188 }
189 
font(S16 ftc) const190 const Word97::FFN& Parser9x::font( S16 ftc ) const
191 {
192     return m_fonts->font( ftc );
193 }
194 
associatedStrings()195 AssociatedStrings Parser9x::associatedStrings()
196 {
197     return AssociatedStrings( m_fib.fcSttbfAssoc, m_fib.lcbSttbfAssoc,
198                               m_fib.fFarEast ? m_fib.lidFE : m_fib.lid, m_table );
199 }
200 
styleSheet() const201 const StyleSheet& Parser9x::styleSheet() const
202 {
203     return m_properties->styleSheet();
204 }
205 
getDrawings() const206 const Drawings* Parser9x::getDrawings() const
207 {
208     return m_drawings;
209 }
210 
getTable()211 OLEStreamReader* Parser9x::getTable()
212 {
213     return m_table;
214 }
215 
parseHeaders(const HeaderData & data)216 void Parser9x::parseHeaders( const HeaderData& data )
217 {
218     m_subDocumentHandler->headersStart();
219     for ( unsigned char mask = HeaderData::HeaderEven;
220           mask <= HeaderData::FooterFirst; mask <<= 1 )
221     {
222         if ( mask & data.headerMask ) {
223             parseHeader( data, mask );
224         }
225     }
226     m_subDocumentHandler->headersEnd();
227 }
228 
parseFootnote(const FootnoteData & data)229 void Parser9x::parseFootnote( const FootnoteData& data )
230 {
231 #ifdef WV2_DEBUG_FOOTNOTES
232     wvlog << "Parser9x::parseFootnote() #####################" << endl;
233 #endif
234     // shouldn't happen, but well...
235     if ( data.limCP - data.startCP == 0 ) {
236         return;
237     }
238 
239     saveState( data.limCP - data.startCP, data.type == FootnoteData::Footnote ? Footnote : Endnote );
240     m_subDocumentHandler->footnoteStart();
241 
242     U32 offset = m_fib.ccpText + data.startCP;
243     if ( data.type == FootnoteData::Endnote ) {
244         offset += m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpMcr + m_fib.ccpAtn;
245     }
246     parseHelper( Position( offset, m_plcfpcd ) );
247 
248     m_subDocumentHandler->footnoteEnd();
249     restoreState();
250 #ifdef WV2_DEBUG_FOOTNOTES
251     wvlog << "Parser9x::parseFootnote() done ################" << endl;
252 #endif
253 }
254 
parseAnnotation(const AnnotationData & data)255 void Parser9x::parseAnnotation( const AnnotationData& data )
256 {
257 #ifdef WV2_DEBUG_ANNOTATIONS
258     wvlog << "Parser9x::parseAnnotation() #####################" << endl;
259 #endif
260     // shouldn't happen, but well...
261     if ( data.limCP - data.startCP == 0 ) {
262         return;
263     }
264 
265     saveState( data.limCP - data.startCP, Annotation );
266     m_subDocumentHandler->annotationStart();
267 
268     U32 offset = m_fib.ccpText + m_fib.ccpFtn + m_fib.ccpHdd + data.startCP;
269     parseHelper( Position( offset, m_plcfpcd ) );
270 
271     m_subDocumentHandler->annotationEnd();
272     restoreState();
273 #ifdef WV2_DEBUG_ANNOTATIONS
274     wvlog << "Parser9x::parseAnnotation() done ################" << endl;
275 #endif
276 }
277 
parseTableRow(const TableRowData & data)278 void Parser9x::parseTableRow( const TableRowData& data )
279 {
280 #ifdef WV2_DEBUG_TABLES
281     wvlog << "Parser9x::parseTableRow(): startPiece=" << data.startPiece <<
282              " startOffset=" << data.startOffset << " length=" << data.length << endl;
283 #endif
284 
285     if ( data.length == 0 ) {
286         return;
287     }
288 
289     saveState( data.length, static_cast<SubDocument>( data.subDocument ), Table );
290     m_remainingCells = data.tap->itcMac;
291     m_tableHandler->tableRowStart( data.tap );
292     m_tableHandler->tableCellStart();
293 
294     parseHelper( Position( data.startPiece, data.startOffset ) );
295 
296     m_tableHandler->tableRowEnd();
297     restoreState();
298 
299 #ifdef WV2_DEBUG_TABLES
300     wvlog << "Parser9x::parseTableRow() done #####################" << endl;
301 #endif
302 }
303 
parseTextBox(unsigned int index,bool stylesxml)304 void Parser9x::parseTextBox(unsigned int index, bool stylesxml)
305 {
306     const PLCF<Word97::FTXBXS>* plcftxbxTxt = 0;
307     if (stylesxml) {
308         plcftxbxTxt =  m_drawings->getHdrTxbxTxt();
309     } else {
310         plcftxbxTxt =  m_drawings->getTxbxTxt();
311     }
312     if (!plcftxbxTxt) {
313         wvlog << "plcftxbxTxt MISSING!";
314         return;
315     }
316     //NOTE: text ranges for each FTXBXS structure are separated by 0x0D
317     //characters that MUST be the last character in each range.
318 
319     PLCFIterator<Word97::FTXBXS> it( plcftxbxTxt->at( index ) );
320 
321     //TODO: Do we need to save the state here?
322     saveState( it.currentRun() - 1, TextBox );
323     U32 offset = m_fib.ccpText + it.currentStart();
324     offset += m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpAtn + m_fib.ccpEdn;
325     parseHelper( Position( offset, m_plcfpcd ) );
326     restoreState();
327 }
328 
tableStream() const329 std::string Parser9x::tableStream() const
330 {
331     if ( m_fib.nFib < Word8nFib )
332         return "WordDocument";    // Word 6 or Word 7 (==95)
333     else
334         return m_fib.fWhichTblStm ? "1Table" : "0Table";  // Word 8 (==97) or newer
335 }
336 
init()337 void Parser9x::init()
338 {
339     if ( m_fib.fFarEast )
340         m_textconverter = new TextConverter( m_fib.lidFE );
341     else
342         m_textconverter = new TextConverter( m_fib.lid );
343 
344     // Get hold of all the SEP/PAP/CHP related structures and the StyleSheet
345     m_properties = new Properties97( m_wordDocument, m_table, m_fib );
346 
347     if ( m_fib.nFib < Word8nFib ) // Word67
348         m_lists = new ListInfoProvider( &styleSheet() );
349     else
350         m_lists = new ListInfoProvider( m_table, m_fib, &m_properties->styleSheet() );
351 
352     m_fonts = new FontCollection( m_table, m_fib );
353     m_fields = new Fields( m_table, m_fib );
354     m_drawings = new Drawings( m_table, m_fib );
355 
356     if (( m_fib.ccpFtn != 0 ) || ( m_fib.ccpEdn != 0 ))
357         m_footnotes = new Footnotes97( m_table, m_fib );
358 
359     if (( m_fib.lcbPlcfbkf != 0 ) || ( m_fib.lcbPlcfbkl != 0 ))
360         m_bookmarks = new Bookmarks( m_table, m_fib );
361 
362     if ( m_fib.ccpAtn != 0 ) {
363         m_annotations = new Annotations( m_table, m_fib );
364     }
365 }
366 
readPieceTable()367 bool Parser9x::readPieceTable()
368 {
369     m_table->seek( m_fib.fcClx );
370     // first skip the leading grpprl blocks, we'll re-read them
371     // if we need them later (no caching here)
372     U8 blockType = m_table->readU8();
373     while ( blockType == wvWare::clxtGrpprl ) {
374         U16 size = m_table->readU16();
375 #if WV2_DUMP_PIECE_TABLE > 0
376         wvlog << "Found a clxtGrpprl (size=" << size << ")" << endl;
377 #endif
378         m_table->seek( size, WV2_SEEK_CUR );
379         blockType = m_table->readU8();
380     }
381     if ( blockType == wvWare::clxtPlcfpcd ) {
382         U32 size = m_table->readU32();
383 #if WV2_DUMP_PIECE_TABLE > 0
384         wvlog << "Found the clxtPlcfpcd (size=" << size << ")" << endl;
385 #endif
386         m_plcfpcd = new PLCF<Word97::PCD>( size, m_table, false );
387 
388 #if WV2_DUMP_PIECE_TABLE > 1
389         PLCFIterator<Word97::PCD> it( *m_plcfpcd );
390         for ( int i = 0; it.current(); ++it, ++i ) {
391             wvlog << "Piece Table Entry(" << i << "): " << endl;
392             wvlog << "   start: " << it.currentStart() << endl;
393             wvlog << "   lim: " << it.currentLim() << endl;
394             wvlog << "   complex: " << it.current()->prm.fComplex << endl;
395             if ( it.current()->prm.fComplex )
396                 wvlog << "   igrpprl: " << it.current()->prm.toPRM2().igrpprl << endl;
397             else
398                 wvlog << "   isprm: " << it.current()->prm.isprm << endl;
399 
400             U32 fc = it.current()->fc;
401             U32 limit = it.currentRun() << 1;
402             wvlog << "   value: " << fc << endl;
403             if ( fc & 0x40000000 ) {
404                 fc = ( fc & 0xbfffffff ) >> 1;
405                 limit >>= 1;
406                 wvlog << "   value (cleared 2nd MSB, div. by 2): " << fc << endl;
407             }
408             m_wordDocument->seek( fc );
409             wvlog << "   position: " << m_wordDocument->tell() << ", limit: " << limit << endl;
410             for ( unsigned int j = 0; j < limit; ++j ) {
411                 U8 foo = m_wordDocument->readU8();
412                 if ( foo > 31 )
413                     wvlog << static_cast<char>( foo );
414                 else if ( foo == PARAGRAPH_MARK )
415                     wvlog << endl;
416                 else if ( foo > 0 )
417                     wvlog << "{" <<  static_cast<int>( foo ) << "}";
418                 else
419                     wvlog << "_";
420             }
421             wvlog << endl << "   position: " << m_wordDocument->tell() << ", limit: " << limit << endl;
422         }
423 #endif
424     }
425     else {
426         wvlog << "Oooops, couldn't find the piece table." << endl;
427         return false;
428     }
429     return true;
430 }
431 
fakePieceTable()432 void Parser9x::fakePieceTable()
433 {
434     U32 fakePlcfPCD[ 4 ];
435     // The first CP is 0 (endianness doesn't matter :-)
436     fakePlcfPCD[ 0 ] = 0;
437     // The second CP corresponds to the length of the document
438     fakePlcfPCD[ 1 ] = toLittleEndian( m_fib.ccpText + m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpMcr +
439                                        m_fib.ccpAtn + m_fib.ccpEdn + m_fib.ccpTxbx + m_fib.ccpHdrTxbx );
440 
441     // Now fake a matching PCD
442     U8* tmp( reinterpret_cast<U8*>( &fakePlcfPCD[0] ) );
443     tmp += 8;
444     *tmp++ = 0;  // first the bitfields (unused)
445     *tmp++ = 0;
446     U32 fcMin = m_fib.fcMin << 1;
447     fcMin |= 0x40000000;
448     *tmp++ = static_cast<U8>( fcMin & 0x000000ff );
449     *tmp++ = static_cast<U8>( ( fcMin & 0x0000ff00 ) >> 8 );   // then store the
450     *tmp++ = static_cast<U8>( ( fcMin & 0x00ff0000 ) >> 16 );  // fc in little
451     *tmp++ = static_cast<U8>( ( fcMin & 0xff000000 ) >> 24 );  // endian style
452     *tmp++ = 0;  // then an empty PRM
453     *tmp++ = 0;
454 
455     tmp = reinterpret_cast<U8*>( &fakePlcfPCD[0] );
456     m_plcfpcd = new PLCF<Word97::PCD>( 16, tmp );
457 }
458 
parseBody()459 bool Parser9x::parseBody()
460 {
461     saveState( m_fib.ccpText, Main );
462     m_subDocumentHandler->bodyStart();
463 
464     SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( 0 ) );
465     if ( !sep ) {
466         sep = new Word97::SEP(); // don't pass 0 pointers in any case
467     }
468 #ifdef WV2_DEBUG_SECTIONS
469     sep->dump();
470 #endif
471 
472     m_textHandler->sectionStart( sep ); // First section, starting at CP 0
473 
474     emitHeaderData( sep );
475     sep = 0; // get rid of the huge SEP
476 
477     // Process all the pieces belonging to the main document text
478     parseHelper( Position( 0, static_cast<U32>( 0 ) ) );
479 
480     // Implicit end of the section
481     m_textHandler->sectionEnd();
482     m_subDocumentHandler->bodyEnd();
483     restoreState();
484     return true;
485 }
486 
parseHelper(Position startPos)487 void Parser9x::parseHelper( Position startPos )
488 {
489     PLCFIterator<Word97::PCD> it( m_plcfpcd->at( startPos.piece ) );
490 
491     while ( m_remainingChars > 0 && it.current() ) {
492         U32 fc = it.current()->fc;   // Start FC of this piece
493         bool unicode;
494         realFC( fc, unicode );
495 
496         U32 limit = it.currentRun(); // Number of characters in this piece
497 
498         // Check whether the text starts somewhere within the piece, reset at
499         // the end of the loop body.
500         if ( startPos.offset != 0 ) {
501             fc += unicode ? startPos.offset * 2 : startPos.offset;
502             limit -= startPos.offset;
503         }
504 
505         limit = limit > m_remainingChars ? m_remainingChars : limit;
506         m_wordDocument->seek( fc );
507 
508         if ( unicode ) {
509             XCHAR* string = new XCHAR[ limit ];
510             // First read the whole piece
511             for ( unsigned int j = 0; j < limit; ++j ) {
512                 string[ j ] = m_wordDocument->readU16();
513                 if ( ( string[ j ] & 0xff00 ) == 0xf000 ) {
514                     // Microsoft uses a Private Unicode Area (PUA) to store the characters of the
515                     // Symbol and the Wingdings font. We simply clear these bits to shift the
516                     // characters to 0x00XX and hope the correct font is installed.  If the font
517                     // isn't there, the user will get some ASCII text instead of symbols :}
518                     //wvlog << "private unicode area detected -- cropping" << endl;
519                     string[ j ] &= 0x00ff;
520                 }
521             }
522             // also takes care to delete [] string
523             processPiece<XCHAR>( string, fc, limit, startPos );
524         }
525         else {
526             U8* string = new U8[ limit ];
527             m_wordDocument->read( string, limit );
528             // also takes care to delete [] string
529             processPiece<U8>( string, fc, limit, startPos );
530         }
531         m_remainingChars -= limit;
532         ++it;
533         ++startPos.piece;
534         startPos.offset = 0; // just in case it was != 0 in the first iteration
535     }
536 }
537 
538 template<typename String>
processPiece(String * string,U32 fc,U32 limit,const Position & position)539 void Parser9x::processPiece( String* string, U32 fc, U32 limit, const Position& position )
540 {
541     // Take a closer look at the piece we just read. "start" and "index" are
542     // counted in character positions (take care!)
543     unsigned int start = 0;
544     unsigned int index = 0;
545     while ( index < limit ) {
546         switch( string[ index ] ) {
547         case SECTION_MARK:
548         {
549             if ( !m_currentParagraph->empty() || start != index ) {
550                 // No "index - start + 1" here, as we don't want to copy the section mark!
551                 UString ustring( processPieceStringHelper( string, start, index ) );
552                 m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
553                                                       fc + start * sizeof( String ),
554                                                       sizeof( String ) == sizeof( XCHAR ) ) );
555 
556                 processParagraph( fc + index * sizeof( String ) );
557             }
558             start = ++index;
559 
560             SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( m_fib.ccpText - m_remainingChars + index ) );
561             if ( sep ) {
562 
563                 //Check if table skimming was active lately.  If yes, then this
564                 //is the SECTION_MARK which follows a table.
565                 if (m_table_skimming) {
566                     m_table_skimming = false;
567                     wvlog << "A table was identified lately: informing the texthandler.";
568                     m_textHandler->tableEndFound();
569                 }
570                 // It's not only a page break, it's a new section
571                 m_textHandler->sectionEnd();
572                 m_textHandler->sectionStart( sep );
573                 emitHeaderData( sep );
574             }
575             else {
576                 //manual page break
577                 m_textHandler->pageBreak();
578             }
579             break;
580         }
581         // same ASCII code as TTP_MARK (0x0007), NOTE: table depth == 1
582         case CELL_MARK:
583             m_cellMarkFound = true;
584             // Fall-through intended. A row/cell end is also a paragraph end.
585         case PARAGRAPH_MARK:
586         {
587             // No "index - start + 1" here, as we don't want to copy the
588             // paragraph mark!
589             UString ustring( processPieceStringHelper( string, start, index ) );
590             m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
591                                                   fc + start * sizeof( String ),
592                                                   sizeof( String ) == sizeof( XCHAR ) ) );
593             processParagraph( fc + index * sizeof( String ) );
594             m_cellMarkFound = false;
595             start = ++index;
596 
597             //signal progress
598             if (m_subDocument == Main && m_parsingMode == Default) {
599                 int value = m_fib.ccpText - m_remainingChars + index;
600                 int percentage = (int)((value / (float) m_fib.ccpText) * 100);
601                 m_subDocumentHandler->setProgress( percentage );
602             }
603 
604             break;
605         }
606         // "Special" characters
607         case TAB:
608             string[ index ] = m_inlineHandler->tab();
609             ++index;
610             break;
611         case HARD_LINE_BREAK:
612             string[ index ] = m_inlineHandler->hardLineBreak();
613             ++index;
614             break;
615         case COLUMN_BREAK:
616             string[ index ] = m_inlineHandler->columnBreak();
617             ++index;
618             break;
619         case NON_BREAKING_HYPHEN:
620             string[ index ] = m_inlineHandler->nonBreakingHyphen();
621             ++index;
622             break;
623         case NON_REQUIRED_HYPHEN:
624             string[ index ] = m_inlineHandler->nonRequiredHyphen();
625             ++index;
626             break;
627         case NON_BREAKING_SPACE:
628             string[ index ] = m_inlineHandler->nonBreakingSpace();
629             ++index;
630             break;
631         default:
632             ++index;
633             break;
634         }
635     } //while
636     if ( start < limit ) {
637         // Finally we have to add the remaining text to the current paragraph
638         // (if there is any)
639         UString ustring( processPieceStringHelper( string, start, limit ) );
640         m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
641                                               fc + start * sizeof( String ), sizeof( String ) == sizeof( XCHAR ) ) );
642     }
643     delete [] string;
644 }
645 
processPieceStringHelper(XCHAR * string,unsigned int start,unsigned int index) const646 UString Parser9x::processPieceStringHelper( XCHAR* string, unsigned int start, unsigned int index ) const
647 {
648     return UString( reinterpret_cast<const wvWare::UChar *>( &string[ start ] ), index - start );
649 }
650 
processPieceStringHelper(U8 * string,unsigned int start,unsigned int index) const651 UString Parser9x::processPieceStringHelper( U8* string, unsigned int start, unsigned int index ) const
652 {
653     return m_textconverter->convert( reinterpret_cast<char*>( &string[ start ] ), index - start );
654 }
655 
processParagraph(U32 fc)656 void Parser9x::processParagraph( U32 fc )
657 {
658     // Get the PAP structure as it was at the last full-save
659     ParagraphProperties* props( m_properties->fullSavedPap( fc, m_data ) );
660     // ...and apply the latest changes, then the PAP is completely restored
661     m_properties->applyClxGrpprl( m_plcfpcd->at( m_currentParagraph->back().m_position.piece ).current(),
662                                   m_fib.fcClx, props );
663 
664     // Skim the tables first, as soon as the functor is invoked we have to
665     // parse them and emit the text
666     if ( m_parsingMode == Default && props->pap().fInTable ) {
667 
668         //TODO: We could be already skimming a separate table, check TAP!  In
669         //case this is a new table inform the texthandler.
670 
671         //TODO: Support for nested tables!
672 
673         if ( !m_tableRowStart ) {
674             m_tableRowStart = new Position( m_currentParagraph->front().m_position );
675             m_tableRowLength = 0;
676             m_table_skimming = true;
677 
678 #ifdef WV2_DEBUG_TABLES
679             props->pap().dump();
680             wvlog << "Start of a table row: piece=" << m_tableRowStart->piece <<
681                      " offset=" << m_tableRowStart->offset << endl;
682 
683 #endif
684         }
685         // init == 1 because of the parag. mark!
686         m_tableRowLength += std::accumulate( m_currentParagraph->begin(), m_currentParagraph->end(), 1,
687                                              &Parser9x::accumulativeLength );
688 
689         //check if this is a Table Terminating Paragraph Mark
690         if ( props->pap().fTtp ) {
691             // Restore the table properties of this row
692             Word97::TAP* tap = m_properties->fullSavedTap( fc, m_data );
693 #ifdef WV2_DEBUG_TABLES
694             tap->dump();
695 #endif
696             m_properties->applyClxGrpprl( m_plcfpcd->at( m_currentParagraph->back().m_position.piece ).current(),
697                                           m_fib.fcClx, tap, m_properties->styleByIndex( props->pap().istd ) );
698 
699             SharedPtr<const Word97::TAP> sharedTap( tap );
700 
701             // We decrement the length by 1 that the trailing row mark doesn't
702             // emit one empty paragraph during parsing.
703             TableRowData data( m_tableRowStart->piece, m_tableRowStart->offset, m_tableRowLength - 1,
704                                static_cast<int>( m_subDocument ), sharedTap );
705 
706             m_textHandler->tableRowFound( make_functor( *this, &Parser9x::parseTableRow, data), sharedTap );
707 
708             delete m_tableRowStart;
709             m_tableRowStart = 0;
710         }
711         delete props;
712     }
713     else {
714 #ifdef WV2_DEBUG_PARAGRAPHS
715         props->pap().dump();
716 #endif
717 
718         //Check if table skimming was active lately.  If yes, then this is the
719         //paragraph behind the table (either a PARAGRAPH_MARK or a SECTION_MARK
720         //follows a table)!
721         if (m_table_skimming) {
722             m_table_skimming = false;
723             wvlog << "A table was identified lately: informing the texthandler.";
724             m_textHandler->tableEndFound();
725         }
726 
727         // Get the appropriate style for this paragraph.
728         const Style* style = m_properties->styleByIndex( props->pap().istd );
729         if ( !style ) {
730             wvlog << "Warning: Huh, really obscure error, couldn't find the Style for the current PAP -- skipping" << endl;
731             return;
732         }
733 
734         // Get the CHP for the paragraph.
735         Word97::CHP* paragraphChp = new Word97::CHP( style->chp() );
736         m_properties->fullSavedChp( fc, paragraphChp, style );
737 
738 #ifdef WV2_DEBUG_PARAGRAPHS
739         paragraphChp->dump();
740 #endif
741 
742         // Now that we have the complete PAP and CHP, let's see if this
743         // paragraph belongs to a list.
744         props->createListInfo( *m_lists, *paragraphChp );
745 
746 #ifdef WV2_DEBUG_LIST_PROCESSING
747         props->pap().dump();
748 #endif
749 
750         // Parse the bullet picture data.
751         const Word97::CHP* bulletChp = 0;
752         if (props->listInfo()) {
753             bulletChp = (props->listInfo()->text()).chp;
754         }
755         if (bulletChp && bulletChp->fPicBullet) {
756             bool ok;
757             BookmarkData data( m_bookmarks->bookmark(UString("_PictureBullets"), ok) );
758             if (ok) {
759                 Position pos(data.startCP + bulletChp->picBulletCP, m_plcfpcd);
760                 PLCFIterator<Word97::PCD> it( m_plcfpcd->at( pos.piece ));
761                 U32 fc = it.current()->fc;
762                 bool unicode;
763 
764                 realFC( fc, unicode );
765                 fc +=  unicode ? pos.offset * 2: pos.offset;
766 
767                 Word97::CHP* bulletPicChp = new Word97::CHP();
768                 m_properties->fullSavedChp( fc, bulletPicChp, 0 );
769 
770                 if (bulletPicChp->fSpec) {
771                     m_wordDocument->push();
772                     m_wordDocument->seek( fc, WV2_SEEK_SET );
773                     U8 c = m_wordDocument->readU8();
774                     m_wordDocument->pop();
775 
776                     if (c == TextHandler::Picture) {
777                         SharedPtr<const Word97::CHP> sharedBPChp( bulletPicChp );
778                         QString name = emitPictureData( 0, sharedBPChp, true);
779                         props->setBulletPictureName(name);
780                     } else {
781                         wvlog << "BulletPicture: Support for character 0x" << hex << c << "not implement yet.";
782                     }
783                 } else {
784                     wvlog << "BulletPicture: A special character expected, skipping!";
785                 }
786             }
787         }
788 
789         // keep it that way, else the variables get deleted!
790         SharedPtr<const ParagraphProperties> sharedPap( props );
791         SharedPtr<const Word97::CHP> sharedParagraphChp( paragraphChp );
792 
793         m_textHandler->paragraphStart( sharedPap, sharedParagraphChp );
794 
795         std::list<Chunk>::const_iterator it = m_currentParagraph->begin();
796         std::list<Chunk>::const_iterator end = m_currentParagraph->end();
797 
798         // Now walk the paragraph, chunk for chunk
799         for ( ; it != end; ++it ) {
800             U32 index = 0;
801             const U32 limit = ( *it ).m_text.length();
802             const PLCFIterator<Word97::PCD> pcdIt( m_plcfpcd->at( ( *it ).m_position.piece ) );
803 
804             while ( index < limit ) {
805                 // A temporary character style initialized to CHP of the
806                 // paragraph style.  Both CHPX and the built-in character style
807                 // referred by the istd are applied on top of it, while
808                 // comparing with the current CHP.
809                 Style charStyle( style->chp() );
810 
811                 U32 fc = ( *it ).m_startFC + index * ( ( *it ).m_isUnicode ? 2 : 1 );
812                 U32 length = m_properties->fullSavedChp( fc,
813                                                          &(const_cast<Word97::CHP&>(charStyle.chp())),
814                                                          &charStyle );
815                 if ( ( *it ).m_isUnicode ) {
816                     length >>= 1;
817                 }
818                 length = length > limit - index ? limit - index : length;
819 
820                 m_properties->applyClxGrpprl( pcdIt.current(), m_fib.fcClx,
821                                               &(const_cast<Word97::CHP&>(charStyle.chp())),
822                                               &charStyle );
823 
824                 Word97::CHP* chp = new Word97::CHP( charStyle.chp() );
825                 // keep it that way, else the CHP gets deleted!
826                 SharedPtr<const Word97::CHP> sharedChp( chp );
827                 processChunk( *it, sharedChp, length, index, pcdIt.currentStart() );
828                 index += length;
829             }
830             // Bookmark check for the next to last CP (paragraph mark).
831             if ( m_bookmarks ) {
832                 emitBookmark( ( *it ).m_position.offset + limit );
833             }
834         }
835         m_textHandler->paragraphEnd();
836 
837         if ( m_cellMarkFound ) {
838             m_tableHandler->tableCellEnd();
839             if ( --m_remainingCells > 0) {
840                 m_tableHandler->tableCellStart();
841             }
842         }
843     }
844     m_currentParagraph->clear();
845 }
846 
processChunk(const Chunk & chunk,SharedPtr<const Word97::CHP> chp,U32 length,U32 index,U32 currentStart)847 void Parser9x::processChunk( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
848                              U32 length, U32 index, U32 currentStart )
849 {
850     // XXX: does the following hold for Annotations as well? (BSAR)
851 
852     // Some characters have a special meaning (e.g. a footnote is anchored at some
853     // position inside the text) and they *don't* have the fSpec flag set. This means
854     // that we have to watch out for such characters even in plain text. Slooow :}
855     //
856     // For now we only have to handle footnote and endnote references that way. Due to that
857     // the code below is a bit simpler right now, but I fear we have to extend that later on.
858     // (We will have to keep track of the type of disruption, footnote() takes care of all now)
859     //
860     // A precondition for the footnote/endnote implementation below is, that footnote and
861     // endnote references only occur in the main body text. The reason is that we only check
862     // for the next footnote inside the PLCF and don't take subdocuments into account. If
863     // it turns out that this precondition is not satisfied we would have to change the
864     // O(1) nextFootnote() call to something like an O(n) containsFootnote( start, lim )
865     // Up to now Word 97, 2000, and 2002 seem to be bug compatible and fullfill that precondition.
866     //
867 
868     //only process the chunk if not marked hidden, TODO use text:display="none"
869     if (chp->fVanish == 1) {
870         return;
871     }
872 
873     while ( length > 0 ) {
874         U32 startCP = currentStart + chunk.m_position.offset + index;
875         U32 disruption = 0xffffffff; // "infinity"
876         U32 bkmk_length = 0; //num. of CPs enclosed in a bookmark
877 
878         if ( m_footnotes ) {
879             if (m_subDocument == Main) {
880                 m_footnotes->check(startCP);
881             }
882 
883             U32 nextFtn = m_footnotes->nextFootnote();
884             U32 nextEnd = m_footnotes->nextEndnote();
885             disruption = nextFtn < nextEnd ? nextFtn : nextEnd;
886 
887 #ifdef WV2_DEBUG_FOOTNOTES
888             wvlog << "nextFtn=" << nextFtn << " nextEnd=" << nextEnd <<
889                      " disruption=" << disruption << " length=" << length << endl;
890 #endif
891         } else if ( m_bookmarks ) {
892             if (m_subDocument == Main) {
893                 m_bookmarks->check(startCP);
894             }
895 
896             U32 nextBkf = m_bookmarks->nextBookmarkStart();
897             U32 nextBkl = m_bookmarks->nextBookmarkEnd();
898 
899             bkmk_length = nextBkl - nextBkf;
900             disruption = nextBkf;
901 
902 #ifdef WV2_DEBUG_BOOKMARK
903             wvlog << "nextBkf=" << nextBkf << "(0x" << hex << nextBkf << ")" <<dec<<
904                      "nextBkl=" << nextBkl << "(0x" << hex << nextBkl << ")" <<dec<<
905                      "disruption=" << disruption << "length=" << length << endl;
906 #endif
907             Q_ASSERT (nextBkf <= nextBkl);
908         }
909 
910         if ( (disruption >= startCP) && (disruption < (startCP + length)) ) {
911 
912 #if defined WV2_DEBUG_FOOTNOTES || defined WV2_DEBUG_BOOKMARK
913             wvlog << "startCP=" << startCP << " disruption=" << disruption <<
914              " bkmk_length=" << bkmk_length << " length=" << length << endl;
915 #endif
916             U32 disLen = disruption - startCP;
917             //there's something to be processed before the bookmark
918             if ( disLen != 0 ) {
919                 processRun( chunk, chp, disLen, index, currentStart );
920                 length -= disLen;
921                 index += disLen;
922             }
923 
924             if ( m_footnotes ) {
925                 //TODO: support for bookmarks in the number of a footnote
926                 m_customFootnote = chunk.m_text.substr(index, length);
927                 emitFootnote( m_customFootnote, disruption, chp, length );
928                 m_customFootnote = "";
929                 length = 0;
930             }
931             else if ( m_bookmarks ) {
932 
933                 //TODO: There might a number of bookmarks to process at the
934                 //current CP.  The first one gets processed, the rest is
935                 //skipped at the moment.
936 
937                 //TODO: Bookmarks can overlap, handle all bookmarks of a chunk.
938 
939                 //TODO: A bookmark can denote text comrised of segments
940                 //belonging into different chunks.
941 
942                 //NOTE: Not checking the ok value, invalid bookmarks were
943                 //already reported.  So it's obsolete at the moment.
944                 bool ok;
945                 BookmarkData data( m_bookmarks->bookmark( disruption, ok ) );
946 
947                 if ( !(bkmk_length <= length) ) {
948                     wvlog << "WARNING: bookmarks covering several chunks are not supported yet!";
949                     processRun( chunk, chp, length, index, currentStart );
950                     length = 0;
951                 } else {
952                     m_textHandler->bookmarkStart( data );
953                     if (bkmk_length > 0) {
954                         processRun( chunk, chp, bkmk_length, index, currentStart );
955                         m_textHandler->bookmarkEnd( data );
956                         length -= bkmk_length;
957                         index += bkmk_length;
958                     }
959                 }
960             }
961         } else {
962             processRun( chunk, chp, length, index, currentStart );
963             break; // should be faster than messing with length...
964         }
965     }
966 }
967 
processRun(const Chunk & chunk,SharedPtr<const Word97::CHP> chp,U32 length,U32 index,U32 currentStart)968 void Parser9x::processRun( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
969                            U32 length, U32 index, U32 currentStart )
970 {
971     if ( chp->fSpec ) {
972         U32 i = 0;
973         while ( i < length ) {
974             emitSpecialCharacter( chunk.m_text[ index + i ], currentStart + chunk.m_position.offset + index + i, chp );
975             ++i;
976         }
977     }
978     else {
979         UConstString str( const_cast<UChar*>( chunk.m_text.data() ) + index, length );
980         m_textHandler->runOfText( str.string(), chp );
981     }
982 }
983 
emitSpecialCharacter(UChar character,U32 globalCP,SharedPtr<const Word97::CHP> chp)984 void Parser9x::emitSpecialCharacter( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp )
985 {
986     switch( character.unicode() ) {
987         // Is it one of the "simple" special characters?
988     case TextHandler::CurrentPageNumber:
989     case TextHandler::LineNumber:
990     case TextHandler::AbbreviatedDate:
991     case TextHandler::TimeHMS:
992     case TextHandler::CurrentSectionNumber:
993     case TextHandler::AbbreviatedDayOfWeek:
994     case TextHandler::DayOfWeek:
995     case TextHandler::DayShort:
996     case TextHandler::HourCurrentTime:
997     case TextHandler::HourCurrentTimeTwoDigits:
998     case TextHandler::MinuteCurrentTime:
999     case TextHandler::MinuteCurrentTimeTwoDigits:
1000     case TextHandler::SecondsCurrentTime:
1001     case TextHandler::AMPMCurrentTime:
1002     case TextHandler::CurrentTimeHMSOld:
1003     case TextHandler::DateM:
1004     case TextHandler::DateShort:
1005     case TextHandler::MonthShort:
1006     case TextHandler::YearLong:
1007     case TextHandler::YearShort:
1008     case TextHandler::AbbreviatedMonth:
1009     case TextHandler::MonthLong:
1010     case TextHandler::CurrentTimeHMS:
1011     case TextHandler::DateLong:
1012         m_textHandler->specialCharacter( static_cast<TextHandler::SpecialCharacter>( character.unicode() ), chp );
1013         break;
1014 
1015         // It has to be one of the very special characters...
1016     case TextHandler::Picture:
1017         // PictureData are required to process inline MS-ODRAW objects.
1018         emitPictureData( globalCP, chp );
1019         break;
1020     case TextHandler::DrawnObject:
1021         // Only globalCP is required to process floating MS-ODRAW objects.
1022         m_textHandler->msodrawObjectFound( globalCP, 0 );
1023         break;
1024     case TextHandler::FootnoteAuto:
1025         if ( m_subDocument == Footnote || m_subDocument == Endnote ) {
1026             m_textHandler->footnoteAutoNumber( chp );
1027         } else {
1028             emitFootnote( UString(character), globalCP, chp);
1029         }
1030         break;
1031     case TextHandler::FieldBegin:
1032         {
1033             const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1034             if ( fld ) {
1035                 m_textHandler->fieldStart( fld, chp );
1036             } else {
1037                 wvlog << "FieldStart: Plcfld does not contain this CP, ignoring!";
1038             }
1039             break;
1040         }
1041     case TextHandler::FieldSeparator:
1042         {
1043             const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1044             if ( fld ) {
1045                 m_textHandler->fieldSeparator( fld, chp );
1046             } else {
1047                 wvlog << "FieldSeparator: Plcfld does not contain this CP, ignoring!";
1048             }
1049             break;
1050         }
1051     case TextHandler::FieldEnd:
1052         {
1053             const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1054             if ( fld ) {
1055                 m_textHandler->fieldEnd( fld, chp );
1056             } else {
1057                 wvlog << "FieldEnd: Plcfld does not contain this CP, ignoring!";
1058             }
1059             break;
1060         }
1061     case TextHandler::AnnotationRef:
1062         {
1063             //comment reference characters are only in the Main Document
1064             if (m_subDocument == Main) {
1065                 emitAnnotation( UString(character), globalCP, chp );
1066             }
1067         }
1068     case TextHandler::FieldEscapeChar:
1069         wvlog << "Found an escape character ++++++++++++++++++++?" << endl;
1070         break;
1071     case TextHandler::Symbol:
1072     {
1073         //NOTE: MS Word 2k/2k3/2k7 ignores chp->ftcSym (font for the symbol).
1074         m_textHandler->runOfText(UString(reinterpret_cast<const wvWare::UChar*>(&chp->xchSym), 1), chp);
1075         break;
1076     }
1077     default:
1078         wvlog << "Parser9x::processSpecialCharacter(): Support for character " << character.unicode()
1079               << " not implemented yet." << endl;
1080         break;
1081     }
1082 }
1083 
emitFootnote(UString characters,U32 globalCP,SharedPtr<const Word97::CHP> chp,U32)1084 void Parser9x::emitFootnote( UString characters, U32 globalCP,
1085                              SharedPtr<const Word97::CHP> chp,
1086                              U32 /* length */ )
1087 {
1088     if ( !m_footnotes ) {
1089         wvlog << "Bug: Found a footnote, but m_footnotes == 0!" << endl;
1090         return;
1091     }
1092 #ifdef WV2_DEBUG_FOOTNOTES
1093     wvlog << "######### Footnote found: CP=" << globalCP << endl;
1094 #endif
1095     bool ok;
1096     FootnoteData data( m_footnotes->footnote( globalCP, ok ) );
1097     if ( ok ) {
1098 #ifdef WV2_DEBUG_FOOTNOTES
1099         wvlog << "char: 0x" << hex << characters[0].unicode() <<
1100                  "| fAuto:" << data.autoNumbered <<
1101                  "| fSpec:" << chp->fSpec;
1102 #endif
1103         SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( globalCP ) );
1104         m_textHandler->footnoteFound( data, characters, sep, chp,
1105                                       make_functor( *this, &Parser9x::parseFootnote, data ));
1106     }
1107 }
1108 
emitBookmark(U32 globalCP)1109 void Parser9x::emitBookmark( U32 globalCP )
1110 {
1111     bool ok = false;
1112     BookmarkData data( m_bookmarks->bookmark( globalCP, ok ) );
1113 
1114     //TODO: handle bookmarks marking a text range between paragraphs in this
1115     //special case
1116 
1117     //there might be more bookmarks for the current CP
1118     while (ok) {
1119         if ((data.limCP - data.startCP) > 0) {
1120             wvlog << "WARNING: bookmarks marking a text range between paragraphs not supported!";
1121         } else {
1122             m_textHandler->bookmarkStart( data );
1123         }
1124         data = m_bookmarks->bookmark( globalCP, ok );
1125 
1126 #ifdef WV2_DEBUG_BOOKMARK
1127         wvlog << "Bookmark found: CP=" << globalCP << endl;
1128 #endif
1129     }
1130 }
1131 
emitAnnotation(UString characters,U32 globalCP,SharedPtr<const Word97::CHP> chp,U32)1132 void Parser9x::emitAnnotation( UString characters, U32 globalCP, SharedPtr<const Word97::CHP> chp, U32 /* length */ )
1133 {
1134     if ( !m_annotations ) {
1135         wvlog << "Bug: Found an annotation, but m_annotations == 0!" << endl;
1136         return;
1137     }
1138 
1139     bool ok;
1140     AnnotationData data( m_annotations->annotation( globalCP, ok ) );
1141     if ( ok ) {
1142         m_textHandler->annotationFound(characters, chp,
1143                                        make_functor( *this, &Parser9x::parseAnnotation, data ));
1144     }
1145 }
1146 
emitHeaderData(SharedPtr<const Word97::SEP> sep)1147 void Parser9x::emitHeaderData( SharedPtr<const Word97::SEP> sep )
1148 {
1149     // We don't care about non-existent headers
1150     if ( !m_headers ) {
1151         return;
1152     }
1153     // NOTE: MS Word stores headers in a very strange way, so we have to keep
1154     // track of the section numbers.  We use a 0-based index for convenience
1155     // inside the header reading code. (Werner)
1156     //
1157     // Of course the file format has changed between Word 6/7 and Word 8, so I
1158     // had to add a workaround... oh well.
1159     HeaderData data( m_sectionNumber++ );
1160 
1161     if ( m_fib.nFib < Word8nFib ) {
1162         data.headerMask = sep->grpfIhdt;
1163         m_headers->set_headerMask( sep->grpfIhdt );
1164     }
1165     else {
1166         //check if an even header/footer is expected
1167         if ( dop().fFacingPages ) {
1168             data.headerMask |= HeaderData::HeaderEven | HeaderData::FooterEven;
1169         }
1170         //check if a first page header/footer is expected
1171         if ( sep->fTitlePage ) {
1172             data.headerMask |= HeaderData::HeaderFirst | HeaderData::FooterFirst;
1173         }
1174     }
1175     m_textHandler->headersFound( make_functor( *this, &Parser9x::parseHeaders, data ) );
1176 }
1177 
emitPictureData(const U32 globalCP,SharedPtr<const Word97::CHP> chp,const bool isBulletPicture)1178 QString Parser9x::emitPictureData( const U32 globalCP, SharedPtr<const Word97::CHP> chp , const bool isBulletPicture)
1179 {
1180     //NOTE: No need for the globalCP param at the moment.
1181 
1182 #ifdef WV2_DEBUG_PICTURES
1183     wvlog << "fcPic: " << chp->fcPic_fcObj_lTagObj;
1184     wvlog << "fObj:" << chp->fObj;
1185     wvlog << "fOle2:" << chp->fOle2;
1186 #endif
1187     QString ret;
1188 
1189     if (chp->fOle2) {
1190         wvlog << "Embedded OLE2 objects not supported." << endl;
1191         return ret;
1192     }
1193 
1194     OLEStreamReader* stream( m_fib.nFib < Word8nFib ? m_wordDocument : m_data );
1195     if ( !stream || static_cast<unsigned int>( chp->fcPic_fcObj_lTagObj ) >= stream->size() ) {
1196         wvlog << "Error: Severe problems when trying to read an image. Skipping." << endl;
1197         return ret;
1198     }
1199     stream->push();
1200     stream->seek( chp->fcPic_fcObj_lTagObj, WV2_SEEK_SET );
1201 
1202     Word97::PICF* picf( 0 );
1203     if ( m_fib.nFib < Word8nFib ) {
1204         picf = new Word97::PICF( Word95::toWord97( Word95::PICF( stream, false ) ) );
1205     } else {
1206         picf = new Word97::PICF( stream, false );
1207     }
1208     stream->pop();
1209 
1210     //[MS-DOC] — v20101219, 419/621
1211     if ( picf->cbHeader != 0x44 ) {
1212         wvlog << "Error: Expected size of the PICF structure is 0x44, got " << hex << picf->cbHeader;
1213         wvlog << "Skipping the image!" << endl;
1214         delete picf;
1215         return ret;
1216     }
1217 
1218     if ( picf->fError ) {
1219         wvlog << "Information: Skipping the image, fError is set" << endl;
1220         delete picf;
1221         return ret;
1222     }
1223 
1224 #ifdef WV2_DEBUG_PICTURES
1225     picf->dump();
1226 #endif
1227 
1228     // Offset into the Data stream for the GraphicsHandler, position of the
1229     // OfficeArtInlineSpContainer to parse with libmso.
1230     int offset = chp->fcPic_fcObj_lTagObj + picf->cbHeader;
1231 
1232     // Read cchPicName and stPicName in case of a shape file, MS-DOC p.422/609.
1233     if ( picf->mfp.mm == 0x0066 )
1234     {
1235         U8 cchPicName = stream->readU8();
1236 #ifdef WV2_DEBUG_PICTURES
1237         wvlog << "cchPicName: " << cchPicName << endl;
1238 #endif
1239         if (cchPicName) {
1240             U8* stPicName = new U8[cchPicName + 1];
1241             stream->read(stPicName, cchPicName);
1242             stPicName[cchPicName] = '\0';
1243 #ifdef WV2_DEBUG_PICTURES
1244             wvlog << "stPicName: " << stPicName << endl;
1245 #endif
1246             delete [] stPicName;
1247         }
1248         offset += cchPicName + 1;
1249     }
1250 
1251     SharedPtr<const Word97::PICF> sharedPicf( picf );
1252     PictureData data( offset, sharedPicf );
1253 
1254     if (isBulletPicture) {
1255         ret = m_graphicsHandler->handleInlineObject(data, isBulletPicture);
1256     } else {
1257         m_textHandler->msodrawObjectFound(globalCP, &data);
1258     }
1259     return ret;
1260 }
1261 
parseHeader(const HeaderData & data,unsigned char mask)1262 void Parser9x::parseHeader( const HeaderData& data, unsigned char mask )
1263 {
1264 #ifdef WV2_DEBUG_HEADERS
1265     wvlog << "parsing one header for section " << data.sectionNumber << ": mask=0x"
1266             <<  hex << static_cast<int>( mask ) << dec << endl;
1267 #endif
1268 
1269     // First we have to determine the CP start/lim for the header text. From what I
1270     // found out Word 8 does it that way:
1271     //    - At the begin of the plcfhdd there are always 6 "0 fields" (stoppers)
1272     //    - The number of headers modulo 6 is always 0
1273     // Word 6 does it completely different, of course :-}
1274     std::pair<U32, U32> range( m_headers->findHeader( data.sectionNumber, mask ) );
1275 
1276     int length = range.second - range.first;
1277 #ifdef WV2_DEBUG_HEADERS
1278     wvlog << "found a range: start=" << range.first << " lim=" << range.second << endl
1279             << "length: " << length << endl;
1280 #endif
1281     if ( length < 1 ) {
1282 #ifdef WV2_DEBUG_HEADERS
1283         wvlog << "Warning: Didn't find valid CPs for this header/footer -- ignoring it" << endl;
1284 #endif
1285 //         m_subDocumentHandler->headerStart( static_cast<HeaderData::Type>( mask ) );
1286 //         SharedPtr<const ParagraphProperties> sharedProps( new ParagraphProperties );
1287 //         m_textHandler->paragraphStart( sharedProps );
1288 //         m_textHandler->paragraphEnd();
1289 //         m_subDocumentHandler->headerEnd();
1290         return;
1291     }
1292     else if ( length > 1 ) {
1293         // get rid of the trailing "end of header/footer" character
1294         --length;
1295     }
1296 
1297     saveState( length, Header );
1298 
1299     m_subDocumentHandler->headerStart( static_cast<HeaderData::Type>( mask ) );
1300     parseHelper( Position( m_fib.ccpText + m_fib.ccpFtn + range.first, m_plcfpcd ) );
1301     m_subDocumentHandler->headerEnd();
1302 
1303     restoreState();
1304 }
1305 
saveState(U32 newRemainingChars,SubDocument newSubDocument,ParsingMode newParsingMode)1306 void Parser9x::saveState( U32 newRemainingChars, SubDocument newSubDocument, ParsingMode newParsingMode )
1307 {
1308     oldParsingStates.push( ParsingState( m_tableRowStart, m_tableRowLength, m_cellMarkFound, m_remainingCells,
1309                                          m_table_skimming, m_currentParagraph, m_remainingChars, m_sectionNumber,
1310                                          m_subDocument, m_parsingMode ) );
1311     m_tableRowStart = 0;
1312     m_cellMarkFound = false;
1313     m_table_skimming = false;
1314     m_currentParagraph = new Paragraph;
1315     m_remainingChars = newRemainingChars;
1316     m_subDocument = newSubDocument;
1317     m_parsingMode = newParsingMode;
1318 
1319     // save current positions in OLEStreams
1320     m_wordDocument->push();
1321     if ( m_data ) {
1322         m_data->push();
1323     }
1324     if ( m_table ) {
1325         m_table->push();
1326     }
1327 }
1328 
restoreState()1329 void Parser9x::restoreState()
1330 {
1331     if ( oldParsingStates.empty() ) {
1332         wvlog << "Bug: You messed up the save/restore stack! The stack is empty" << endl;
1333         return;
1334     }
1335 
1336     // restore positions in OLEStreams
1337     m_wordDocument->pop();
1338     if ( m_data ) {
1339         m_data->pop();
1340     }
1341     if ( m_table ) {
1342         m_table->pop();
1343     }
1344 
1345     ParsingState ps( oldParsingStates.top() );
1346     oldParsingStates.pop();
1347 
1348     if ( m_tableRowStart ) {
1349         wvlog << "Bug: We still have to process the table row." << endl;
1350     }
1351     // Should be a no-op, but I hate mem-leaks even for buggy code ;-)
1352     delete m_tableRowStart;
1353 
1354     m_tableRowStart = ps.tableRowStart;
1355     m_tableRowLength = ps.tableRowLength;
1356     m_cellMarkFound = ps.cellMarkFound;
1357     m_remainingCells = ps.remainingCells;
1358     m_table_skimming = ps.tableSkimming;
1359 
1360     if ( !m_currentParagraph->empty() ) {
1361         wvlog << "Bug: The current paragraph isn't empty." << endl;
1362     }
1363     delete m_currentParagraph;
1364     m_currentParagraph = ps.paragraph;
1365 
1366     if ( m_remainingChars != 0 ) {
1367         wvlog << "Bug: Still got " << m_remainingChars << " remaining chars." << endl;
1368     }
1369     m_remainingChars = ps.remainingChars;
1370     m_sectionNumber = ps.sectionNumber;
1371 
1372     m_subDocument = ps.subDocument;
1373     m_parsingMode = ps.parsingMode;
1374 }
1375 
toLocalCP(U32 globalCP) const1376 U32 Parser9x::toLocalCP( U32 globalCP ) const
1377 {
1378     if ( globalCP < m_fib.ccpText )
1379         return globalCP;
1380     globalCP -= m_fib.ccpText;
1381 
1382     if ( globalCP < m_fib.ccpFtn )
1383         return globalCP;
1384     globalCP -= m_fib.ccpFtn;
1385 
1386     if ( globalCP < m_fib.ccpHdd )
1387         return globalCP;
1388     globalCP -= m_fib.ccpHdd;
1389 
1390     if ( globalCP < m_fib.ccpMcr )
1391         return globalCP;
1392     globalCP -= m_fib.ccpMcr;
1393 
1394     if ( globalCP < m_fib.ccpAtn )
1395         return globalCP;
1396     globalCP -= m_fib.ccpAtn;
1397 
1398     if ( globalCP < m_fib.ccpEdn )
1399         return globalCP;
1400     globalCP -= m_fib.ccpEdn;
1401 
1402     if ( globalCP < m_fib.ccpTxbx )
1403         return globalCP;
1404     globalCP -= m_fib.ccpTxbx;
1405 
1406     if ( globalCP < m_fib.ccpHdrTxbx )
1407         return globalCP;
1408     globalCP -= m_fib.ccpHdrTxbx;
1409 
1410     wvlog << "Warning: You aimed " << globalCP << " characters past the end of the text!" << endl;
1411     return globalCP;
1412 }
1413 
accumulativeLength(int len,const Parser9x::Chunk & chunk)1414 int Parser9x::accumulativeLength( int len, const Parser9x::Chunk& chunk )
1415 {
1416     return len + chunk.m_text.length();
1417 }
1418