1 /* This file is part of the wvWare 2 project
2 Copyright (C) 2001-2003 Werner Trobin <trobin@kde.org>
3 Copyright (C) 2010, 2011 Matus Uzak <matus.uzak@ixonos.com>
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the Library GNU General Public
7 version 2 of the License, or (at your option) version 3 or,
8 at the discretion of KDE e.V (which shall act as a proxy as in
9 section 14 of the GPLv3), any later version..
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20 */
21
22 #include "parser9x.h"
23 #include "properties97.h"
24 #include "styles.h"
25 #include "word97_helper.h"
26 #include "lists.h"
27 #include "handlers.h"
28 #include "footnotes97.h"
29 #include "annotations.h"
30 #include "bookmark.h"
31 #include "headers.h"
32 #include "fonts.h"
33 #include "textconverter.h"
34 #include "olestream.h"
35 #include "fields.h"
36 #include "graphics.h"
37 #include "associatedstrings.h"
38 #include "paragraphproperties.h"
39 #include "functor.h"
40 #include "functordata.h"
41 #include "word95_generated.h"
42 #include "convert.h"
43 #include "wvlog.h"
44
45 #include <numeric>
46 #include <string.h>
47
48 using namespace wvWare;
49
Position(U32 cp,const PLCF<Word97::PCD> * plcfpcd)50 Parser9x::Position::Position( U32 cp, const PLCF<Word97::PCD>* plcfpcd ) :
51 piece( 0 ), offset( cp )
52 {
53 PLCFIterator<Word97::PCD> it( *plcfpcd );
54 for ( ; it.current(); ++it, ++piece ) {
55 if ( it.currentLim() > cp && it.currentStart() <= cp )
56 break;
57 offset -= it.currentRun();
58 }
59 }
60
61
Parser9x(OLEStorage * storage,OLEStreamReader * wordDocument,const Word97::FIB & fib)62 Parser9x::Parser9x( OLEStorage* storage, OLEStreamReader* wordDocument, const Word97::FIB& fib ) :
63 Parser( storage, wordDocument ), m_fib( fib ), m_table( 0 ), m_data( 0 ),
64 m_properties( 0 ), m_headers( 0 ), m_lists( 0 ), m_textconverter( 0 ), m_fields( 0 ),
65 m_footnotes( 0 ), m_annotations( 0 ), m_fonts( 0 ), m_drawings( 0 ), m_bookmarks(0),
66 m_plcfpcd( 0 ), m_tableRowStart( 0 ), m_tableRowLength( 0 ), m_cellMarkFound( false ),
67 m_remainingCells( 0 ), m_table_skimming( 0 ),
68 m_currentParagraph( new Paragraph ), m_remainingChars( 0 ),
69 m_sectionNumber( 0 ), m_subDocument( None ), m_parsingMode( Default )
70 {
71 if ( !isOk() )
72 return;
73
74 m_table = storage->createStreamReader( tableStream() );
75 if ( !m_table || !m_table->isValid() ) {
76 wvlog << "Error: Couldn't open the table stream (i.e. [0|1]Table or WordDocument)" << endl;
77 m_okay = false;
78 return;
79 }
80
81 m_data = storage->createStreamReader( "Data" );
82 if ( !m_data || !m_data->isValid() ) {
83 wvlog << "Information: Couldn't open the Data stream, no big deal" << endl;
84 delete m_data;
85 m_data = 0;
86 }
87
88 //validate FIB, keep going even if it's not perfect
89 m_fib.valid();
90
91 #ifdef WV2_DUMP_FIB
92 wvlog << "Dumping some parts of the FIB: " << endl;
93 wvlog << " wIdent=" << m_fib.wIdent << endl;
94 wvlog << " nFib=0x" << hex << m_fib.nFib << dec << endl;
95 wvlog << " nFibBack=" << m_fib.nFibBack << endl;
96 wvlog << " lid=0x" << hex << m_fib.lid << dec << endl;
97 wvlog << " lidFE=0x" << hex << m_fib.lidFE << dec << endl;
98 wvlog << " fEncrypted=" << m_fib.fEncrypted << endl;
99 wvlog << " chs=" << m_fib.chs << endl;
100 wvlog << " fcMin=" << m_fib.fcMin << endl;
101 wvlog << " fcMac=" << m_fib.fcMac << endl;
102 wvlog << " ccpText=" << m_fib.ccpText << endl;
103 wvlog << " ccpFtn=" << m_fib.ccpFtn << endl;
104 wvlog << " ccpHdd=" << m_fib.ccpHdd << endl;
105 wvlog << " ccpMcr=" << m_fib.ccpMcr << endl;
106 wvlog << " ccpAtn=" << m_fib.ccpAtn << endl;
107 wvlog << " ccpEdn=" << m_fib.ccpEdn << endl;
108 wvlog << " ccpTxbx=" << m_fib.ccpTxbx << endl;
109 wvlog << " ccpHdrTxbx=" << m_fib.ccpHdrTxbx << endl;
110 wvlog << " pnFbpChpFirst=" << m_fib.pnFbpChpFirst << endl;
111 wvlog << " pnChpFirst=" << m_fib.pnChpFirst << endl;
112 wvlog << " cpnBteChp=" << m_fib.cpnBteChp << endl;
113 wvlog << " pnFbpPapFirst=" << m_fib.pnFbpPapFirst << endl;
114 wvlog << " pnPapFirst=" << m_fib.pnPapFirst << endl;
115 wvlog << " cpnBtePap=" << m_fib.cpnBtePap << endl;
116 wvlog << " fcPlcfandRef=" << m_fib.fcPlcfandRef << endl;
117 wvlog << " lcbPlcfandRef=" << m_fib.lcbPlcfandRef << endl;
118 wvlog << " cswNew=" << hex << m_fib.cswNew << dec << endl;
119 #endif
120 // Initialize all the cached data structures like stylesheets, fonts,
121 // textconverter,...
122 init();
123 }
124
~Parser9x()125 Parser9x::~Parser9x()
126 {
127 // Sanity check
128 if ( !oldParsingStates.empty() || m_subDocument != None ) {
129 wvlog << "Bug: Someone messed up the save/restore stack!" << endl;
130 }
131
132 delete m_currentParagraph;
133 delete m_tableRowStart;
134 delete m_drawings;
135 delete m_fonts;
136 delete m_plcfpcd;
137 delete m_headers;
138 delete m_footnotes;
139 delete m_bookmarks;
140 delete m_annotations;
141 delete m_fields;
142 delete m_textconverter;
143 delete m_properties;
144 delete m_lists;
145 delete m_data;
146 delete m_table;
147 }
148
parse()149 bool Parser9x::parse()
150 {
151 if ( !isOk() )
152 return false;
153
154 if ( m_fib.fEncrypted ) {
155 // There is some code out there to break this "encryption", do we want
156 // to implement that?
157 // We could either ask for a password or cheat a bit :-)
158 wvlog << "Error: The document is encrypted." << endl;
159 return false;
160 }
161
162 if ( m_fib.lcbClx == 0 )
163 fakePieceTable();
164 else {
165 // Get the piece table
166 if ( !readPieceTable() )
167 return false;
168 }
169
170 //provide the headers mask to m_subDocumentHandler
171 if (m_headers) {
172 m_subDocumentHandler->headersMask(m_headers->headersMask());
173 }
174 // start parsing the body
175 if ( !parseBody() )
176 return false;
177 return true;
178 }
179
fib() const180 const Word97::FIB& Parser9x::fib() const
181 {
182 return m_fib;
183 }
184
dop() const185 const Word97::DOP& Parser9x::dop() const
186 {
187 return m_properties->dop();
188 }
189
font(S16 ftc) const190 const Word97::FFN& Parser9x::font( S16 ftc ) const
191 {
192 return m_fonts->font( ftc );
193 }
194
associatedStrings()195 AssociatedStrings Parser9x::associatedStrings()
196 {
197 return AssociatedStrings( m_fib.fcSttbfAssoc, m_fib.lcbSttbfAssoc,
198 m_fib.fFarEast ? m_fib.lidFE : m_fib.lid, m_table );
199 }
200
styleSheet() const201 const StyleSheet& Parser9x::styleSheet() const
202 {
203 return m_properties->styleSheet();
204 }
205
getDrawings() const206 const Drawings* Parser9x::getDrawings() const
207 {
208 return m_drawings;
209 }
210
getTable()211 OLEStreamReader* Parser9x::getTable()
212 {
213 return m_table;
214 }
215
parseHeaders(const HeaderData & data)216 void Parser9x::parseHeaders( const HeaderData& data )
217 {
218 m_subDocumentHandler->headersStart();
219 for ( unsigned char mask = HeaderData::HeaderEven;
220 mask <= HeaderData::FooterFirst; mask <<= 1 )
221 {
222 if ( mask & data.headerMask ) {
223 parseHeader( data, mask );
224 }
225 }
226 m_subDocumentHandler->headersEnd();
227 }
228
parseFootnote(const FootnoteData & data)229 void Parser9x::parseFootnote( const FootnoteData& data )
230 {
231 #ifdef WV2_DEBUG_FOOTNOTES
232 wvlog << "Parser9x::parseFootnote() #####################" << endl;
233 #endif
234 // shouldn't happen, but well...
235 if ( data.limCP - data.startCP == 0 ) {
236 return;
237 }
238
239 saveState( data.limCP - data.startCP, data.type == FootnoteData::Footnote ? Footnote : Endnote );
240 m_subDocumentHandler->footnoteStart();
241
242 U32 offset = m_fib.ccpText + data.startCP;
243 if ( data.type == FootnoteData::Endnote ) {
244 offset += m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpMcr + m_fib.ccpAtn;
245 }
246 parseHelper( Position( offset, m_plcfpcd ) );
247
248 m_subDocumentHandler->footnoteEnd();
249 restoreState();
250 #ifdef WV2_DEBUG_FOOTNOTES
251 wvlog << "Parser9x::parseFootnote() done ################" << endl;
252 #endif
253 }
254
parseAnnotation(const AnnotationData & data)255 void Parser9x::parseAnnotation( const AnnotationData& data )
256 {
257 #ifdef WV2_DEBUG_ANNOTATIONS
258 wvlog << "Parser9x::parseAnnotation() #####################" << endl;
259 #endif
260 // shouldn't happen, but well...
261 if ( data.limCP - data.startCP == 0 ) {
262 return;
263 }
264
265 saveState( data.limCP - data.startCP, Annotation );
266 m_subDocumentHandler->annotationStart();
267
268 U32 offset = m_fib.ccpText + m_fib.ccpFtn + m_fib.ccpHdd + data.startCP;
269 parseHelper( Position( offset, m_plcfpcd ) );
270
271 m_subDocumentHandler->annotationEnd();
272 restoreState();
273 #ifdef WV2_DEBUG_ANNOTATIONS
274 wvlog << "Parser9x::parseAnnotation() done ################" << endl;
275 #endif
276 }
277
parseTableRow(const TableRowData & data)278 void Parser9x::parseTableRow( const TableRowData& data )
279 {
280 #ifdef WV2_DEBUG_TABLES
281 wvlog << "Parser9x::parseTableRow(): startPiece=" << data.startPiece <<
282 " startOffset=" << data.startOffset << " length=" << data.length << endl;
283 #endif
284
285 if ( data.length == 0 ) {
286 return;
287 }
288
289 saveState( data.length, static_cast<SubDocument>( data.subDocument ), Table );
290 m_remainingCells = data.tap->itcMac;
291 m_tableHandler->tableRowStart( data.tap );
292 m_tableHandler->tableCellStart();
293
294 parseHelper( Position( data.startPiece, data.startOffset ) );
295
296 m_tableHandler->tableRowEnd();
297 restoreState();
298
299 #ifdef WV2_DEBUG_TABLES
300 wvlog << "Parser9x::parseTableRow() done #####################" << endl;
301 #endif
302 }
303
parseTextBox(unsigned int index,bool stylesxml)304 void Parser9x::parseTextBox(unsigned int index, bool stylesxml)
305 {
306 const PLCF<Word97::FTXBXS>* plcftxbxTxt = 0;
307 if (stylesxml) {
308 plcftxbxTxt = m_drawings->getHdrTxbxTxt();
309 } else {
310 plcftxbxTxt = m_drawings->getTxbxTxt();
311 }
312 if (!plcftxbxTxt) {
313 wvlog << "plcftxbxTxt MISSING!";
314 return;
315 }
316 //NOTE: text ranges for each FTXBXS structure are separated by 0x0D
317 //characters that MUST be the last character in each range.
318
319 PLCFIterator<Word97::FTXBXS> it( plcftxbxTxt->at( index ) );
320
321 //TODO: Do we need to save the state here?
322 saveState( it.currentRun() - 1, TextBox );
323 U32 offset = m_fib.ccpText + it.currentStart();
324 offset += m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpAtn + m_fib.ccpEdn;
325 parseHelper( Position( offset, m_plcfpcd ) );
326 restoreState();
327 }
328
tableStream() const329 std::string Parser9x::tableStream() const
330 {
331 if ( m_fib.nFib < Word8nFib )
332 return "WordDocument"; // Word 6 or Word 7 (==95)
333 else
334 return m_fib.fWhichTblStm ? "1Table" : "0Table"; // Word 8 (==97) or newer
335 }
336
init()337 void Parser9x::init()
338 {
339 if ( m_fib.fFarEast )
340 m_textconverter = new TextConverter( m_fib.lidFE );
341 else
342 m_textconverter = new TextConverter( m_fib.lid );
343
344 // Get hold of all the SEP/PAP/CHP related structures and the StyleSheet
345 m_properties = new Properties97( m_wordDocument, m_table, m_fib );
346
347 if ( m_fib.nFib < Word8nFib ) // Word67
348 m_lists = new ListInfoProvider( &styleSheet() );
349 else
350 m_lists = new ListInfoProvider( m_table, m_fib, &m_properties->styleSheet() );
351
352 m_fonts = new FontCollection( m_table, m_fib );
353 m_fields = new Fields( m_table, m_fib );
354 m_drawings = new Drawings( m_table, m_fib );
355
356 if (( m_fib.ccpFtn != 0 ) || ( m_fib.ccpEdn != 0 ))
357 m_footnotes = new Footnotes97( m_table, m_fib );
358
359 if (( m_fib.lcbPlcfbkf != 0 ) || ( m_fib.lcbPlcfbkl != 0 ))
360 m_bookmarks = new Bookmarks( m_table, m_fib );
361
362 if ( m_fib.ccpAtn != 0 ) {
363 m_annotations = new Annotations( m_table, m_fib );
364 }
365 }
366
readPieceTable()367 bool Parser9x::readPieceTable()
368 {
369 m_table->seek( m_fib.fcClx );
370 // first skip the leading grpprl blocks, we'll re-read them
371 // if we need them later (no caching here)
372 U8 blockType = m_table->readU8();
373 while ( blockType == wvWare::clxtGrpprl ) {
374 U16 size = m_table->readU16();
375 #if WV2_DUMP_PIECE_TABLE > 0
376 wvlog << "Found a clxtGrpprl (size=" << size << ")" << endl;
377 #endif
378 m_table->seek( size, WV2_SEEK_CUR );
379 blockType = m_table->readU8();
380 }
381 if ( blockType == wvWare::clxtPlcfpcd ) {
382 U32 size = m_table->readU32();
383 #if WV2_DUMP_PIECE_TABLE > 0
384 wvlog << "Found the clxtPlcfpcd (size=" << size << ")" << endl;
385 #endif
386 m_plcfpcd = new PLCF<Word97::PCD>( size, m_table, false );
387
388 #if WV2_DUMP_PIECE_TABLE > 1
389 PLCFIterator<Word97::PCD> it( *m_plcfpcd );
390 for ( int i = 0; it.current(); ++it, ++i ) {
391 wvlog << "Piece Table Entry(" << i << "): " << endl;
392 wvlog << " start: " << it.currentStart() << endl;
393 wvlog << " lim: " << it.currentLim() << endl;
394 wvlog << " complex: " << it.current()->prm.fComplex << endl;
395 if ( it.current()->prm.fComplex )
396 wvlog << " igrpprl: " << it.current()->prm.toPRM2().igrpprl << endl;
397 else
398 wvlog << " isprm: " << it.current()->prm.isprm << endl;
399
400 U32 fc = it.current()->fc;
401 U32 limit = it.currentRun() << 1;
402 wvlog << " value: " << fc << endl;
403 if ( fc & 0x40000000 ) {
404 fc = ( fc & 0xbfffffff ) >> 1;
405 limit >>= 1;
406 wvlog << " value (cleared 2nd MSB, div. by 2): " << fc << endl;
407 }
408 m_wordDocument->seek( fc );
409 wvlog << " position: " << m_wordDocument->tell() << ", limit: " << limit << endl;
410 for ( unsigned int j = 0; j < limit; ++j ) {
411 U8 foo = m_wordDocument->readU8();
412 if ( foo > 31 )
413 wvlog << static_cast<char>( foo );
414 else if ( foo == PARAGRAPH_MARK )
415 wvlog << endl;
416 else if ( foo > 0 )
417 wvlog << "{" << static_cast<int>( foo ) << "}";
418 else
419 wvlog << "_";
420 }
421 wvlog << endl << " position: " << m_wordDocument->tell() << ", limit: " << limit << endl;
422 }
423 #endif
424 }
425 else {
426 wvlog << "Oooops, couldn't find the piece table." << endl;
427 return false;
428 }
429 return true;
430 }
431
fakePieceTable()432 void Parser9x::fakePieceTable()
433 {
434 U32 fakePlcfPCD[ 4 ];
435 // The first CP is 0 (endianness doesn't matter :-)
436 fakePlcfPCD[ 0 ] = 0;
437 // The second CP corresponds to the length of the document
438 fakePlcfPCD[ 1 ] = toLittleEndian( m_fib.ccpText + m_fib.ccpFtn + m_fib.ccpHdd + m_fib.ccpMcr +
439 m_fib.ccpAtn + m_fib.ccpEdn + m_fib.ccpTxbx + m_fib.ccpHdrTxbx );
440
441 // Now fake a matching PCD
442 U8* tmp( reinterpret_cast<U8*>( &fakePlcfPCD[0] ) );
443 tmp += 8;
444 *tmp++ = 0; // first the bitfields (unused)
445 *tmp++ = 0;
446 U32 fcMin = m_fib.fcMin << 1;
447 fcMin |= 0x40000000;
448 *tmp++ = static_cast<U8>( fcMin & 0x000000ff );
449 *tmp++ = static_cast<U8>( ( fcMin & 0x0000ff00 ) >> 8 ); // then store the
450 *tmp++ = static_cast<U8>( ( fcMin & 0x00ff0000 ) >> 16 ); // fc in little
451 *tmp++ = static_cast<U8>( ( fcMin & 0xff000000 ) >> 24 ); // endian style
452 *tmp++ = 0; // then an empty PRM
453 *tmp++ = 0;
454
455 tmp = reinterpret_cast<U8*>( &fakePlcfPCD[0] );
456 m_plcfpcd = new PLCF<Word97::PCD>( 16, tmp );
457 }
458
parseBody()459 bool Parser9x::parseBody()
460 {
461 saveState( m_fib.ccpText, Main );
462 m_subDocumentHandler->bodyStart();
463
464 SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( 0 ) );
465 if ( !sep ) {
466 sep = new Word97::SEP(); // don't pass 0 pointers in any case
467 }
468 #ifdef WV2_DEBUG_SECTIONS
469 sep->dump();
470 #endif
471
472 m_textHandler->sectionStart( sep ); // First section, starting at CP 0
473
474 emitHeaderData( sep );
475 sep = 0; // get rid of the huge SEP
476
477 // Process all the pieces belonging to the main document text
478 parseHelper( Position( 0, static_cast<U32>( 0 ) ) );
479
480 // Implicit end of the section
481 m_textHandler->sectionEnd();
482 m_subDocumentHandler->bodyEnd();
483 restoreState();
484 return true;
485 }
486
parseHelper(Position startPos)487 void Parser9x::parseHelper( Position startPos )
488 {
489 PLCFIterator<Word97::PCD> it( m_plcfpcd->at( startPos.piece ) );
490
491 while ( m_remainingChars > 0 && it.current() ) {
492 U32 fc = it.current()->fc; // Start FC of this piece
493 bool unicode;
494 realFC( fc, unicode );
495
496 U32 limit = it.currentRun(); // Number of characters in this piece
497
498 // Check whether the text starts somewhere within the piece, reset at
499 // the end of the loop body.
500 if ( startPos.offset != 0 ) {
501 fc += unicode ? startPos.offset * 2 : startPos.offset;
502 limit -= startPos.offset;
503 }
504
505 limit = limit > m_remainingChars ? m_remainingChars : limit;
506 m_wordDocument->seek( fc );
507
508 if ( unicode ) {
509 XCHAR* string = new XCHAR[ limit ];
510 // First read the whole piece
511 for ( unsigned int j = 0; j < limit; ++j ) {
512 string[ j ] = m_wordDocument->readU16();
513 if ( ( string[ j ] & 0xff00 ) == 0xf000 ) {
514 // Microsoft uses a Private Unicode Area (PUA) to store the characters of the
515 // Symbol and the Wingdings font. We simply clear these bits to shift the
516 // characters to 0x00XX and hope the correct font is installed. If the font
517 // isn't there, the user will get some ASCII text instead of symbols :}
518 //wvlog << "private unicode area detected -- cropping" << endl;
519 string[ j ] &= 0x00ff;
520 }
521 }
522 // also takes care to delete [] string
523 processPiece<XCHAR>( string, fc, limit, startPos );
524 }
525 else {
526 U8* string = new U8[ limit ];
527 m_wordDocument->read( string, limit );
528 // also takes care to delete [] string
529 processPiece<U8>( string, fc, limit, startPos );
530 }
531 m_remainingChars -= limit;
532 ++it;
533 ++startPos.piece;
534 startPos.offset = 0; // just in case it was != 0 in the first iteration
535 }
536 }
537
538 template<typename String>
processPiece(String * string,U32 fc,U32 limit,const Position & position)539 void Parser9x::processPiece( String* string, U32 fc, U32 limit, const Position& position )
540 {
541 // Take a closer look at the piece we just read. "start" and "index" are
542 // counted in character positions (take care!)
543 unsigned int start = 0;
544 unsigned int index = 0;
545 while ( index < limit ) {
546 switch( string[ index ] ) {
547 case SECTION_MARK:
548 {
549 if ( !m_currentParagraph->empty() || start != index ) {
550 // No "index - start + 1" here, as we don't want to copy the section mark!
551 UString ustring( processPieceStringHelper( string, start, index ) );
552 m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
553 fc + start * sizeof( String ),
554 sizeof( String ) == sizeof( XCHAR ) ) );
555
556 processParagraph( fc + index * sizeof( String ) );
557 }
558 start = ++index;
559
560 SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( m_fib.ccpText - m_remainingChars + index ) );
561 if ( sep ) {
562
563 //Check if table skimming was active lately. If yes, then this
564 //is the SECTION_MARK which follows a table.
565 if (m_table_skimming) {
566 m_table_skimming = false;
567 wvlog << "A table was identified lately: informing the texthandler.";
568 m_textHandler->tableEndFound();
569 }
570 // It's not only a page break, it's a new section
571 m_textHandler->sectionEnd();
572 m_textHandler->sectionStart( sep );
573 emitHeaderData( sep );
574 }
575 else {
576 //manual page break
577 m_textHandler->pageBreak();
578 }
579 break;
580 }
581 // same ASCII code as TTP_MARK (0x0007), NOTE: table depth == 1
582 case CELL_MARK:
583 m_cellMarkFound = true;
584 // Fall-through intended. A row/cell end is also a paragraph end.
585 case PARAGRAPH_MARK:
586 {
587 // No "index - start + 1" here, as we don't want to copy the
588 // paragraph mark!
589 UString ustring( processPieceStringHelper( string, start, index ) );
590 m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
591 fc + start * sizeof( String ),
592 sizeof( String ) == sizeof( XCHAR ) ) );
593 processParagraph( fc + index * sizeof( String ) );
594 m_cellMarkFound = false;
595 start = ++index;
596
597 //signal progress
598 if (m_subDocument == Main && m_parsingMode == Default) {
599 int value = m_fib.ccpText - m_remainingChars + index;
600 int percentage = (int)((value / (float) m_fib.ccpText) * 100);
601 m_subDocumentHandler->setProgress( percentage );
602 }
603
604 break;
605 }
606 // "Special" characters
607 case TAB:
608 string[ index ] = m_inlineHandler->tab();
609 ++index;
610 break;
611 case HARD_LINE_BREAK:
612 string[ index ] = m_inlineHandler->hardLineBreak();
613 ++index;
614 break;
615 case COLUMN_BREAK:
616 string[ index ] = m_inlineHandler->columnBreak();
617 ++index;
618 break;
619 case NON_BREAKING_HYPHEN:
620 string[ index ] = m_inlineHandler->nonBreakingHyphen();
621 ++index;
622 break;
623 case NON_REQUIRED_HYPHEN:
624 string[ index ] = m_inlineHandler->nonRequiredHyphen();
625 ++index;
626 break;
627 case NON_BREAKING_SPACE:
628 string[ index ] = m_inlineHandler->nonBreakingSpace();
629 ++index;
630 break;
631 default:
632 ++index;
633 break;
634 }
635 } //while
636 if ( start < limit ) {
637 // Finally we have to add the remaining text to the current paragraph
638 // (if there is any)
639 UString ustring( processPieceStringHelper( string, start, limit ) );
640 m_currentParagraph->push_back( Chunk( ustring, Position( position.piece, position.offset + start ),
641 fc + start * sizeof( String ), sizeof( String ) == sizeof( XCHAR ) ) );
642 }
643 delete [] string;
644 }
645
processPieceStringHelper(XCHAR * string,unsigned int start,unsigned int index) const646 UString Parser9x::processPieceStringHelper( XCHAR* string, unsigned int start, unsigned int index ) const
647 {
648 return UString( reinterpret_cast<const wvWare::UChar *>( &string[ start ] ), index - start );
649 }
650
processPieceStringHelper(U8 * string,unsigned int start,unsigned int index) const651 UString Parser9x::processPieceStringHelper( U8* string, unsigned int start, unsigned int index ) const
652 {
653 return m_textconverter->convert( reinterpret_cast<char*>( &string[ start ] ), index - start );
654 }
655
processParagraph(U32 fc)656 void Parser9x::processParagraph( U32 fc )
657 {
658 // Get the PAP structure as it was at the last full-save
659 ParagraphProperties* props( m_properties->fullSavedPap( fc, m_data ) );
660 // ...and apply the latest changes, then the PAP is completely restored
661 m_properties->applyClxGrpprl( m_plcfpcd->at( m_currentParagraph->back().m_position.piece ).current(),
662 m_fib.fcClx, props );
663
664 // Skim the tables first, as soon as the functor is invoked we have to
665 // parse them and emit the text
666 if ( m_parsingMode == Default && props->pap().fInTable ) {
667
668 //TODO: We could be already skimming a separate table, check TAP! In
669 //case this is a new table inform the texthandler.
670
671 //TODO: Support for nested tables!
672
673 if ( !m_tableRowStart ) {
674 m_tableRowStart = new Position( m_currentParagraph->front().m_position );
675 m_tableRowLength = 0;
676 m_table_skimming = true;
677
678 #ifdef WV2_DEBUG_TABLES
679 props->pap().dump();
680 wvlog << "Start of a table row: piece=" << m_tableRowStart->piece <<
681 " offset=" << m_tableRowStart->offset << endl;
682
683 #endif
684 }
685 // init == 1 because of the parag. mark!
686 m_tableRowLength += std::accumulate( m_currentParagraph->begin(), m_currentParagraph->end(), 1,
687 &Parser9x::accumulativeLength );
688
689 //check if this is a Table Terminating Paragraph Mark
690 if ( props->pap().fTtp ) {
691 // Restore the table properties of this row
692 Word97::TAP* tap = m_properties->fullSavedTap( fc, m_data );
693 #ifdef WV2_DEBUG_TABLES
694 tap->dump();
695 #endif
696 m_properties->applyClxGrpprl( m_plcfpcd->at( m_currentParagraph->back().m_position.piece ).current(),
697 m_fib.fcClx, tap, m_properties->styleByIndex( props->pap().istd ) );
698
699 SharedPtr<const Word97::TAP> sharedTap( tap );
700
701 // We decrement the length by 1 that the trailing row mark doesn't
702 // emit one empty paragraph during parsing.
703 TableRowData data( m_tableRowStart->piece, m_tableRowStart->offset, m_tableRowLength - 1,
704 static_cast<int>( m_subDocument ), sharedTap );
705
706 m_textHandler->tableRowFound( make_functor( *this, &Parser9x::parseTableRow, data), sharedTap );
707
708 delete m_tableRowStart;
709 m_tableRowStart = 0;
710 }
711 delete props;
712 }
713 else {
714 #ifdef WV2_DEBUG_PARAGRAPHS
715 props->pap().dump();
716 #endif
717
718 //Check if table skimming was active lately. If yes, then this is the
719 //paragraph behind the table (either a PARAGRAPH_MARK or a SECTION_MARK
720 //follows a table)!
721 if (m_table_skimming) {
722 m_table_skimming = false;
723 wvlog << "A table was identified lately: informing the texthandler.";
724 m_textHandler->tableEndFound();
725 }
726
727 // Get the appropriate style for this paragraph.
728 const Style* style = m_properties->styleByIndex( props->pap().istd );
729 if ( !style ) {
730 wvlog << "Warning: Huh, really obscure error, couldn't find the Style for the current PAP -- skipping" << endl;
731 return;
732 }
733
734 // Get the CHP for the paragraph.
735 Word97::CHP* paragraphChp = new Word97::CHP( style->chp() );
736 m_properties->fullSavedChp( fc, paragraphChp, style );
737
738 #ifdef WV2_DEBUG_PARAGRAPHS
739 paragraphChp->dump();
740 #endif
741
742 // Now that we have the complete PAP and CHP, let's see if this
743 // paragraph belongs to a list.
744 props->createListInfo( *m_lists, *paragraphChp );
745
746 #ifdef WV2_DEBUG_LIST_PROCESSING
747 props->pap().dump();
748 #endif
749
750 // Parse the bullet picture data.
751 const Word97::CHP* bulletChp = 0;
752 if (props->listInfo()) {
753 bulletChp = (props->listInfo()->text()).chp;
754 }
755 if (bulletChp && bulletChp->fPicBullet) {
756 bool ok;
757 BookmarkData data( m_bookmarks->bookmark(UString("_PictureBullets"), ok) );
758 if (ok) {
759 Position pos(data.startCP + bulletChp->picBulletCP, m_plcfpcd);
760 PLCFIterator<Word97::PCD> it( m_plcfpcd->at( pos.piece ));
761 U32 fc = it.current()->fc;
762 bool unicode;
763
764 realFC( fc, unicode );
765 fc += unicode ? pos.offset * 2: pos.offset;
766
767 Word97::CHP* bulletPicChp = new Word97::CHP();
768 m_properties->fullSavedChp( fc, bulletPicChp, 0 );
769
770 if (bulletPicChp->fSpec) {
771 m_wordDocument->push();
772 m_wordDocument->seek( fc, WV2_SEEK_SET );
773 U8 c = m_wordDocument->readU8();
774 m_wordDocument->pop();
775
776 if (c == TextHandler::Picture) {
777 SharedPtr<const Word97::CHP> sharedBPChp( bulletPicChp );
778 QString name = emitPictureData( 0, sharedBPChp, true);
779 props->setBulletPictureName(name);
780 } else {
781 wvlog << "BulletPicture: Support for character 0x" << hex << c << "not implement yet.";
782 }
783 } else {
784 wvlog << "BulletPicture: A special character expected, skipping!";
785 }
786 }
787 }
788
789 // keep it that way, else the variables get deleted!
790 SharedPtr<const ParagraphProperties> sharedPap( props );
791 SharedPtr<const Word97::CHP> sharedParagraphChp( paragraphChp );
792
793 m_textHandler->paragraphStart( sharedPap, sharedParagraphChp );
794
795 std::list<Chunk>::const_iterator it = m_currentParagraph->begin();
796 std::list<Chunk>::const_iterator end = m_currentParagraph->end();
797
798 // Now walk the paragraph, chunk for chunk
799 for ( ; it != end; ++it ) {
800 U32 index = 0;
801 const U32 limit = ( *it ).m_text.length();
802 const PLCFIterator<Word97::PCD> pcdIt( m_plcfpcd->at( ( *it ).m_position.piece ) );
803
804 while ( index < limit ) {
805 // A temporary character style initialized to CHP of the
806 // paragraph style. Both CHPX and the built-in character style
807 // referred by the istd are applied on top of it, while
808 // comparing with the current CHP.
809 Style charStyle( style->chp() );
810
811 U32 fc = ( *it ).m_startFC + index * ( ( *it ).m_isUnicode ? 2 : 1 );
812 U32 length = m_properties->fullSavedChp( fc,
813 &(const_cast<Word97::CHP&>(charStyle.chp())),
814 &charStyle );
815 if ( ( *it ).m_isUnicode ) {
816 length >>= 1;
817 }
818 length = length > limit - index ? limit - index : length;
819
820 m_properties->applyClxGrpprl( pcdIt.current(), m_fib.fcClx,
821 &(const_cast<Word97::CHP&>(charStyle.chp())),
822 &charStyle );
823
824 Word97::CHP* chp = new Word97::CHP( charStyle.chp() );
825 // keep it that way, else the CHP gets deleted!
826 SharedPtr<const Word97::CHP> sharedChp( chp );
827 processChunk( *it, sharedChp, length, index, pcdIt.currentStart() );
828 index += length;
829 }
830 // Bookmark check for the next to last CP (paragraph mark).
831 if ( m_bookmarks ) {
832 emitBookmark( ( *it ).m_position.offset + limit );
833 }
834 }
835 m_textHandler->paragraphEnd();
836
837 if ( m_cellMarkFound ) {
838 m_tableHandler->tableCellEnd();
839 if ( --m_remainingCells > 0) {
840 m_tableHandler->tableCellStart();
841 }
842 }
843 }
844 m_currentParagraph->clear();
845 }
846
processChunk(const Chunk & chunk,SharedPtr<const Word97::CHP> chp,U32 length,U32 index,U32 currentStart)847 void Parser9x::processChunk( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
848 U32 length, U32 index, U32 currentStart )
849 {
850 // XXX: does the following hold for Annotations as well? (BSAR)
851
852 // Some characters have a special meaning (e.g. a footnote is anchored at some
853 // position inside the text) and they *don't* have the fSpec flag set. This means
854 // that we have to watch out for such characters even in plain text. Slooow :}
855 //
856 // For now we only have to handle footnote and endnote references that way. Due to that
857 // the code below is a bit simpler right now, but I fear we have to extend that later on.
858 // (We will have to keep track of the type of disruption, footnote() takes care of all now)
859 //
860 // A precondition for the footnote/endnote implementation below is, that footnote and
861 // endnote references only occur in the main body text. The reason is that we only check
862 // for the next footnote inside the PLCF and don't take subdocuments into account. If
863 // it turns out that this precondition is not satisfied we would have to change the
864 // O(1) nextFootnote() call to something like an O(n) containsFootnote( start, lim )
865 // Up to now Word 97, 2000, and 2002 seem to be bug compatible and fullfill that precondition.
866 //
867
868 //only process the chunk if not marked hidden, TODO use text:display="none"
869 if (chp->fVanish == 1) {
870 return;
871 }
872
873 while ( length > 0 ) {
874 U32 startCP = currentStart + chunk.m_position.offset + index;
875 U32 disruption = 0xffffffff; // "infinity"
876 U32 bkmk_length = 0; //num. of CPs enclosed in a bookmark
877
878 if ( m_footnotes ) {
879 if (m_subDocument == Main) {
880 m_footnotes->check(startCP);
881 }
882
883 U32 nextFtn = m_footnotes->nextFootnote();
884 U32 nextEnd = m_footnotes->nextEndnote();
885 disruption = nextFtn < nextEnd ? nextFtn : nextEnd;
886
887 #ifdef WV2_DEBUG_FOOTNOTES
888 wvlog << "nextFtn=" << nextFtn << " nextEnd=" << nextEnd <<
889 " disruption=" << disruption << " length=" << length << endl;
890 #endif
891 } else if ( m_bookmarks ) {
892 if (m_subDocument == Main) {
893 m_bookmarks->check(startCP);
894 }
895
896 U32 nextBkf = m_bookmarks->nextBookmarkStart();
897 U32 nextBkl = m_bookmarks->nextBookmarkEnd();
898
899 bkmk_length = nextBkl - nextBkf;
900 disruption = nextBkf;
901
902 #ifdef WV2_DEBUG_BOOKMARK
903 wvlog << "nextBkf=" << nextBkf << "(0x" << hex << nextBkf << ")" <<dec<<
904 "nextBkl=" << nextBkl << "(0x" << hex << nextBkl << ")" <<dec<<
905 "disruption=" << disruption << "length=" << length << endl;
906 #endif
907 Q_ASSERT (nextBkf <= nextBkl);
908 }
909
910 if ( (disruption >= startCP) && (disruption < (startCP + length)) ) {
911
912 #if defined WV2_DEBUG_FOOTNOTES || defined WV2_DEBUG_BOOKMARK
913 wvlog << "startCP=" << startCP << " disruption=" << disruption <<
914 " bkmk_length=" << bkmk_length << " length=" << length << endl;
915 #endif
916 U32 disLen = disruption - startCP;
917 //there's something to be processed before the bookmark
918 if ( disLen != 0 ) {
919 processRun( chunk, chp, disLen, index, currentStart );
920 length -= disLen;
921 index += disLen;
922 }
923
924 if ( m_footnotes ) {
925 //TODO: support for bookmarks in the number of a footnote
926 m_customFootnote = chunk.m_text.substr(index, length);
927 emitFootnote( m_customFootnote, disruption, chp, length );
928 m_customFootnote = "";
929 length = 0;
930 }
931 else if ( m_bookmarks ) {
932
//TODO: There might be a number of bookmarks to process at the
934 //current CP. The first one gets processed, the rest is
935 //skipped at the moment.
936
937 //TODO: Bookmarks can overlap, handle all bookmarks of a chunk.
938
//TODO: A bookmark can denote text comprised of segments
940 //belonging into different chunks.
941
942 //NOTE: Not checking the ok value, invalid bookmarks were
943 //already reported. So it's obsolete at the moment.
944 bool ok;
945 BookmarkData data( m_bookmarks->bookmark( disruption, ok ) );
946
947 if ( !(bkmk_length <= length) ) {
948 wvlog << "WARNING: bookmarks covering several chunks are not supported yet!";
949 processRun( chunk, chp, length, index, currentStart );
950 length = 0;
951 } else {
952 m_textHandler->bookmarkStart( data );
953 if (bkmk_length > 0) {
954 processRun( chunk, chp, bkmk_length, index, currentStart );
955 m_textHandler->bookmarkEnd( data );
956 length -= bkmk_length;
957 index += bkmk_length;
958 }
959 }
960 }
961 } else {
962 processRun( chunk, chp, length, index, currentStart );
963 break; // should be faster than messing with length...
964 }
965 }
966 }
967
processRun(const Chunk & chunk,SharedPtr<const Word97::CHP> chp,U32 length,U32 index,U32 currentStart)968 void Parser9x::processRun( const Chunk& chunk, SharedPtr<const Word97::CHP> chp,
969 U32 length, U32 index, U32 currentStart )
970 {
971 if ( chp->fSpec ) {
972 U32 i = 0;
973 while ( i < length ) {
974 emitSpecialCharacter( chunk.m_text[ index + i ], currentStart + chunk.m_position.offset + index + i, chp );
975 ++i;
976 }
977 }
978 else {
979 UConstString str( const_cast<UChar*>( chunk.m_text.data() ) + index, length );
980 m_textHandler->runOfText( str.string(), chp );
981 }
982 }
983
emitSpecialCharacter(UChar character,U32 globalCP,SharedPtr<const Word97::CHP> chp)984 void Parser9x::emitSpecialCharacter( UChar character, U32 globalCP, SharedPtr<const Word97::CHP> chp )
985 {
986 switch( character.unicode() ) {
987 // Is it one of the "simple" special characters?
988 case TextHandler::CurrentPageNumber:
989 case TextHandler::LineNumber:
990 case TextHandler::AbbreviatedDate:
991 case TextHandler::TimeHMS:
992 case TextHandler::CurrentSectionNumber:
993 case TextHandler::AbbreviatedDayOfWeek:
994 case TextHandler::DayOfWeek:
995 case TextHandler::DayShort:
996 case TextHandler::HourCurrentTime:
997 case TextHandler::HourCurrentTimeTwoDigits:
998 case TextHandler::MinuteCurrentTime:
999 case TextHandler::MinuteCurrentTimeTwoDigits:
1000 case TextHandler::SecondsCurrentTime:
1001 case TextHandler::AMPMCurrentTime:
1002 case TextHandler::CurrentTimeHMSOld:
1003 case TextHandler::DateM:
1004 case TextHandler::DateShort:
1005 case TextHandler::MonthShort:
1006 case TextHandler::YearLong:
1007 case TextHandler::YearShort:
1008 case TextHandler::AbbreviatedMonth:
1009 case TextHandler::MonthLong:
1010 case TextHandler::CurrentTimeHMS:
1011 case TextHandler::DateLong:
1012 m_textHandler->specialCharacter( static_cast<TextHandler::SpecialCharacter>( character.unicode() ), chp );
1013 break;
1014
1015 // It has to be one of the very special characters...
1016 case TextHandler::Picture:
1017 // PictureData are required to process inline MS-ODRAW objects.
1018 emitPictureData( globalCP, chp );
1019 break;
1020 case TextHandler::DrawnObject:
1021 // Only globalCP is required to process floating MS-ODRAW objects.
1022 m_textHandler->msodrawObjectFound( globalCP, 0 );
1023 break;
1024 case TextHandler::FootnoteAuto:
1025 if ( m_subDocument == Footnote || m_subDocument == Endnote ) {
1026 m_textHandler->footnoteAutoNumber( chp );
1027 } else {
1028 emitFootnote( UString(character), globalCP, chp);
1029 }
1030 break;
1031 case TextHandler::FieldBegin:
1032 {
1033 const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1034 if ( fld ) {
1035 m_textHandler->fieldStart( fld, chp );
1036 } else {
1037 wvlog << "FieldStart: Plcfld does not contain this CP, ignoring!";
1038 }
1039 break;
1040 }
1041 case TextHandler::FieldSeparator:
1042 {
1043 const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1044 if ( fld ) {
1045 m_textHandler->fieldSeparator( fld, chp );
1046 } else {
1047 wvlog << "FieldSeparator: Plcfld does not contain this CP, ignoring!";
1048 }
1049 break;
1050 }
1051 case TextHandler::FieldEnd:
1052 {
1053 const FLD* fld( m_fields->fldForCP( m_subDocument, toLocalCP( globalCP ) ) );
1054 if ( fld ) {
1055 m_textHandler->fieldEnd( fld, chp );
1056 } else {
1057 wvlog << "FieldEnd: Plcfld does not contain this CP, ignoring!";
1058 }
1059 break;
1060 }
1061 case TextHandler::AnnotationRef:
1062 {
1063 //comment reference characters are only in the Main Document
1064 if (m_subDocument == Main) {
1065 emitAnnotation( UString(character), globalCP, chp );
1066 }
1067 }
1068 case TextHandler::FieldEscapeChar:
1069 wvlog << "Found an escape character ++++++++++++++++++++?" << endl;
1070 break;
1071 case TextHandler::Symbol:
1072 {
1073 //NOTE: MS Word 2k/2k3/2k7 ignores chp->ftcSym (font for the symbol).
1074 m_textHandler->runOfText(UString(reinterpret_cast<const wvWare::UChar*>(&chp->xchSym), 1), chp);
1075 break;
1076 }
1077 default:
1078 wvlog << "Parser9x::processSpecialCharacter(): Support for character " << character.unicode()
1079 << " not implemented yet." << endl;
1080 break;
1081 }
1082 }
1083
emitFootnote(UString characters,U32 globalCP,SharedPtr<const Word97::CHP> chp,U32)1084 void Parser9x::emitFootnote( UString characters, U32 globalCP,
1085 SharedPtr<const Word97::CHP> chp,
1086 U32 /* length */ )
1087 {
1088 if ( !m_footnotes ) {
1089 wvlog << "Bug: Found a footnote, but m_footnotes == 0!" << endl;
1090 return;
1091 }
1092 #ifdef WV2_DEBUG_FOOTNOTES
1093 wvlog << "######### Footnote found: CP=" << globalCP << endl;
1094 #endif
1095 bool ok;
1096 FootnoteData data( m_footnotes->footnote( globalCP, ok ) );
1097 if ( ok ) {
1098 #ifdef WV2_DEBUG_FOOTNOTES
1099 wvlog << "char: 0x" << hex << characters[0].unicode() <<
1100 "| fAuto:" << data.autoNumbered <<
1101 "| fSpec:" << chp->fSpec;
1102 #endif
1103 SharedPtr<const Word97::SEP> sep( m_properties->sepForCP( globalCP ) );
1104 m_textHandler->footnoteFound( data, characters, sep, chp,
1105 make_functor( *this, &Parser9x::parseFootnote, data ));
1106 }
1107 }
1108
emitBookmark(U32 globalCP)1109 void Parser9x::emitBookmark( U32 globalCP )
1110 {
1111 bool ok = false;
1112 BookmarkData data( m_bookmarks->bookmark( globalCP, ok ) );
1113
1114 //TODO: handle bookmarks marking a text range between paragraphs in this
1115 //special case
1116
1117 //there might be more bookmarks for the current CP
1118 while (ok) {
1119 if ((data.limCP - data.startCP) > 0) {
1120 wvlog << "WARNING: bookmarks marking a text range between paragraphs not supported!";
1121 } else {
1122 m_textHandler->bookmarkStart( data );
1123 }
1124 data = m_bookmarks->bookmark( globalCP, ok );
1125
1126 #ifdef WV2_DEBUG_BOOKMARK
1127 wvlog << "Bookmark found: CP=" << globalCP << endl;
1128 #endif
1129 }
1130 }
1131
emitAnnotation(UString characters,U32 globalCP,SharedPtr<const Word97::CHP> chp,U32)1132 void Parser9x::emitAnnotation( UString characters, U32 globalCP, SharedPtr<const Word97::CHP> chp, U32 /* length */ )
1133 {
1134 if ( !m_annotations ) {
1135 wvlog << "Bug: Found an annotation, but m_annotations == 0!" << endl;
1136 return;
1137 }
1138
1139 bool ok;
1140 AnnotationData data( m_annotations->annotation( globalCP, ok ) );
1141 if ( ok ) {
1142 m_textHandler->annotationFound(characters, chp,
1143 make_functor( *this, &Parser9x::parseAnnotation, data ));
1144 }
1145 }
1146
emitHeaderData(SharedPtr<const Word97::SEP> sep)1147 void Parser9x::emitHeaderData( SharedPtr<const Word97::SEP> sep )
1148 {
1149 // We don't care about non-existent headers
1150 if ( !m_headers ) {
1151 return;
1152 }
1153 // NOTE: MS Word stores headers in a very strange way, so we have to keep
1154 // track of the section numbers. We use a 0-based index for convenience
1155 // inside the header reading code. (Werner)
1156 //
1157 // Of course the file format has changed between Word 6/7 and Word 8, so I
1158 // had to add a workaround... oh well.
1159 HeaderData data( m_sectionNumber++ );
1160
1161 if ( m_fib.nFib < Word8nFib ) {
1162 data.headerMask = sep->grpfIhdt;
1163 m_headers->set_headerMask( sep->grpfIhdt );
1164 }
1165 else {
1166 //check if an even header/footer is expected
1167 if ( dop().fFacingPages ) {
1168 data.headerMask |= HeaderData::HeaderEven | HeaderData::FooterEven;
1169 }
1170 //check if a first page header/footer is expected
1171 if ( sep->fTitlePage ) {
1172 data.headerMask |= HeaderData::HeaderFirst | HeaderData::FooterFirst;
1173 }
1174 }
1175 m_textHandler->headersFound( make_functor( *this, &Parser9x::parseHeaders, data ) );
1176 }
1177
emitPictureData(const U32 globalCP,SharedPtr<const Word97::CHP> chp,const bool isBulletPicture)1178 QString Parser9x::emitPictureData( const U32 globalCP, SharedPtr<const Word97::CHP> chp , const bool isBulletPicture)
1179 {
1180 //NOTE: No need for the globalCP param at the moment.
1181
1182 #ifdef WV2_DEBUG_PICTURES
1183 wvlog << "fcPic: " << chp->fcPic_fcObj_lTagObj;
1184 wvlog << "fObj:" << chp->fObj;
1185 wvlog << "fOle2:" << chp->fOle2;
1186 #endif
1187 QString ret;
1188
1189 if (chp->fOle2) {
1190 wvlog << "Embedded OLE2 objects not supported." << endl;
1191 return ret;
1192 }
1193
1194 OLEStreamReader* stream( m_fib.nFib < Word8nFib ? m_wordDocument : m_data );
1195 if ( !stream || static_cast<unsigned int>( chp->fcPic_fcObj_lTagObj ) >= stream->size() ) {
1196 wvlog << "Error: Severe problems when trying to read an image. Skipping." << endl;
1197 return ret;
1198 }
1199 stream->push();
1200 stream->seek( chp->fcPic_fcObj_lTagObj, WV2_SEEK_SET );
1201
1202 Word97::PICF* picf( 0 );
1203 if ( m_fib.nFib < Word8nFib ) {
1204 picf = new Word97::PICF( Word95::toWord97( Word95::PICF( stream, false ) ) );
1205 } else {
1206 picf = new Word97::PICF( stream, false );
1207 }
1208 stream->pop();
1209
1210 //[MS-DOC] — v20101219, 419/621
1211 if ( picf->cbHeader != 0x44 ) {
1212 wvlog << "Error: Expected size of the PICF structure is 0x44, got " << hex << picf->cbHeader;
1213 wvlog << "Skipping the image!" << endl;
1214 delete picf;
1215 return ret;
1216 }
1217
1218 if ( picf->fError ) {
1219 wvlog << "Information: Skipping the image, fError is set" << endl;
1220 delete picf;
1221 return ret;
1222 }
1223
1224 #ifdef WV2_DEBUG_PICTURES
1225 picf->dump();
1226 #endif
1227
1228 // Offset into the Data stream for the GraphicsHandler, position of the
1229 // OfficeArtInlineSpContainer to parse with libmso.
1230 int offset = chp->fcPic_fcObj_lTagObj + picf->cbHeader;
1231
1232 // Read cchPicName and stPicName in case of a shape file, MS-DOC p.422/609.
1233 if ( picf->mfp.mm == 0x0066 )
1234 {
1235 U8 cchPicName = stream->readU8();
1236 #ifdef WV2_DEBUG_PICTURES
1237 wvlog << "cchPicName: " << cchPicName << endl;
1238 #endif
1239 if (cchPicName) {
1240 U8* stPicName = new U8[cchPicName + 1];
1241 stream->read(stPicName, cchPicName);
1242 stPicName[cchPicName] = '\0';
1243 #ifdef WV2_DEBUG_PICTURES
1244 wvlog << "stPicName: " << stPicName << endl;
1245 #endif
1246 delete [] stPicName;
1247 }
1248 offset += cchPicName + 1;
1249 }
1250
1251 SharedPtr<const Word97::PICF> sharedPicf( picf );
1252 PictureData data( offset, sharedPicf );
1253
1254 if (isBulletPicture) {
1255 ret = m_graphicsHandler->handleInlineObject(data, isBulletPicture);
1256 } else {
1257 m_textHandler->msodrawObjectFound(globalCP, &data);
1258 }
1259 return ret;
1260 }
1261
parseHeader(const HeaderData & data,unsigned char mask)1262 void Parser9x::parseHeader( const HeaderData& data, unsigned char mask )
1263 {
1264 #ifdef WV2_DEBUG_HEADERS
1265 wvlog << "parsing one header for section " << data.sectionNumber << ": mask=0x"
1266 << hex << static_cast<int>( mask ) << dec << endl;
1267 #endif
1268
1269 // First we have to determine the CP start/lim for the header text. From what I
1270 // found out Word 8 does it that way:
1271 // - At the begin of the plcfhdd there are always 6 "0 fields" (stoppers)
1272 // - The number of headers modulo 6 is always 0
1273 // Word 6 does it completely different, of course :-}
1274 std::pair<U32, U32> range( m_headers->findHeader( data.sectionNumber, mask ) );
1275
1276 int length = range.second - range.first;
1277 #ifdef WV2_DEBUG_HEADERS
1278 wvlog << "found a range: start=" << range.first << " lim=" << range.second << endl
1279 << "length: " << length << endl;
1280 #endif
1281 if ( length < 1 ) {
1282 #ifdef WV2_DEBUG_HEADERS
1283 wvlog << "Warning: Didn't find valid CPs for this header/footer -- ignoring it" << endl;
1284 #endif
1285 // m_subDocumentHandler->headerStart( static_cast<HeaderData::Type>( mask ) );
1286 // SharedPtr<const ParagraphProperties> sharedProps( new ParagraphProperties );
1287 // m_textHandler->paragraphStart( sharedProps );
1288 // m_textHandler->paragraphEnd();
1289 // m_subDocumentHandler->headerEnd();
1290 return;
1291 }
1292 else if ( length > 1 ) {
1293 // get rid of the trailing "end of header/footer" character
1294 --length;
1295 }
1296
1297 saveState( length, Header );
1298
1299 m_subDocumentHandler->headerStart( static_cast<HeaderData::Type>( mask ) );
1300 parseHelper( Position( m_fib.ccpText + m_fib.ccpFtn + range.first, m_plcfpcd ) );
1301 m_subDocumentHandler->headerEnd();
1302
1303 restoreState();
1304 }
1305
saveState(U32 newRemainingChars,SubDocument newSubDocument,ParsingMode newParsingMode)1306 void Parser9x::saveState( U32 newRemainingChars, SubDocument newSubDocument, ParsingMode newParsingMode )
1307 {
1308 oldParsingStates.push( ParsingState( m_tableRowStart, m_tableRowLength, m_cellMarkFound, m_remainingCells,
1309 m_table_skimming, m_currentParagraph, m_remainingChars, m_sectionNumber,
1310 m_subDocument, m_parsingMode ) );
1311 m_tableRowStart = 0;
1312 m_cellMarkFound = false;
1313 m_table_skimming = false;
1314 m_currentParagraph = new Paragraph;
1315 m_remainingChars = newRemainingChars;
1316 m_subDocument = newSubDocument;
1317 m_parsingMode = newParsingMode;
1318
1319 // save current positions in OLEStreams
1320 m_wordDocument->push();
1321 if ( m_data ) {
1322 m_data->push();
1323 }
1324 if ( m_table ) {
1325 m_table->push();
1326 }
1327 }
1328
restoreState()1329 void Parser9x::restoreState()
1330 {
1331 if ( oldParsingStates.empty() ) {
1332 wvlog << "Bug: You messed up the save/restore stack! The stack is empty" << endl;
1333 return;
1334 }
1335
1336 // restore positions in OLEStreams
1337 m_wordDocument->pop();
1338 if ( m_data ) {
1339 m_data->pop();
1340 }
1341 if ( m_table ) {
1342 m_table->pop();
1343 }
1344
1345 ParsingState ps( oldParsingStates.top() );
1346 oldParsingStates.pop();
1347
1348 if ( m_tableRowStart ) {
1349 wvlog << "Bug: We still have to process the table row." << endl;
1350 }
1351 // Should be a no-op, but I hate mem-leaks even for buggy code ;-)
1352 delete m_tableRowStart;
1353
1354 m_tableRowStart = ps.tableRowStart;
1355 m_tableRowLength = ps.tableRowLength;
1356 m_cellMarkFound = ps.cellMarkFound;
1357 m_remainingCells = ps.remainingCells;
1358 m_table_skimming = ps.tableSkimming;
1359
1360 if ( !m_currentParagraph->empty() ) {
1361 wvlog << "Bug: The current paragraph isn't empty." << endl;
1362 }
1363 delete m_currentParagraph;
1364 m_currentParagraph = ps.paragraph;
1365
1366 if ( m_remainingChars != 0 ) {
1367 wvlog << "Bug: Still got " << m_remainingChars << " remaining chars." << endl;
1368 }
1369 m_remainingChars = ps.remainingChars;
1370 m_sectionNumber = ps.sectionNumber;
1371
1372 m_subDocument = ps.subDocument;
1373 m_parsingMode = ps.parsingMode;
1374 }
1375
toLocalCP(U32 globalCP) const1376 U32 Parser9x::toLocalCP( U32 globalCP ) const
1377 {
1378 if ( globalCP < m_fib.ccpText )
1379 return globalCP;
1380 globalCP -= m_fib.ccpText;
1381
1382 if ( globalCP < m_fib.ccpFtn )
1383 return globalCP;
1384 globalCP -= m_fib.ccpFtn;
1385
1386 if ( globalCP < m_fib.ccpHdd )
1387 return globalCP;
1388 globalCP -= m_fib.ccpHdd;
1389
1390 if ( globalCP < m_fib.ccpMcr )
1391 return globalCP;
1392 globalCP -= m_fib.ccpMcr;
1393
1394 if ( globalCP < m_fib.ccpAtn )
1395 return globalCP;
1396 globalCP -= m_fib.ccpAtn;
1397
1398 if ( globalCP < m_fib.ccpEdn )
1399 return globalCP;
1400 globalCP -= m_fib.ccpEdn;
1401
1402 if ( globalCP < m_fib.ccpTxbx )
1403 return globalCP;
1404 globalCP -= m_fib.ccpTxbx;
1405
1406 if ( globalCP < m_fib.ccpHdrTxbx )
1407 return globalCP;
1408 globalCP -= m_fib.ccpHdrTxbx;
1409
1410 wvlog << "Warning: You aimed " << globalCP << " characters past the end of the text!" << endl;
1411 return globalCP;
1412 }
1413
accumulativeLength(int len,const Parser9x::Chunk & chunk)1414 int Parser9x::accumulativeLength( int len, const Parser9x::Chunk& chunk )
1415 {
1416 return len + chunk.m_text.length();
1417 }
1418