1 /*
2 For general Scribus (>=1.3.2) copyright and licensing information please refer
3 to the COPYING file provided with the program. Following this notice may exist
4 a copyright and/or license notice that predates the release of Scribus 1.3.2
5 for which a new license (GPL+exception) is in place.
6 */
7 /***************************************************************************
8 * Copyright (C) 2004 by Riku Leino *
9 * tsoots@gmail.com *
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 * This program is distributed in the hope that it will be useful, *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
19 * GNU General Public License for more details. *
20 * *
21 * You should have received a copy of the GNU General Public License *
22 * along with this program; if not, write to the *
23 * Free Software Foundation, Inc., *
24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
25 ***************************************************************************/
26
27 #include <QObject>
28 #include <QByteArray>
29 #include "htmlreader.h"
30
31 #include "scribusstructs.h"
32 #include "gtmeasure.h"
33
// Reader instance the static SAX callbacks forward to. Set to `this` by the
// constructor and reset to nullptr by the destructor.
HTMLReader* HTMLReader::hreader = nullptr;
// Set by the start-tag callback and cleared by the end-tag callback;
// characters() uses it to drop line breaks that directly follow a tag.
bool HTMLReader::elemJustStarted = false;
// Set by the end-tag callback and cleared by the start-tag callback;
// used by characters() the same way as elemJustStarted.
bool HTMLReader::elemJustFinished = false;

// SAX handler table passed to htmlSAXParseFile() in parse(); defined later in this file.
extern htmlSAXHandlerPtr mySAXHandler;
39
HTMLReader(gtParagraphStyle * ps,gtWriter * w,bool textOnly)40 HTMLReader::HTMLReader(gtParagraphStyle *ps, gtWriter *w, bool textOnly)
41 {
42 pstyle = ps;
43 defaultColor = ps->getFont()->getColor();
44 defaultWeight = ps->getFont()->getWeight();
45 defaultSlant = ps->getFont()->getSlant();
46 initPStyles();
47
48 writer = w;
49 noFormatting = textOnly;
50 hreader = this;
51 }
52
initPStyles()53 void HTMLReader::initPStyles()
54 {
55 pstylec = new gtParagraphStyle(*pstyle);
56 pstylec->setAlignment(CENTER);
57 pstylec->setName("HTML_center");
58 gtParagraphStyle* pstyleli = new gtParagraphStyle(*pstyle);
59 pstyleli->setIndent(pstyleli->getIndent()+50.0);
60 pstyleli->setName("HTML_li_level-0");
61 listStyles.push_back(pstyleli);
62 nextItemNumbers.push_back(1);
63 pstyleh6 = new gtParagraphStyle(*pstyle);
64 pstyleh6->getFont()->setSize(pstyle->getFont()->getSize() + 2.5);
65 pstyleh6->getFont()->setWeight(BOLD);
66 pstyleh6->setSpaceAbove(2.5);
67 pstyleh6->setSpaceBelow(1.25);
68 pstyleh6->setName("HTML_h6");
69 pstyleh5 = new gtParagraphStyle(*pstyle);
70 pstyleh5->getFont()->setSize(pstyle->getFont()->getSize() + 5);
71 pstyleh5->getFont()->setWeight(BOLD);
72 pstyleh5->setSpaceAbove(5.0);
73 pstyleh5->setSpaceBelow(2.5);
74 pstyleh5->setName("HTML_h5");
75 pstyleh4 = new gtParagraphStyle(*pstyle);
76 pstyleh4->getFont()->setSize(pstyle->getFont()->getSize() + 10);
77 pstyleh4->getFont()->setWeight(BOLD);
78 pstyleh4->setSpaceAbove(10.0);
79 pstyleh4->setSpaceBelow(5.0);
80 pstyleh4->setName("HTML_h4");
81 pstyleh3 = new gtParagraphStyle(*pstyle);
82 pstyleh3->getFont()->setSize(pstyle->getFont()->getSize() + 20);
83 pstyleh3->getFont()->setWeight(BOLD);
84 pstyleh3->setSpaceAbove(20.0);
85 pstyleh3->setSpaceBelow(10.0);
86 pstyleh3->setName("HTML_h3");
87 pstyleh2 = new gtParagraphStyle(*pstyle);
88 pstyleh2->getFont()->setSize(pstyle->getFont()->getSize() + 40);
89 pstyleh2->getFont()->setWeight(BOLD);
90 pstyleh2->setSpaceAbove(30.0);
91 pstyleh2->setSpaceBelow(20.0);
92 pstyleh2->setName("HTML_h2");
93 pstyleh1 = new gtParagraphStyle(*pstyle);
94 pstyleh1->getFont()->setSize(pstyle->getFont()->getSize() + 60);
95 pstyleh1->getFont()->setWeight(BOLD);
96 pstyleh1->setSpaceAbove(40.0);
97 pstyleh1->setSpaceBelow(30.0);
98 pstyleh1->setName("HTML_h1");
99 pstylecode = new gtParagraphStyle(*pstyle);
100 pstylecode->getFont()->setName("Courier Regular");
101 pstylecode->setName("HTML_code");
102 pstylep = new gtParagraphStyle(*pstyle);
103 pstylep->setSpaceBelow(gtMeasure::i2d(5, SC_MM));
104 pstylep->setName("HTML_p");
105 pstylepre = new gtParagraphStyle(*pstyle);
106 pstylepre->setName("HTML_pre");
107 }
108
startElement(void *,const xmlChar * fullname,const xmlChar ** atts)109 void HTMLReader::startElement(void*, const xmlChar * fullname, const xmlChar ** atts)
110 {
111 elemJustStarted = true;
112 elemJustFinished = false;
113 QString name(QString((const char*) fullname).toLower());
114 HTMLAttributesMap attrs;
115 if (atts)
116 {
117 for (const xmlChar** cur = atts; cur && *cur; cur += 2)
118 {
119 QString attrName((char*)*cur);
120 QString attrValue((char*)*(cur + 1));
121 attrs[attrName] = attrValue;
122 }
123 }
124 hreader->startElement(name, attrs);
125 }
126
startElement(const QString & name,const HTMLAttributesMap & attrs)127 bool HTMLReader::startElement(const QString &name, const HTMLAttributesMap &attrs)
128 {
129 if (name == "p")
130 inP = true;
131 else if (name == "center")
132 inCenter = true;
133 else if (name == "br")
134 writer->append("\n", pstyle);
135 else if (name == "a")
136 {
137 toggleEffect(UNDERLINE);
138 setBlueFont();
139 QString hRefVal = attrs.value("href");
140 if (!hRefVal.isEmpty())
141 href = hRefVal;
142 inA = true;
143 }
144 else if (name == "ul")
145 {
146 ++listLevel;
147 if (static_cast<int>(listStyles.size()) < (listLevel + 1))
148 createListStyle();
149 inUL = true;
150 if (inOL)
151 {
152 inOL = false;
153 wasInOL = true;
154 }
155 }
156 else if (name == "ol")
157 {
158 ++listLevel;
159 if (static_cast<int>(listStyles.size()) < (listLevel + 1))
160 createListStyle();
161 inOL = true;
162 if (inUL)
163 {
164 inUL = false;
165 wasInUL = true;
166 }
167 }
168 else if (name == "li")
169 inLI = true;
170 else if (name == "h1")
171 inH1 = true;
172 else if (name == "h2")
173 inH2 = true;
174 else if (name == "h3")
175 inH3 = true;
176 else if (name == "h4")
177 inH4 = true;
178 else if (name == "h5")
179 inH5 = true;
180 else if (name == "h6")
181 inH6 = true;
182 else if ((name == "b") || (name == "strong"))
183 setBoldFont();
184 else if ((name == "i") || (name == "em"))
185 setItalicFont();
186 else if (name == "code")
187 inCode = true;
188 else if (name == "body")
189 inBody = true;
190 else if (name == "pre")
191 inPre = true;
192 else if (name == "img")
193 {
194 QString imgline("(img,");
195 QString srcValue = attrs.value("src");
196 if (!srcValue.isEmpty())
197 {
198 QString attrValue = srcValue;
199 if (attrValue.indexOf("data:image") < 0)
200 imgline += " src: " + attrValue;
201 else
202 {
203 // TODO: correctly embed the image (just putting the source in the
204 // text frame crashes scribus for big images; ale/20120808)
205 imgline += " src: embedded image";
206 }
207 }
208 QString altValue = attrs.value("alt");
209 if (!altValue.isEmpty())
210 imgline += ", alt: " + altValue;
211 imgline += ")\n\n";
212 writer->append(imgline, pstyle);
213 }
214 else if (name == "sub")
215 toggleEffect(SUBSCRIPT);
216 else if (name == "sup")
217 toggleEffect(SUPERSCRIPT);
218 else if (name == "del")
219 toggleEffect(STRIKETHROUGH);
220 else if ((name == "ins" || name == "u") && (!inA))
221 toggleEffect(UNDERLINE);
222 return true;
223 }
characters(void *,const xmlChar * ch,int len)224 void HTMLReader::characters(void*, const xmlChar * ch, int len)
225 {
226 QString chars = QString::fromUtf8((const char*) ch, len);
227 hreader->characters(chars);
228 }
229
characters(const QString & ch)230 bool HTMLReader::characters(const QString &ch)
231 {
232 if (inBody)
233 {
234 QString tmp = ch;
235 // FIXME : According to html spec, new lines placed just after or just before an element
236 // must be ignored, not exactly that, but better than nothing
237 if (elemJustStarted || elemJustFinished)
238 {
239 while (!tmp.isEmpty() && (tmp[0] == '\r' || tmp[0] == '\n'))
240 tmp = tmp.right(tmp.length() - 1);
241 elemJustStarted = elemJustFinished = false;
242 if (tmp.isEmpty())
243 return true;
244 }
245 QString chl(tmp.at(0));
246 QString chr(tmp.right(1));
247 bool fcis = (chl.length() > 0 && chl[0].isSpace());
248 bool lcis = (chr.length() > 0 && chr[0].isSpace());
249 if (inPre)
250 {
251 if (tmp.at(0) == "\n")
252 tmp = tmp.right(tmp.length() - 2);
253 }
254 else
255 tmp = tmp.simplified();
256
257 if (tmp.isEmpty())
258 return true;
259
260 if (!lastCharWasSpace)
261 if (fcis)
262 tmp = " " + tmp;
263
264 if (lcis && !(fcis && tmp.length() <= 1))
265 tmp = tmp + " ";
266 lastCharWasSpace = lcis;
267 if ((inLI) && (!addedLI))
268 {
269 if (inUL)
270 tmp = "- " + tmp;
271 else if (inOL)
272 {
273 tmp = QString("%1. ").arg(nextItemNumbers[listLevel]) + tmp;
274 ++nextItemNumbers[listLevel];
275 }
276 addedLI = true;
277 }
278
279 if (noFormatting)
280 writer->appendUnstyled(tmp);
281 else if (inP)
282 writer->append(tmp, pstylep);
283 else if (inLI)
284 {
285 writer->append(tmp, listStyles[listLevel]);
286 }
287 else if (inH1)
288 writer->append(tmp, pstyleh1);
289 else if (inH2)
290 writer->append(tmp, pstyleh2);
291 else if (inH3)
292 writer->append(tmp, pstyleh3);
293 else if (inH4)
294 writer->append(tmp, pstyleh4);
295 else if (inH5)
296 writer->append(tmp, pstyleh5);
297 else if (inH6)
298 writer->append(tmp, pstyleh6);
299 else if (inCenter)
300 writer->append(tmp, pstylec);
301 else if (inCode)
302 writer->append(tmp, pstylecode);
303 else if (inPre)
304 writer->append(tmp, pstylepre);
305 else
306 writer->append(tmp, pstyle);
307 }
308 return true;
309 }
310
endElement(void *,const xmlChar * name)311 void HTMLReader::endElement(void*, const xmlChar * name)
312 {
313 elemJustStarted = false;
314 elemJustFinished = true;
315 QString nname(QString((const char*) name).toLower());
316 hreader->endElement(nname);
317 }
318
endElement(const QString & name)319 bool HTMLReader::endElement(const QString &name)
320 {
321 if (name == "center")
322 {
323 inCenter = false;
324 lastCharWasSpace = true;
325 writer->append("\n");
326 }
327 else if (name == "p")
328 {
329 inP = false;
330 lastCharWasSpace = true;
331 writer->append("\n");
332 }
333 else if (name == "a")
334 {
335 toggleEffect(UNDERLINE);
336 if ((!href.isEmpty()) && ((href.indexOf("//") != -1) ||
337 (href.indexOf("mailto:") != -1) || (href.indexOf("www") != -1)))
338 {
339 href = href.remove("mailto:");
340 writer->append(QString(" [%1]").arg(extIndex), pstyle);
341 extLinks += QString("[%1] ").arg(extIndex) + href + "\n";
342 ++extIndex;
343 }
344 href = "";
345 setDefaultColor();
346 inA = false;
347 }
348 else if (name == "ul")
349 {
350 if (listLevel == 0)
351 {
352 inUL = false;
353 inOL = false;
354 wasInUL = false;
355 wasInOL = false;
356 listLevel = -1;
357 }
358 else if (wasInOL)
359 {
360 inUL = false;
361 inOL = true;
362 wasInOL = false;
363 --listLevel;
364 }
365 else if (wasInUL)
366 {
367 inUL = true;
368 inOL = false;
369 wasInUL = false;
370 --listLevel;
371 }
372 else
373 --listLevel;
374 if (listLevel == -1)
375 {
376 lastCharWasSpace = true;
377 writer->append("\n");
378 }
379 }
380 else if (name == "ol")
381 {
382 if (listLevel == 0)
383 {
384 inUL = false;
385 inOL = false;
386 wasInUL = false;
387 wasInOL = false;
388 listLevel = -1;
389 }
390 else if (wasInUL)
391 {
392 inOL = false;
393 inUL = true;
394 wasInUL = false;
395 nextItemNumbers[listLevel] = 1;
396 --listLevel;
397 }
398 else if (wasInOL)
399 {
400 inOL = true;
401 inUL = false;
402 wasInOL = false;
403 nextItemNumbers[listLevel] = 1;
404 --listLevel;
405 }
406 else
407 {
408 nextItemNumbers[listLevel] = 1;
409 --listLevel;
410 }
411 if (listLevel == -1)
412 {
413 lastCharWasSpace = true;
414 writer->append("\n");
415 }
416 }
417 else if (name == "li")
418 {
419 inLI = false;
420 addedLI = false;
421 lastCharWasSpace = true;
422 writer->append("\n");
423 }
424 else if (name == "h1")
425 {
426 inH1 = false;
427 lastCharWasSpace = true;
428 writer->append("\n", pstyleh1);
429 }
430 else if (name == "h2")
431 {
432 inH2 = false;
433 lastCharWasSpace = true;
434 writer->append("\n", pstyleh2);
435 }
436 else if (name == "h3")
437 {
438 inH3 = false;
439 lastCharWasSpace = true;
440 writer->append("\n", pstyleh3);
441 }
442 else if (name == "h4")
443 {
444 inH4 = false;
445 lastCharWasSpace = true;
446 writer->append("\n", pstyleh4);
447 }
448 else if (name == "h5")
449 {
450 inH5 = false;
451 lastCharWasSpace = true;
452 writer->append("\n", pstyleh5);
453 }
454 else if (name == "h6")
455 {
456 inH6 = false;
457 lastCharWasSpace = true;
458 writer->append("\n", pstyleh6);
459 }
460 else if ((name == "b") || (name == "strong"))
461 unSetBoldFont();
462 else if ((name == "i") || (name == "em"))
463 unsetItalicFont();
464 else if (name == "code")
465 inCode = false;
466 else if (name == "body")
467 inBody = false;
468 else if (name == "pre")
469 {
470 inPre = false;
471 lastCharWasSpace = true;
472 writer->append("\n");
473 }
474 else if (name == "div")
475 writer->append("\n");
476 else if (name == "sub")
477 toggleEffect(SUBSCRIPT);
478 else if (name == "sup")
479 toggleEffect(SUPERSCRIPT);
480 else if (name == "del")
481 toggleEffect(STRIKETHROUGH);
482 else if ((name == "ins" || name == "u") && (!inA))
483 toggleEffect(UNDERLINE);
484 return true;
485 }
486
toggleEffect(FontEffect e)487 void HTMLReader::toggleEffect(FontEffect e)
488 {
489 pstyle->getFont()->toggleEffect(e);
490 pstylec->getFont()->toggleEffect(e);
491 for (uint i = 0; i < listStyles.size(); ++i)
492 listStyles[i]->getFont()->toggleEffect(e);
493 pstyleh1->getFont()->toggleEffect(e);
494 pstyleh2->getFont()->toggleEffect(e);
495 pstyleh3->getFont()->toggleEffect(e);
496 pstyleh4->getFont()->toggleEffect(e);
497 pstyleh5->getFont()->toggleEffect(e);
498 pstyleh6->getFont()->toggleEffect(e);
499 pstylecode->getFont()->toggleEffect(e);
500 pstylep->getFont()->toggleEffect(e);
501 pstylepre->getFont()->toggleEffect(e);
502 }
503
setItalicFont()504 void HTMLReader::setItalicFont()
505 {
506 pstyle->getFont()->setSlant(ITALIC);
507 pstylec->getFont()->setSlant(ITALIC);
508 for (uint i = 0; i < listStyles.size(); ++i)
509 listStyles[i]->getFont()->setSlant(ITALIC);
510 pstyleh1->getFont()->setSlant(ITALIC);
511 pstyleh2->getFont()->setSlant(ITALIC);
512 pstyleh3->getFont()->setSlant(ITALIC);
513 pstyleh4->getFont()->setSlant(ITALIC);
514 pstyleh5->getFont()->setSlant(ITALIC);
515 pstyleh6->getFont()->setSlant(ITALIC);
516 pstylecode->getFont()->setSlant(ITALIC);
517 pstylep->getFont()->setSlant(ITALIC);
518 pstylepre->getFont()->setSlant(ITALIC);
519 }
520
unsetItalicFont()521 void HTMLReader::unsetItalicFont()
522 {
523 pstyle->getFont()->setSlant(defaultSlant);
524 pstylec->getFont()->setSlant(defaultSlant);
525 for (uint i = 0; i < listStyles.size(); ++i)
526 listStyles[i]->getFont()->setSlant(defaultSlant);
527 pstyleh1->getFont()->setSlant(defaultSlant);
528 pstyleh2->getFont()->setSlant(defaultSlant);
529 pstyleh3->getFont()->setSlant(defaultSlant);
530 pstyleh4->getFont()->setSlant(defaultSlant);
531 pstyleh5->getFont()->setSlant(defaultSlant);
532 pstyleh6->getFont()->setSlant(defaultSlant);
533 pstylecode->getFont()->setSlant(defaultSlant);
534 pstylep->getFont()->setSlant(defaultSlant);
535 pstylepre->getFont()->setSlant(defaultSlant);
536 }
537
setBlueFont()538 void HTMLReader::setBlueFont()
539 {
540 pstyle->getFont()->setColor("Blue");
541 pstylec->getFont()->setColor("Blue");
542 for (uint i = 0; i < listStyles.size(); ++i)
543 listStyles[i]->getFont()->setColor("Blue");
544 pstyleh1->getFont()->setColor("Blue");
545 pstyleh2->getFont()->setColor("Blue");
546 pstyleh3->getFont()->setColor("Blue");
547 pstyleh4->getFont()->setColor("Blue");
548 pstyleh5->getFont()->setColor("Blue");
549 pstyleh6->getFont()->setColor("Blue");
550 pstylecode->getFont()->setColor("Blue");
551 pstylep->getFont()->setColor("Blue");
552 pstylepre->getFont()->setColor("Blue");
553 }
554
setDefaultColor()555 void HTMLReader::setDefaultColor()
556 {
557 pstyle->getFont()->setColor(defaultColor);
558 pstylec->getFont()->setColor(defaultColor);
559 for (uint i = 0; i < listStyles.size(); ++i)
560 listStyles[i]->getFont()->setColor(defaultColor);
561 pstyleh1->getFont()->setColor(defaultColor);
562 pstyleh2->getFont()->setColor(defaultColor);
563 pstyleh3->getFont()->setColor(defaultColor);
564 pstyleh4->getFont()->setColor(defaultColor);
565 pstyleh5->getFont()->setColor(defaultColor);
566 pstyleh6->getFont()->setColor(defaultColor);
567 pstylecode->getFont()->setColor(defaultColor);
568 pstylep->getFont()->setColor(defaultColor);
569 pstylepre->getFont()->setColor(defaultColor);
570 }
571
setBoldFont()572 void HTMLReader::setBoldFont()
573 {
574 pstyle->getFont()->setWeight(BOLD);
575 pstylec->getFont()->setWeight(BOLD);
576 for (uint i = 0; i < listStyles.size(); ++i)
577 listStyles[i]->getFont()->setWeight(BOLD);
578 pstylecode->getFont()->setWeight(BOLD);
579 pstylep->getFont()->setWeight(BOLD);
580 pstylepre->getFont()->setWeight(BOLD);
581 }
582
unSetBoldFont()583 void HTMLReader::unSetBoldFont()
584 {
585 pstyle->getFont()->setWeight(defaultWeight);
586 pstylec->getFont()->setWeight(defaultWeight);
587 for (uint i = 0; i < listStyles.size(); ++i)
588 listStyles[i]->getFont()->setWeight(defaultWeight);
589 pstylecode->getFont()->setWeight(REGULAR);
590 pstylep->getFont()->setWeight(defaultWeight);
591 pstylepre->getFont()->setWeight(defaultWeight);
592 }
593
parse(const QString & filename)594 void HTMLReader::parse(const QString& filename)
595 {
596 #if defined(_WIN32)
597 QString fname = QDir::toNativeSeparators(filename);
598 QByteArray fn = (QSysInfo::WindowsVersion & QSysInfo::WV_NT_based) ? fname.toUtf8() : fname.toLocal8Bit();
599 #else
600 QByteArray fn(filename.toLocal8Bit());
601 #endif
602 elemJustStarted = elemJustFinished = false;
603 htmlSAXParseFile(fn.data(), nullptr, mySAXHandler, nullptr);
604 }
605
createListStyle()606 void HTMLReader::createListStyle()
607 {
608 gtParagraphStyle* tmpStyle = new gtParagraphStyle(*listStyles[0]);
609 tmpStyle->setName(QString("HTML_li_level-%1").arg(listLevel + 1));
610 double indent = listStyles[0]->getIndent();
611 indent += 25 * (listLevel + 1);
612 tmpStyle->setIndent(indent);
613 listStyles.push_back(tmpStyle);
614 nextItemNumbers.push_back(1);
615 }
616
// SAX callback table used by HTMLReader::parse(). Entries are positional; only
// the start-tag, end-tag and character-data callbacks are set, every other
// event is left unhandled (nullptr).
// NOTE(review): the field names in the comments follow libxml2's xmlSAXHandler
// layout - confirm against the libxml2 headers in use.
htmlSAXHandler mySAXHandlerStruct = {
	nullptr, // internalSubset,
	nullptr, // isStandalone,
	nullptr, // hasInternalSubset,
	nullptr, // hasExternalSubset,
	nullptr, // resolveEntity,
	nullptr, // getEntity,
	nullptr, // entityDecl,
	nullptr, // notationDecl,
	nullptr, // attributeDecl,
	nullptr, // elementDecl,
	nullptr, // unparsedEntityDecl,
	nullptr, // setDocumentLocator,
	nullptr, // startDocument,
	nullptr, // endDocument,
	HTMLReader::startElement, // startElement: static callback forwarding to the active reader
	HTMLReader::endElement, // endElement: static callback forwarding to the active reader
	nullptr, // reference,
	HTMLReader::characters, // characters: static callback forwarding to the active reader
	nullptr, // ignorableWhitespace,
	nullptr, // processingInstruction,
	nullptr, // comment,
	nullptr, // warning,
	nullptr, // error,
	nullptr, // fatalError,
	nullptr, // getParameterEntity,
	nullptr, // cdata,
	nullptr, // presumably externalSubset
	1 // presumably the "initialized" marker field
#ifdef HAVE_XML26
	,
	// Additional trailing fields that exist only when HAVE_XML26 is defined; all unused.
	nullptr,
	nullptr,
	nullptr,
	nullptr
#endif
};

// Pointer to the table above; this is what parse() passes to htmlSAXParseFile().
htmlSAXHandlerPtr mySAXHandler = &mySAXHandlerStruct;
656
~HTMLReader()657 HTMLReader::~HTMLReader()
658 {
659 if (!extLinks.isEmpty())
660 {
661 writer->append(QObject::tr("\nExternal Links\n"), pstyleh4);
662 writer->append(extLinks, pstyle);
663 }
664 for (uint i = 0; i < listStyles.size(); ++i)
665 delete listStyles[i];
666 delete pstylec;
667 delete pstyleh1;
668 delete pstyleh2;
669 delete pstyleh3;
670 delete pstyleh4;
671 delete pstyleh5;
672 delete pstyleh6;
673 delete pstylecode;
674 delete pstylep;
675 delete pstylepre;
676 hreader = nullptr;
677 }
678
679