1 //========================================================================
2 //
3 // HtmlOutputDev.h
4 //
5 // Copyright 1997 Derek B. Noonburg
6 //
7 // Changed 1999 by G.Ovtcharov
8 //========================================================================
9 
10 //========================================================================
11 //
12 // Modified under the Poppler project - http://poppler.freedesktop.org
13 //
14 // All changes made under the Poppler project to this file are licensed
15 // under GPL version 2 or later
16 //
17 // Copyright (C) 2006, 2007, 2009, 2012, 2018-2021 Albert Astals Cid <aacid@kde.org>
18 // Copyright (C) 2008, 2009 Warren Toomey <wkt@tuhs.org>
19 // Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc@gnome.org>
20 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
21 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
22 // Copyright (C) 2011 Joshua Richardson <jric@chegg.com>
23 // Copyright (C) 2011 Stephen Reichling <sreichling@chegg.com>
24 // Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
25 // Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
26 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
27 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
28 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
29 //
30 // To see a description of the changes please see the Changelog file that
31 // came with your tarball or type make ChangeLog if you are building from git
32 //
33 //========================================================================
34 
35 #ifndef HTMLOUTPUTDEV_H
36 #define HTMLOUTPUTDEV_H
37 
#include <cstdio>
#include <string>
#include <vector>

#include "goo/gbasename.h"
#include "GfxFont.h"
#include "OutputDev.h"
#include "HtmlLinks.h"
#include "HtmlFonts.h"
#include "Link.h"
#include "Catalog.h"
#include "UnicodeMap.h"
47 
// Round a floating-point expression to an int by adding 0.5 and truncating.
// The cast truncates toward zero, so this is "round to nearest" only for
// non-negative values.  The argument is parenthesized so that expressions
// with lower precedence than '+' (for example a ?: expression) are rounded
// as a whole instead of having 0.5 added to only part of them.
#define xoutRound(x) ((int)((x) + 0.5))

// Document type declaration string.
#define DOCTYPE "<!DOCTYPE html>"
51 
// Forward declarations: the declarations in this header refer to these
// types only through pointers.
class GfxState;
class GooString;
class HtmlImage;
class PDFDoc;
class OutlineItem;
57 //------------------------------------------------------------------------
58 // HtmlString
59 //------------------------------------------------------------------------
60 
// Writing direction of a piece of text.
enum UnicodeTextDirection
{
    textDirUnknown, // direction not (yet) determined
    textDirLeftRight, // left to right
    textDirRightLeft, // right to left
    textDirTopBottom // top to bottom
};
68 
69 class HtmlString
70 {
71 public:
72     // Constructor.
73     HtmlString(GfxState *state, double fontSize, HtmlFontAccu *fonts);
74 
75     // Destructor.
76     ~HtmlString();
77 
78     HtmlString(const HtmlString &) = delete;
79     HtmlString &operator=(const HtmlString &) = delete;
80 
81     // Add a character to the string.
82     void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u);
getLink()83     const HtmlLink *getLink() const { return link; }
getFont()84     const HtmlFont &getFont() const { return *fonts->Get(fontpos); }
85     void endString(); // postprocessing
86 
87 private:
88     // aender die text variable
89     const HtmlLink *link;
90     double xMin, xMax; // bounding box x coordinates
91     double yMin, yMax; // bounding box y coordinates
92     int col; // starting column
93     Unicode *text; // the text
94     double *xRight; // right-hand x coord of each char
95     HtmlString *yxNext; // next string in y-major order
96     HtmlString *xyNext; // next string in x-major order
97     int fontpos;
98     GooString *htext;
99     int len; // length of text and xRight
100     int size; // size of text and xRight arrays
101     UnicodeTextDirection dir; // direction (left to right/right to left)
102     HtmlFontAccu *fonts;
103 
104     friend class HtmlPage;
105 };
106 
107 //------------------------------------------------------------------------
108 // HtmlPage
109 //------------------------------------------------------------------------
110 
111 class HtmlPage
112 {
113 public:
114     // Constructor.
115     explicit HtmlPage(bool rawOrder);
116 
117     // Destructor.
118     ~HtmlPage();
119 
120     HtmlPage(const HtmlPage &) = delete;
121     HtmlPage &operator=(const HtmlPage &) = delete;
122 
123     // Begin a new string.
124     void beginString(GfxState *state, const GooString *s);
125 
126     // Add a character to the current string.
127     void addChar(GfxState *state, double x, double y, double dx, double dy, double ox, double oy, const Unicode *u, int uLen); // unsigned char c);
128 
129     void updateFont(GfxState *state);
130 
131     // End the current string, sorting it into the list of strings.
132     void endString();
133 
134     // Coalesce strings that look like parts of the same line.
135     void coalesce();
136 
137     // Find a string.  If <top> is true, starts looking at top of page;
138     // otherwise starts looking at <xMin>,<yMin>.  If <bottom> is true,
139     // stops looking at bottom of page; otherwise stops looking at
140     // <xMax>,<yMax>.  If found, sets the text bounding rectangle and
141     // returns true; otherwise returns false.
142 
143     // new functions
AddLink(const HtmlLink & x)144     void AddLink(const HtmlLink &x) { links->AddLink(x); }
145 
146     // add an image to the current page
147     void addImage(GooString *fname, GfxState *state);
148 
149     // number of images on the current page
getNumImages()150     int getNumImages() { return imgList.size(); }
151 
152     void dump(FILE *f, int pageNum, const std::vector<std::string> &backgroundImages);
153 
154     // Clear the page.
155     void clear();
156 
157     void conv();
158 
159 private:
getFont(HtmlString * hStr)160     const HtmlFont *getFont(HtmlString *hStr) const { return fonts->Get(hStr->fontpos); }
161 
162     double fontSize; // current font size
163     bool rawOrder; // keep strings in content stream order
164 
165     HtmlString *curStr; // currently active string
166 
167     HtmlString *yxStrings; // strings in y-major order
168     HtmlString *xyStrings; // strings in x-major order
169     HtmlString *yxCur1, *yxCur2; // cursors for yxStrings list
170 
171     void setDocName(const char *fname);
172     void dumpAsXML(FILE *f, int page);
173     void dumpComplex(FILE *f, int page, const std::vector<std::string> &backgroundImages);
174     int dumpComplexHeaders(FILE *const file, FILE *&pageFile, int page);
175 
176     // marks the position of the fonts that belong to current page (for noframes)
177     int fontsPageMarker;
178     HtmlFontAccu *fonts;
179     HtmlLinks *links;
180     std::vector<HtmlImage *> imgList;
181 
182     GooString *DocName;
183     int pageWidth;
184     int pageHeight;
185     int firstPage; // used to begin the numeration of pages
186 
187     friend class HtmlOutputDev;
188 };
189 
190 //------------------------------------------------------------------------
191 // HtmlMetaVar
192 //------------------------------------------------------------------------
193 class HtmlMetaVar
194 {
195 public:
196     HtmlMetaVar(const char *_name, const char *_content);
197     ~HtmlMetaVar();
198 
199     HtmlMetaVar(const HtmlMetaVar &) = delete;
200     HtmlMetaVar &operator=(const HtmlMetaVar &) = delete;
201 
202     GooString *toString() const;
203 
204 private:
205     GooString *name;
206     GooString *content;
207 };
208 
209 //------------------------------------------------------------------------
210 // HtmlOutputDev
211 //------------------------------------------------------------------------
212 
213 class HtmlOutputDev : public OutputDev
214 {
215 public:
216     // Open a text output file.  If <fileName> is nullptr, no file is written
217     // (this is useful, e.g., for searching text).  If <useASCII7> is true,
218     // text is converted to 7-bit ASCII; otherwise, text is converted to
219     // 8-bit ISO Latin-1.  <useASCII7> should also be set for Japanese
220     // (EUC-JP) text.  If <rawOrder> is true, the text is kept in content
221     // stream order.
222     HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title, const char *author, const char *keywords, const char *subject, const char *date, bool rawOrder, int firstPage = 1, bool outline = false);
223 
224     // Destructor.
225     ~HtmlOutputDev() override;
226 
227     // Check if file was successfully created.
isOk()228     virtual bool isOk() { return ok; }
229 
230     //---- get info about output device
231 
232     // Does this device use upside-down coordinates?
233     // (Upside-down means (0,0) is the top left corner of the page.)
upsideDown()234     bool upsideDown() override { return true; }
235 
236     // Does this device use drawChar() or drawString()?
useDrawChar()237     bool useDrawChar() override { return true; }
238 
239     // Does this device use beginType3Char/endType3Char?  Otherwise,
240     // text in Type 3 fonts will be drawn with drawChar/drawString.
interpretType3Chars()241     bool interpretType3Chars() override { return false; }
242 
243     // Does this device need non-text content?
needNonText()244     bool needNonText() override { return true; }
245 
246     //----- initialization and control
247 
248     bool checkPageSlice(Page *p, double hDPI, double vDPI, int rotate, bool useMediaBox, bool crop, int sliceX, int sliceY, int sliceW, int sliceH, bool printing, bool (*abortCheckCbk)(void *data) = nullptr,
249                         void *abortCheckCbkData = nullptr, bool (*annotDisplayDecideCbk)(Annot *annot, void *user_data) = nullptr, void *annotDisplayDecideCbkData = nullptr) override
250     {
251         docPage = p;
252         return true;
253     }
254 
255     // Start a page.
256     void startPage(int pageNum, GfxState *state, XRef *xref) override;
257 
258     // End a page.
259     void endPage() override;
260 
261     // add a background image to the list of background images,
262     // as this seems to be done outside other processing. takes ownership of img.
263     void addBackgroundImage(const std::string &img);
264 
265     //----- update text state
266     void updateFont(GfxState *state) override;
267 
268     //----- text drawing
269     void beginString(GfxState *state, const GooString *s) override;
270     void endString(GfxState *state) override;
271     void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override;
272 
273     void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg) override;
274     void drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool interpolate, const int *maskColors, bool inlineImg) override;
275 
276     // new feature
DevType()277     virtual int DevType() { return 1234; }
278 
getPageWidth()279     int getPageWidth() { return maxPageWidth; }
getPageHeight()280     int getPageHeight() { return maxPageHeight; }
281 
282     bool dumpDocOutline(PDFDoc *doc);
283 
284 private:
285     // convert encoding into a HTML standard, or encoding->c_str if not
286     // recognized.
287     static std::string mapEncodingToHtml(const std::string &encoding);
288     void doProcessLink(AnnotLink *link);
289     GooString *getLinkDest(AnnotLink *link);
290     void dumpMetaVars(FILE *);
291     void doFrame(int firstPage);
292     bool newHtmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines, int level = 1);
293     void newXmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines);
294     int getOutlinePageNum(OutlineItem *item);
295     void drawJpegImage(GfxState *state, Stream *str);
296     void drawPngImage(GfxState *state, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool isMask = false);
297     GooString *createImageFileName(const char *ext);
298 
299     FILE *fContentsFrame;
300     FILE *page; // html file
301     // FILE *tin;                    // image log file
302     // bool write;
303     bool needClose; // need to close the file?
304     HtmlPage *pages; // text for the current page
305     bool rawOrder; // keep text in content stream order
306     bool doOutline; // output document outline
307     bool ok; // set up ok?
308     bool dumpJPEG;
309     int pageNum;
310     int maxPageWidth;
311     int maxPageHeight;
312     GooString *Docname;
313     GooString *docTitle;
314     std::vector<HtmlMetaVar *> glMetaVars;
315     Catalog *catalog;
316     Page *docPage;
317     std::vector<std::string> backgroundImages;
318     friend class HtmlPage;
319 };
320 
321 #endif
322