1 //======================================================================== 2 // 3 // HtmlOutputDev.h 4 // 5 // Copyright 1997 Derek B. Noonburg 6 // 7 // Changed 1999 by G.Ovtcharov 8 //======================================================================== 9 10 //======================================================================== 11 // 12 // Modified under the Poppler project - http://poppler.freedesktop.org 13 // 14 // All changes made under the Poppler project to this file are licensed 15 // under GPL version 2 or later 16 // 17 // Copyright (C) 2006, 2007, 2009, 2012, 2018-2021 Albert Astals Cid <aacid@kde.org> 18 // Copyright (C) 2008, 2009 Warren Toomey <wkt@tuhs.org> 19 // Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc@gnome.org> 20 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net> 21 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl> 22 // Copyright (C) 2011 Joshua Richardson <jric@chegg.com> 23 // Copyright (C) 2011 Stephen Reichling <sreichling@chegg.com> 24 // Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com> 25 // Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it> 26 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> 27 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich 28 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de> 29 // 30 // To see a description of the changes please see the Changelog file that 31 // came with your tarball or type make ChangeLog if you are building from git 32 // 33 //======================================================================== 34 35 #ifndef HTMLOUTPUTDEV_H 36 #define HTMLOUTPUTDEV_H 37 38 #include <cstdio> 39 #include "goo/gbasename.h" 40 #include "GfxFont.h" 41 #include "OutputDev.h" 42 #include "HtmlLinks.h" 43 #include "HtmlFonts.h" 44 #include "Link.h" 45 #include "Catalog.h" 46 #include "UnicodeMap.h" 47 48 #define xoutRound(x) ((int)(x + 0.5)) 49 50 #define DOCTYPE "<!DOCTYPE html>" 51 52 class GfxState; 53 class GooString; 54 class HtmlImage; 55 class PDFDoc; 56 class OutlineItem; 57 //------------------------------------------------------------------------ 58 // HtmlString 59 //------------------------------------------------------------------------ 60 61 enum UnicodeTextDirection 62 { 63 textDirUnknown, 64 textDirLeftRight, 65 textDirRightLeft, 66 textDirTopBottom 67 }; 68 69 class HtmlString 70 { 71 public: 72 // Constructor. 73 HtmlString(GfxState *state, double fontSize, HtmlFontAccu *fonts); 74 75 // Destructor. 76 ~HtmlString(); 77 78 HtmlString(const HtmlString &) = delete; 79 HtmlString &operator=(const HtmlString &) = delete; 80 81 // Add a character to the string. 82 void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u); getLink()83 const HtmlLink *getLink() const { return link; } getFont()84 const HtmlFont &getFont() const { return *fonts->Get(fontpos); } 85 void endString(); // postprocessing 86 87 private: 88 // aender die text variable 89 const HtmlLink *link; 90 double xMin, xMax; // bounding box x coordinates 91 double yMin, yMax; // bounding box y coordinates 92 int col; // starting column 93 Unicode *text; // the text 94 double *xRight; // right-hand x coord of each char 95 HtmlString *yxNext; // next string in y-major order 96 HtmlString *xyNext; // next string in x-major order 97 int fontpos; 98 GooString *htext; 99 int len; // length of text and xRight 100 int size; // size of text and xRight arrays 101 UnicodeTextDirection dir; // direction (left to right/right to left) 102 HtmlFontAccu *fonts; 103 104 friend class HtmlPage; 105 }; 106 107 //------------------------------------------------------------------------ 108 // HtmlPage 109 //------------------------------------------------------------------------ 110 111 class HtmlPage 112 { 113 public: 114 // Constructor. 115 explicit HtmlPage(bool rawOrder); 116 117 // Destructor. 118 ~HtmlPage(); 119 120 HtmlPage(const HtmlPage &) = delete; 121 HtmlPage &operator=(const HtmlPage &) = delete; 122 123 // Begin a new string. 124 void beginString(GfxState *state, const GooString *s); 125 126 // Add a character to the current string. 127 void addChar(GfxState *state, double x, double y, double dx, double dy, double ox, double oy, const Unicode *u, int uLen); // unsigned char c); 128 129 void updateFont(GfxState *state); 130 131 // End the current string, sorting it into the list of strings. 132 void endString(); 133 134 // Coalesce strings that look like parts of the same line. 135 void coalesce(); 136 137 // Find a string. If <top> is true, starts looking at top of page; 138 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true, 139 // stops looking at bottom of page; otherwise stops looking at 140 // <xMax>,<yMax>. If found, sets the text bounding rectangle and 141 // returns true; otherwise returns false. 142 143 // new functions AddLink(const HtmlLink & x)144 void AddLink(const HtmlLink &x) { links->AddLink(x); } 145 146 // add an image to the current page 147 void addImage(GooString *fname, GfxState *state); 148 149 // number of images on the current page getNumImages()150 int getNumImages() { return imgList.size(); } 151 152 void dump(FILE *f, int pageNum, const std::vector<std::string> &backgroundImages); 153 154 // Clear the page. 155 void clear(); 156 157 void conv(); 158 159 private: getFont(HtmlString * hStr)160 const HtmlFont *getFont(HtmlString *hStr) const { return fonts->Get(hStr->fontpos); } 161 162 double fontSize; // current font size 163 bool rawOrder; // keep strings in content stream order 164 165 HtmlString *curStr; // currently active string 166 167 HtmlString *yxStrings; // strings in y-major order 168 HtmlString *xyStrings; // strings in x-major order 169 HtmlString *yxCur1, *yxCur2; // cursors for yxStrings list 170 171 void setDocName(const char *fname); 172 void dumpAsXML(FILE *f, int page); 173 void dumpComplex(FILE *f, int page, const std::vector<std::string> &backgroundImages); 174 int dumpComplexHeaders(FILE *const file, FILE *&pageFile, int page); 175 176 // marks the position of the fonts that belong to current page (for noframes) 177 int fontsPageMarker; 178 HtmlFontAccu *fonts; 179 HtmlLinks *links; 180 std::vector<HtmlImage *> imgList; 181 182 GooString *DocName; 183 int pageWidth; 184 int pageHeight; 185 int firstPage; // used to begin the numeration of pages 186 187 friend class HtmlOutputDev; 188 }; 189 190 //------------------------------------------------------------------------ 191 // HtmlMetaVar 192 //------------------------------------------------------------------------ 193 class HtmlMetaVar 194 { 195 public: 196 HtmlMetaVar(const char *_name, const char *_content); 197 ~HtmlMetaVar(); 198 199 HtmlMetaVar(const HtmlMetaVar &) = delete; 200 HtmlMetaVar &operator=(const HtmlMetaVar &) = delete; 201 202 GooString *toString() const; 203 204 private: 205 GooString *name; 206 GooString *content; 207 }; 208 209 //------------------------------------------------------------------------ 210 // HtmlOutputDev 211 //------------------------------------------------------------------------ 212 213 class HtmlOutputDev : public OutputDev 214 { 215 public: 216 // Open a text output file. If <fileName> is nullptr, no file is written 217 // (this is useful, e.g., for searching text). If <useASCII7> is true, 218 // text is converted to 7-bit ASCII; otherwise, text is converted to 219 // 8-bit ISO Latin-1. <useASCII7> should also be set for Japanese 220 // (EUC-JP) text. If <rawOrder> is true, the text is kept in content 221 // stream order. 222 HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title, const char *author, const char *keywords, const char *subject, const char *date, bool rawOrder, int firstPage = 1, bool outline = false); 223 224 // Destructor. 225 ~HtmlOutputDev() override; 226 227 // Check if file was successfully created. isOk()228 virtual bool isOk() { return ok; } 229 230 //---- get info about output device 231 232 // Does this device use upside-down coordinates? 233 // (Upside-down means (0,0) is the top left corner of the page.) upsideDown()234 bool upsideDown() override { return true; } 235 236 // Does this device use drawChar() or drawString()? useDrawChar()237 bool useDrawChar() override { return true; } 238 239 // Does this device use beginType3Char/endType3Char? Otherwise, 240 // text in Type 3 fonts will be drawn with drawChar/drawString. interpretType3Chars()241 bool interpretType3Chars() override { return false; } 242 243 // Does this device need non-text content? needNonText()244 bool needNonText() override { return true; } 245 246 //----- initialization and control 247 248 bool checkPageSlice(Page *p, double hDPI, double vDPI, int rotate, bool useMediaBox, bool crop, int sliceX, int sliceY, int sliceW, int sliceH, bool printing, bool (*abortCheckCbk)(void *data) = nullptr, 249 void *abortCheckCbkData = nullptr, bool (*annotDisplayDecideCbk)(Annot *annot, void *user_data) = nullptr, void *annotDisplayDecideCbkData = nullptr) override 250 { 251 docPage = p; 252 return true; 253 } 254 255 // Start a page. 256 void startPage(int pageNum, GfxState *state, XRef *xref) override; 257 258 // End a page. 259 void endPage() override; 260 261 // add a background image to the list of background images, 262 // as this seems to be done outside other processing. takes ownership of img. 263 void addBackgroundImage(const std::string &img); 264 265 //----- update text state 266 void updateFont(GfxState *state) override; 267 268 //----- text drawing 269 void beginString(GfxState *state, const GooString *s) override; 270 void endString(GfxState *state) override; 271 void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override; 272 273 void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg) override; 274 void drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool interpolate, const int *maskColors, bool inlineImg) override; 275 276 // new feature DevType()277 virtual int DevType() { return 1234; } 278 getPageWidth()279 int getPageWidth() { return maxPageWidth; } getPageHeight()280 int getPageHeight() { return maxPageHeight; } 281 282 bool dumpDocOutline(PDFDoc *doc); 283 284 private: 285 // convert encoding into a HTML standard, or encoding->c_str if not 286 // recognized. 287 static std::string mapEncodingToHtml(const std::string &encoding); 288 void doProcessLink(AnnotLink *link); 289 GooString *getLinkDest(AnnotLink *link); 290 void dumpMetaVars(FILE *); 291 void doFrame(int firstPage); 292 bool newHtmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines, int level = 1); 293 void newXmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines); 294 int getOutlinePageNum(OutlineItem *item); 295 void drawJpegImage(GfxState *state, Stream *str); 296 void drawPngImage(GfxState *state, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool isMask = false); 297 GooString *createImageFileName(const char *ext); 298 299 FILE *fContentsFrame; 300 FILE *page; // html file 301 // FILE *tin; // image log file 302 // bool write; 303 bool needClose; // need to close the file? 304 HtmlPage *pages; // text for the current page 305 bool rawOrder; // keep text in content stream order 306 bool doOutline; // output document outline 307 bool ok; // set up ok? 308 bool dumpJPEG; 309 int pageNum; 310 int maxPageWidth; 311 int maxPageHeight; 312 GooString *Docname; 313 GooString *docTitle; 314 std::vector<HtmlMetaVar *> glMetaVars; 315 Catalog *catalog; 316 Page *docPage; 317 std::vector<std::string> backgroundImages; 318 friend class HtmlPage; 319 }; 320 321 #endif 322