1 /* 2 For general Scribus (>=1.3.2) copyright and licensing information please refer 3 to the COPYING file provided with the program. Following this notice may exist 4 a copyright and/or license notice that predates the release of Scribus 1.3.2 5 for which a new license (GPL+exception) is in place. 6 */ 7 #ifndef PDFTEXTRECOGNITION_H 8 #define PDFTEXTRECOGNITION_H 9 10 #include <QSizeF> 11 #include <QStack> 12 #include <QString> 13 14 #include "pageitem.h" 15 #include "importpdfconfig.h" 16 #include "slaoutput.h" 17 18 #include <poppler/GfxState.h> 19 #include <poppler/CharCodeToUnicode.h> 20 21 /* PDF TextBox Framework */ 22 /* 23 * Holds all the details for each glyph in the text imported from the pdf file. 24 * 25 */ 26 struct PdfGlyph 27 { 28 double dx; // X advance value 29 double dy; // Y advance value 30 double rise; // Text rise parameter 31 QChar code; // UTF-16 coded character 32 }; 33 34 35 class PdfTextRegionLine 36 { 37 public: 38 qreal maxHeight = {}; 39 //we can probably use maxHeight for this. 40 qreal width = {}; 41 int glyphIndex = {}; 42 QPointF baseOrigin = QPointF({}, {}); 43 std::vector<PdfTextRegionLine> segments = std::vector<PdfTextRegionLine>(); 44 45 }; 46 47 class PdfTextRegion 48 { 49 public: 50 enum class LineType 51 { 52 FIRSTPOINT, 53 SAMELINE, 54 STYLESUPERSCRIPT, 55 STYLENORMALRETURN, 56 STYLEBELOWBASELINE, 57 NEWLINE, 58 ENDOFLINE, //TODO: Implement an end of line test 59 FAIL 60 }; 61 # 62 /* 63 * the bounding box shape splines in percentage of width and height. In this case 100% as we want to clip shape to be the full TextBox width and height. */ 64 static constexpr double boundingBoxShape[32] = { 65 0.0, 0.0, 66 0.0, 0.0, 67 100.0, 0.0, 68 100.0, 0.0, 69 100.0, 0.0, 70 100.0, 0.0, 71 100.0, 100.0, 72 100.0, 100.0, 73 100.0, 100.0, 74 100.0, 100.0, 75 0.0, 100.0, 76 0.0, 100.0, 77 0.0, 100.0, 78 0.0, 100.0, 79 0.0, 0.0, 80 0.0, 0.0 81 }; 82 83 QPointF pdfTextRegionBasenOrigin = QPointF({}, {}); 84 qreal maxHeight = {}; 85 qreal lineSpacing = { 1 }; 86 std::vector<PdfTextRegionLine> pdfTextRegionLines = std::vector<PdfTextRegionLine>(); 87 qreal maxWidth = {}; 88 QPointF lineBaseXY = QPointF({ }, { }); //updated with the best match left value from all the textRegionLines and the best bottom value from the textRegionLines.segments; 89 QPointF lastXY = QPointF({}, {}); 90 std::vector<PdfGlyph> glyphs; 91 92 static bool collinear(qreal a, qreal b); 93 bool isCloseToX(qreal x1, qreal x2); 94 bool isCloseToY(qreal y1, qreal y2); 95 bool adjunctLesser(qreal testY, qreal lastY, qreal baseY); 96 bool adjunctGreater(qreal testY, qreal lastY, qreal baseY); 97 PdfTextRegion::LineType linearTest(QPointF point, bool xInLimits, bool yInLimits); 98 PdfTextRegion::LineType isRegionConcurrent(QPointF newPoint); 99 PdfTextRegion::LineType moveToPoint(QPointF newPoint); 100 PdfTextRegion::LineType addGlyphAtPoint(QPointF newGlyphPoint, PdfGlyph new_glyph); 101 void renderToTextFrame(PageItem* textNode); 102 bool isNew(); 103 }; 104 105 class PdfTextRecognition 106 { 107 public: 108 PdfTextRecognition(); 109 ~PdfTextRecognition(); 110 111 enum class AddCharMode 112 { 113 ADDFIRSTCHAR, 114 ADDBASICCHAR, 115 ADDCHARWITHNEWSTYLE, 116 ADDCHARWITHPREVIOUSSTYLE, 117 ADDCHARWITHBASESTLYE 118 }; 119 setCharMode(AddCharMode mode)120 void setCharMode(AddCharMode mode) 121 { 122 m_addCharMode = mode; 123 } 124 125 PdfTextRegion* activePdfTextRegion = nullptr; //faster and cleaner than calling back on the vector all the time. 126 127 void addPdfTextRegion(); 128 void addChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, POPPLER_CONST_082 Unicode* u, int uLen); 129 bool isNewLineOrRegion(QPointF newPosition); 130 131 private: 132 std::vector<PdfTextRegion> m_pdfTextRegions = std::vector<PdfTextRegion>(); 133 AddCharMode m_addCharMode = AddCharMode::ADDFIRSTCHAR; 134 PdfGlyph AddCharCommon(GfxState* state, double x, double y, double dx, double dy, Unicode const* u, int uLen); 135 PdfGlyph AddFirstChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); 136 PdfGlyph AddBasicChar(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); 137 PdfGlyph AddCharWithNewStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); 138 PdfGlyph AddCharWithPreviousStyle(GfxState* state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, Unicode const* u, int uLen); 139 }; 140 141 142 class PdfTextOutputDev : public SlaOutputDev 143 { 144 public: 145 PdfTextOutputDev(ScribusDoc* doc, QList<PageItem*>* Elements, QStringList* importedColors, int flags); 146 virtual ~PdfTextOutputDev(); 147 148 void updateFont(GfxState* state) override; 149 150 //----- text drawing 151 void beginTextObject(GfxState* state) override; 152 void endTextObject(GfxState* state) override; 153 void drawChar(GfxState* state, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; 154 GBool beginType3Char(GfxState* /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, POPPLER_CONST_082 Unicode* /*u*/, int /*uLen*/) override; 155 void endType3Char(GfxState* /*state*/) override; 156 void type3D0(GfxState* /*state*/, double /*wx*/, double /*wy*/) override; 157 void type3D1(GfxState* /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) override; 158 159 private: 160 void setFillAndStrokeForPDF(GfxState* state, PageItem* text_node); 161 void updateTextPos(GfxState* state) override; 162 void renderTextFrame(); 163 void finishItem(PageItem* item); 164 PdfTextRecognition m_pdfTextRecognition = {}; 165 }; 166 #endif 167