1 //======================================================================== 2 // 3 // StructElement.h 4 // 5 // This file is licensed under the GPLv2 or later 6 // 7 // Copyright 2013, 2014 Igalia S.L. 8 // Copyright 2014 Luigi Scarso <luigi.scarso@gmail.com> 9 // Copyright 2014, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org> 10 // Copyright 2018 Adam Reichold <adam.reichold@t-online.de> 11 // Copyright 2021 Adrian Johnson <ajohnson@redneon.com> 12 // 13 //======================================================================== 14 15 #ifndef STRUCTELEMENT_H 16 #define STRUCTELEMENT_H 17 18 #include "goo/GooString.h" 19 #include "MarkedContentOutputDev.h" 20 #include "Object.h" 21 #include "poppler_private_export.h" 22 #include <vector> 23 #include <set> 24 25 class GooString; 26 class Dict; 27 class StructElement; 28 class StructTreeRoot; 29 30 class POPPLER_PRIVATE_EXPORT Attribute 31 { 32 public: 33 enum Type 34 { 35 Unknown = 0, // Uninitialized, parsing error, etc. 36 UserProperty, // User defined attribute (i.e. non-standard) 37 38 // Common standard attributes 39 Placement, 40 WritingMode, 41 BackgroundColor, 42 BorderColor, 43 BorderStyle, 44 BorderThickness, 45 Color, 46 Padding, 47 48 // Block element standard attributes 49 SpaceBefore, 50 SpaceAfter, 51 StartIndent, 52 EndIndent, 53 TextIndent, 54 TextAlign, 55 BBox, 56 Width, 57 Height, 58 BlockAlign, 59 InlineAlign, 60 TBorderStyle, 61 TPadding, 62 63 // Inline element standard attributes 64 BaselineShift, 65 LineHeight, 66 TextDecorationColor, 67 TextDecorationThickness, 68 TextDecorationType, 69 RubyAlign, 70 RubyPosition, 71 GlyphOrientationVertical, 72 73 // Column-only standard attributes 74 ColumnCount, 75 ColumnGap, 76 ColumnWidths, 77 78 // List-only standard attributes 79 ListNumbering, 80 81 // PrintField-only standard attributes 82 Role, 83 checked, 84 Desc, 85 86 // Table-only standard attributes 87 RowSpan, 88 ColSpan, 89 Headers, 90 Scope, 91 Summary, 92 }; 93 94 enum Owner 95 { 96 UnknownOwner = 0, 97 // User-defined attributes 98 UserProperties, 99 // Standard attributes 100 Layout, 101 List, 102 PrintField, 103 Table, 104 // Translation to other formats 105 XML_1_00, 106 HTML_3_20, 107 HTML_4_01, 108 OEB_1_00, 109 RTF_1_05, 110 CSS_1_00, 111 CSS_2_00, 112 }; 113 114 // Creates a standard attribute. The name is predefined, and the 115 // value is type-checked to conform to the PDF specification. 116 Attribute(Type type, Object *value); 117 118 // Creates an UserProperty attribute, with an arbitrary name and value. 119 Attribute(GooString &&name, Object *value); 120 isOk()121 bool isOk() const { return type != Unknown; } 122 123 // Name, type and value can be set only on construction. getType()124 Type getType() const { return type; } getOwner()125 Owner getOwner() const { return owner; } 126 const char *getTypeName() const; 127 const char *getOwnerName() const; getValue()128 const Object *getValue() const { return &value; } 129 static Object *getDefaultValue(Type type); 130 131 // The caller gets the ownership of the return GooString and is responsible of deleting it getName()132 std::unique_ptr<GooString> getName() const { return std::make_unique<GooString>(type == UserProperty ? name.c_str() : getTypeName()); } 133 134 // The revision is optional, and defaults to zero. getRevision()135 unsigned int getRevision() const { return revision; } setRevision(unsigned int revisionA)136 void setRevision(unsigned int revisionA) { revision = revisionA; } 137 138 // Hidden elements should not be displayed by the user agent isHidden()139 bool isHidden() const { return hidden; } setHidden(bool hiddenA)140 void setHidden(bool hiddenA) { hidden = hiddenA; } 141 142 // The formatted value may be in the PDF, or be left undefined (nullptr). 143 // In the later case the user agent should provide a default representation. getFormattedValue()144 const char *getFormattedValue() const { return formatted ? formatted->c_str() : nullptr; } 145 void setFormattedValue(const char *formattedA); 146 147 ~Attribute(); 148 149 private: 150 Type type; 151 Owner owner; 152 unsigned int revision; 153 GooString name; 154 Object value; 155 bool hidden; 156 GooString *formatted; 157 158 bool checkType(StructElement *element = nullptr); 159 static Type getTypeForName(const char *name, StructElement *element = nullptr); 160 static Attribute *parseUserProperty(Dict *property); 161 162 friend class StructElement; 163 }; 164 165 class POPPLER_PRIVATE_EXPORT StructElement 166 { 167 public: 168 enum Type 169 { 170 Unknown = 0, 171 MCID, // MCID reference, used internally 172 OBJR, // Object reference, used internally 173 174 Document, 175 Part, 176 Art, 177 Sect, 178 Div, // Structural elements 179 180 Span, 181 Quote, 182 Note, 183 Reference, 184 BibEntry, // Inline elements 185 Code, 186 Link, 187 Annot, 188 BlockQuote, 189 Caption, 190 NonStruct, 191 TOC, 192 TOCI, 193 Index, 194 Private, 195 196 P, 197 H, 198 H1, 199 H2, 200 H3, 201 H4, 202 H5, 203 H6, // Paragraph-like 204 205 L, 206 LI, 207 Lbl, 208 LBody, // List elements 209 210 Table, 211 TR, 212 TH, 213 TD, 214 THead, 215 TFoot, 216 TBody, // Table elements 217 218 Ruby, 219 RB, 220 RT, 221 RP, // Ruby text elements 222 Warichu, 223 WT, 224 WP, 225 226 Figure, 227 Formula, 228 Form, // Illustration-like elements 229 }; 230 231 static const Ref InvalidRef; 232 233 const char *getTypeName() const; getType()234 Type getType() const { return type; } isOk()235 bool isOk() const { return type != Unknown; } 236 bool isBlock() const; 237 bool isInline() const; 238 bool isGrouping() const; 239 isContent()240 inline bool isContent() const { return (type == MCID) || isObjectRef(); } isObjectRef()241 inline bool isObjectRef() const { return (type == OBJR && c->ref != Ref::INVALID()); } 242 getMCID()243 int getMCID() const { return c->mcid; } getObjectRef()244 Ref getObjectRef() const { return c->ref; } getParentRef()245 Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef; } 246 bool hasPageRef() const; 247 bool getPageRef(Ref &ref) const; getStructTreeRoot()248 StructTreeRoot *getStructTreeRoot() { return treeRoot; } 249 250 // Optional element identifier. getID()251 const GooString *getID() const { return isContent() ? nullptr : s->id; } getID()252 GooString *getID() { return isContent() ? nullptr : s->id; } 253 254 // Optional ISO language name, e.g. en_US getLanguage()255 GooString *getLanguage() 256 { 257 if (!isContent() && s->language) 258 return s->language; 259 return parent ? parent->getLanguage() : nullptr; 260 } getLanguage()261 const GooString *getLanguage() const 262 { 263 if (!isContent() && s->language) 264 return s->language; 265 return parent ? parent->getLanguage() : nullptr; 266 } 267 268 // Optional revision number, defaults to zero. getRevision()269 unsigned int getRevision() const { return isContent() ? 0 : s->revision; } setRevision(unsigned int revision)270 void setRevision(unsigned int revision) 271 { 272 if (isContent()) 273 s->revision = revision; 274 } 275 276 // Optional element title, in human-readable form. getTitle()277 const GooString *getTitle() const { return isContent() ? nullptr : s->title; } getTitle()278 GooString *getTitle() { return isContent() ? nullptr : s->title; } 279 280 // Optional element expanded abbreviation text. getExpandedAbbr()281 const GooString *getExpandedAbbr() const { return isContent() ? nullptr : s->expandedAbbr; } getExpandedAbbr()282 GooString *getExpandedAbbr() { return isContent() ? nullptr : s->expandedAbbr; } 283 getNumChildren()284 unsigned getNumChildren() const { return isContent() ? 0 : s->elements.size(); } getChild(int i)285 const StructElement *getChild(int i) const { return isContent() ? nullptr : s->elements.at(i); } getChild(int i)286 StructElement *getChild(int i) { return isContent() ? nullptr : s->elements.at(i); } 287 appendChild(StructElement * element)288 void appendChild(StructElement *element) 289 { 290 if (!isContent() && element && element->isOk()) { 291 s->elements.push_back(element); 292 } 293 } 294 getNumAttributes()295 unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } getAttribute(int i)296 const Attribute *getAttribute(int i) const { return isContent() ? nullptr : s->attributes.at(i); } getAttribute(int i)297 Attribute *getAttribute(int i) { return isContent() ? nullptr : s->attributes.at(i); } 298 appendAttribute(Attribute * attribute)299 void appendAttribute(Attribute *attribute) 300 { 301 if (!isContent() && attribute) { 302 s->attributes.push_back(attribute); 303 } 304 } 305 306 const Attribute *findAttribute(Attribute::Type attributeType, bool inherit = false, Attribute::Owner owner = Attribute::UnknownOwner) const; 307 getAltText()308 const GooString *getAltText() const { return isContent() ? nullptr : s->altText; } getAltText()309 GooString *getAltText() { return isContent() ? nullptr : s->altText; } 310 getActualText()311 const GooString *getActualText() const { return isContent() ? nullptr : s->actualText; } getActualText()312 GooString *getActualText() { return isContent() ? nullptr : s->actualText; } 313 314 // Content text referenced by the element: 315 // 316 // - For MCID reference elements, this is just the text of the 317 // corresponding marked content object in the page stream, regardless 318 // of the setting of the "recursive" flag. 319 // - For other elements, if the "recursive" flag is set, the text 320 // enclosed by *all* the child MCID reference elements of the subtree 321 // is returned. The text is assembled by traversing the leaf MCID 322 // reference elements in logical order. 323 // - In any other case, the function returns nullptr. 324 // 325 // A new string is returned, and the ownership passed to the caller. 326 // 327 GooString *getText(bool recursive = true) const { return appendSubTreeText(nullptr, recursive); } 328 getTextSpans()329 const TextSpanArray getTextSpans() const 330 { 331 if (!isContent()) 332 return TextSpanArray(); 333 MarkedContentOutputDev mcdev(getMCID(), stmRef); 334 return getTextSpansInternal(mcdev); 335 } 336 337 ~StructElement(); 338 339 private: 340 GooString *appendSubTreeText(GooString *string, bool recursive) const; 341 const TextSpanArray &getTextSpansInternal(MarkedContentOutputDev &mcdev) const; 342 343 typedef std::vector<Attribute *> AttrPtrArray; 344 typedef std::vector<StructElement *> ElemPtrArray; 345 346 struct StructData 347 { 348 Ref parentRef; 349 GooString *altText; 350 GooString *actualText; 351 GooString *id; 352 GooString *title; 353 GooString *expandedAbbr; 354 GooString *language; 355 unsigned int revision; 356 ElemPtrArray elements; 357 AttrPtrArray attributes; 358 359 StructData(); 360 ~StructData(); 361 362 StructData(const StructData &) = delete; 363 StructData &operator=(const StructData &) = delete; 364 }; 365 366 // Data in content elements (MCID, MCR) 367 struct ContentData 368 { 369 union { 370 int mcid; 371 Ref ref; 372 }; 373 ContentDataContentData374 explicit ContentData(int mcidA) : mcid(mcidA) { } ContentDataContentData375 explicit ContentData(const Ref r) { ref = r; } 376 }; 377 378 // Common data 379 Type type; 380 StructTreeRoot *treeRoot; 381 StructElement *parent; 382 mutable Object pageRef; 383 Object stmRef; 384 385 union { 386 StructData *s; 387 ContentData *c; 388 }; 389 390 StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, std::set<int> &seen); 391 StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA); 392 StructElement(const Ref ref, StructTreeRoot *treeRootA, StructElement *parentA); 393 394 void parse(Dict *elementDict); 395 StructElement *parseChild(const Object *ref, Object *childObj, std::set<int> &seen); 396 void parseChildren(Dict *element, std::set<int> &seen); 397 void parseAttributes(Dict *attributes, bool keepExisting = false); 398 399 friend class StructTreeRoot; 400 }; 401 402 #endif 403