1 //========================================================================
2 //
3 // StructElement.h
4 //
5 // This file is licensed under the GPLv2 or later
6 //
7 // Copyright 2013, 2014 Igalia S.L.
8 // Copyright 2014 Luigi Scarso <luigi.scarso@gmail.com>
9 // Copyright 2014, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org>
10 // Copyright 2018 Adam Reichold <adam.reichold@t-online.de>
11 // Copyright 2021 Adrian Johnson <ajohnson@redneon.com>
12 //
13 //========================================================================
14 
15 #ifndef STRUCTELEMENT_H
16 #define STRUCTELEMENT_H
17 
18 #include "goo/GooString.h"
19 #include "MarkedContentOutputDev.h"
20 #include "Object.h"
21 #include "poppler_private_export.h"
22 #include <vector>
23 #include <set>
24 
25 class GooString;
26 class Dict;
27 class StructElement;
28 class StructTreeRoot;
29 
30 class POPPLER_PRIVATE_EXPORT Attribute
31 {
32 public:
33     enum Type
34     {
35         Unknown = 0, // Uninitialized, parsing error, etc.
36         UserProperty, // User defined attribute (i.e. non-standard)
37 
38         // Common standard attributes
39         Placement,
40         WritingMode,
41         BackgroundColor,
42         BorderColor,
43         BorderStyle,
44         BorderThickness,
45         Color,
46         Padding,
47 
48         // Block element standard attributes
49         SpaceBefore,
50         SpaceAfter,
51         StartIndent,
52         EndIndent,
53         TextIndent,
54         TextAlign,
55         BBox,
56         Width,
57         Height,
58         BlockAlign,
59         InlineAlign,
60         TBorderStyle,
61         TPadding,
62 
63         // Inline element standard attributes
64         BaselineShift,
65         LineHeight,
66         TextDecorationColor,
67         TextDecorationThickness,
68         TextDecorationType,
69         RubyAlign,
70         RubyPosition,
71         GlyphOrientationVertical,
72 
73         // Column-only standard attributes
74         ColumnCount,
75         ColumnGap,
76         ColumnWidths,
77 
78         // List-only standard attributes
79         ListNumbering,
80 
81         // PrintField-only standard attributes
82         Role,
83         checked,
84         Desc,
85 
86         // Table-only standard attributes
87         RowSpan,
88         ColSpan,
89         Headers,
90         Scope,
91         Summary,
92     };
93 
94     enum Owner
95     {
96         UnknownOwner = 0,
97         // User-defined attributes
98         UserProperties,
99         // Standard attributes
100         Layout,
101         List,
102         PrintField,
103         Table,
104         // Translation to other formats
105         XML_1_00,
106         HTML_3_20,
107         HTML_4_01,
108         OEB_1_00,
109         RTF_1_05,
110         CSS_1_00,
111         CSS_2_00,
112     };
113 
114     // Creates a standard attribute. The name is predefined, and the
115     // value is type-checked to conform to the PDF specification.
116     Attribute(Type type, Object *value);
117 
118     // Creates an UserProperty attribute, with an arbitrary name and value.
119     Attribute(GooString &&name, Object *value);
120 
isOk()121     bool isOk() const { return type != Unknown; }
122 
123     // Name, type and value can be set only on construction.
getType()124     Type getType() const { return type; }
getOwner()125     Owner getOwner() const { return owner; }
126     const char *getTypeName() const;
127     const char *getOwnerName() const;
getValue()128     const Object *getValue() const { return &value; }
129     static Object *getDefaultValue(Type type);
130 
131     // The caller gets the ownership of the return GooString and is responsible of deleting it
getName()132     std::unique_ptr<GooString> getName() const { return std::make_unique<GooString>(type == UserProperty ? name.c_str() : getTypeName()); }
133 
134     // The revision is optional, and defaults to zero.
getRevision()135     unsigned int getRevision() const { return revision; }
setRevision(unsigned int revisionA)136     void setRevision(unsigned int revisionA) { revision = revisionA; }
137 
138     // Hidden elements should not be displayed by the user agent
isHidden()139     bool isHidden() const { return hidden; }
setHidden(bool hiddenA)140     void setHidden(bool hiddenA) { hidden = hiddenA; }
141 
142     // The formatted value may be in the PDF, or be left undefined (nullptr).
143     // In the later case the user agent should provide a default representation.
getFormattedValue()144     const char *getFormattedValue() const { return formatted ? formatted->c_str() : nullptr; }
145     void setFormattedValue(const char *formattedA);
146 
147     ~Attribute();
148 
149 private:
150     Type type;
151     Owner owner;
152     unsigned int revision;
153     GooString name;
154     Object value;
155     bool hidden;
156     GooString *formatted;
157 
158     bool checkType(StructElement *element = nullptr);
159     static Type getTypeForName(const char *name, StructElement *element = nullptr);
160     static Attribute *parseUserProperty(Dict *property);
161 
162     friend class StructElement;
163 };
164 
165 class POPPLER_PRIVATE_EXPORT StructElement
166 {
167 public:
168     enum Type
169     {
170         Unknown = 0,
171         MCID, // MCID reference, used internally
172         OBJR, // Object reference, used internally
173 
174         Document,
175         Part,
176         Art,
177         Sect,
178         Div, // Structural elements
179 
180         Span,
181         Quote,
182         Note,
183         Reference,
184         BibEntry, // Inline elements
185         Code,
186         Link,
187         Annot,
188         BlockQuote,
189         Caption,
190         NonStruct,
191         TOC,
192         TOCI,
193         Index,
194         Private,
195 
196         P,
197         H,
198         H1,
199         H2,
200         H3,
201         H4,
202         H5,
203         H6, // Paragraph-like
204 
205         L,
206         LI,
207         Lbl,
208         LBody, // List elements
209 
210         Table,
211         TR,
212         TH,
213         TD,
214         THead,
215         TFoot,
216         TBody, // Table elements
217 
218         Ruby,
219         RB,
220         RT,
221         RP, // Ruby text elements
222         Warichu,
223         WT,
224         WP,
225 
226         Figure,
227         Formula,
228         Form, // Illustration-like elements
229     };
230 
231     static const Ref InvalidRef;
232 
233     const char *getTypeName() const;
getType()234     Type getType() const { return type; }
isOk()235     bool isOk() const { return type != Unknown; }
236     bool isBlock() const;
237     bool isInline() const;
238     bool isGrouping() const;
239 
isContent()240     inline bool isContent() const { return (type == MCID) || isObjectRef(); }
isObjectRef()241     inline bool isObjectRef() const { return (type == OBJR && c->ref != Ref::INVALID()); }
242 
getMCID()243     int getMCID() const { return c->mcid; }
getObjectRef()244     Ref getObjectRef() const { return c->ref; }
getParentRef()245     Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef; }
246     bool hasPageRef() const;
247     bool getPageRef(Ref &ref) const;
getStructTreeRoot()248     StructTreeRoot *getStructTreeRoot() { return treeRoot; }
249 
250     // Optional element identifier.
getID()251     const GooString *getID() const { return isContent() ? nullptr : s->id; }
getID()252     GooString *getID() { return isContent() ? nullptr : s->id; }
253 
254     // Optional ISO language name, e.g. en_US
getLanguage()255     GooString *getLanguage()
256     {
257         if (!isContent() && s->language)
258             return s->language;
259         return parent ? parent->getLanguage() : nullptr;
260     }
getLanguage()261     const GooString *getLanguage() const
262     {
263         if (!isContent() && s->language)
264             return s->language;
265         return parent ? parent->getLanguage() : nullptr;
266     }
267 
268     // Optional revision number, defaults to zero.
getRevision()269     unsigned int getRevision() const { return isContent() ? 0 : s->revision; }
setRevision(unsigned int revision)270     void setRevision(unsigned int revision)
271     {
272         if (isContent())
273             s->revision = revision;
274     }
275 
276     // Optional element title, in human-readable form.
getTitle()277     const GooString *getTitle() const { return isContent() ? nullptr : s->title; }
getTitle()278     GooString *getTitle() { return isContent() ? nullptr : s->title; }
279 
280     // Optional element expanded abbreviation text.
getExpandedAbbr()281     const GooString *getExpandedAbbr() const { return isContent() ? nullptr : s->expandedAbbr; }
getExpandedAbbr()282     GooString *getExpandedAbbr() { return isContent() ? nullptr : s->expandedAbbr; }
283 
getNumChildren()284     unsigned getNumChildren() const { return isContent() ? 0 : s->elements.size(); }
getChild(int i)285     const StructElement *getChild(int i) const { return isContent() ? nullptr : s->elements.at(i); }
getChild(int i)286     StructElement *getChild(int i) { return isContent() ? nullptr : s->elements.at(i); }
287 
appendChild(StructElement * element)288     void appendChild(StructElement *element)
289     {
290         if (!isContent() && element && element->isOk()) {
291             s->elements.push_back(element);
292         }
293     }
294 
getNumAttributes()295     unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); }
getAttribute(int i)296     const Attribute *getAttribute(int i) const { return isContent() ? nullptr : s->attributes.at(i); }
getAttribute(int i)297     Attribute *getAttribute(int i) { return isContent() ? nullptr : s->attributes.at(i); }
298 
appendAttribute(Attribute * attribute)299     void appendAttribute(Attribute *attribute)
300     {
301         if (!isContent() && attribute) {
302             s->attributes.push_back(attribute);
303         }
304     }
305 
306     const Attribute *findAttribute(Attribute::Type attributeType, bool inherit = false, Attribute::Owner owner = Attribute::UnknownOwner) const;
307 
getAltText()308     const GooString *getAltText() const { return isContent() ? nullptr : s->altText; }
getAltText()309     GooString *getAltText() { return isContent() ? nullptr : s->altText; }
310 
getActualText()311     const GooString *getActualText() const { return isContent() ? nullptr : s->actualText; }
getActualText()312     GooString *getActualText() { return isContent() ? nullptr : s->actualText; }
313 
314     // Content text referenced by the element:
315     //
316     // - For MCID reference elements, this is just the text of the
317     //   corresponding marked content object in the page stream, regardless
318     //   of the setting of the "recursive" flag.
319     // - For other elements, if the "recursive" flag is set, the text
320     //   enclosed by *all* the child MCID reference elements of the subtree
321     //   is returned. The text is assembled by traversing the leaf MCID
322     //   reference elements in logical order.
323     // - In any other case, the function returns nullptr.
324     //
325     // A new string is returned, and the ownership passed to the caller.
326     //
327     GooString *getText(bool recursive = true) const { return appendSubTreeText(nullptr, recursive); }
328 
getTextSpans()329     const TextSpanArray getTextSpans() const
330     {
331         if (!isContent())
332             return TextSpanArray();
333         MarkedContentOutputDev mcdev(getMCID(), stmRef);
334         return getTextSpansInternal(mcdev);
335     }
336 
337     ~StructElement();
338 
339 private:
340     GooString *appendSubTreeText(GooString *string, bool recursive) const;
341     const TextSpanArray &getTextSpansInternal(MarkedContentOutputDev &mcdev) const;
342 
343     typedef std::vector<Attribute *> AttrPtrArray;
344     typedef std::vector<StructElement *> ElemPtrArray;
345 
346     struct StructData
347     {
348         Ref parentRef;
349         GooString *altText;
350         GooString *actualText;
351         GooString *id;
352         GooString *title;
353         GooString *expandedAbbr;
354         GooString *language;
355         unsigned int revision;
356         ElemPtrArray elements;
357         AttrPtrArray attributes;
358 
359         StructData();
360         ~StructData();
361 
362         StructData(const StructData &) = delete;
363         StructData &operator=(const StructData &) = delete;
364     };
365 
366     // Data in content elements (MCID, MCR)
367     struct ContentData
368     {
369         union {
370             int mcid;
371             Ref ref;
372         };
373 
ContentDataContentData374         explicit ContentData(int mcidA) : mcid(mcidA) { }
ContentDataContentData375         explicit ContentData(const Ref r) { ref = r; }
376     };
377 
378     // Common data
379     Type type;
380     StructTreeRoot *treeRoot;
381     StructElement *parent;
382     mutable Object pageRef;
383     Object stmRef;
384 
385     union {
386         StructData *s;
387         ContentData *c;
388     };
389 
390     StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, std::set<int> &seen);
391     StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA);
392     StructElement(const Ref ref, StructTreeRoot *treeRootA, StructElement *parentA);
393 
394     void parse(Dict *elementDict);
395     StructElement *parseChild(const Object *ref, Object *childObj, std::set<int> &seen);
396     void parseChildren(Dict *element, std::set<int> &seen);
397     void parseAttributes(Dict *attributes, bool keepExisting = false);
398 
399     friend class StructTreeRoot;
400 };
401 
402 #endif
403