1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/pdf/SkPDFDocumentPriv.h"
9 #include "src/pdf/SkPDFTag.h"
10 
11 // Table 333 in PDF 32000-1:2008
tag_name_from_type(SkPDF::DocumentStructureType type)12 static const char* tag_name_from_type(SkPDF::DocumentStructureType type) {
13     switch (type) {
14         #define M(X) case SkPDF::DocumentStructureType::k ## X: return #X
15         M(Document);
16         M(Part);
17         M(Art);
18         M(Sect);
19         M(Div);
20         M(BlockQuote);
21         M(Caption);
22         M(TOC);
23         M(TOCI);
24         M(Index);
25         M(NonStruct);
26         M(Private);
27         M(H);
28         M(H1);
29         M(H2);
30         M(H3);
31         M(H4);
32         M(H5);
33         M(H6);
34         M(P);
35         M(L);
36         M(LI);
37         M(Lbl);
38         M(LBody);
39         M(Table);
40         M(TR);
41         M(TH);
42         M(TD);
43         M(THead);
44         M(TBody);
45         M(TFoot);
46         M(Span);
47         M(Quote);
48         M(Note);
49         M(Reference);
50         M(BibEntry);
51         M(Code);
52         M(Link);
53         M(Annot);
54         M(Ruby);
55         M(RB);
56         M(RT);
57         M(RP);
58         M(Warichu);
59         M(WT);
60         M(WP);
61         M(Figure);
62         M(Formula);
63         M(Form);
64         #undef M
65     }
66     SK_ABORT("bad tag");
67 }
68 
69 struct SkPDFTagNode {
70     SkPDFTagNode* fChildren = nullptr;
71     size_t fChildCount = 0;
72     struct MarkedContentInfo {
73         unsigned fPageIndex;
74         int fMarkId;
75     };
76     SkTArray<MarkedContentInfo> fMarkedContent;
77     int fNodeId;
78     SkPDF::DocumentStructureType fType;
79     SkPDFIndirectReference fRef;
80     enum State {
81         kUnknown,
82         kYes,
83         kNo,
84     } fCanDiscard = kUnknown;
85 };
86 
SkPDFTagTree()87 SkPDFTagTree::SkPDFTagTree() : fArena(4 * sizeof(SkPDFTagNode)) {}
88 
89 SkPDFTagTree::~SkPDFTagTree() = default;
90 
copy(const SkPDF::StructureElementNode & node,SkPDFTagNode * dst,SkArenaAlloc * arena,SkTHashMap<int,SkPDFTagNode * > * nodeMap)91 static void copy(const SkPDF::StructureElementNode& node,
92                  SkPDFTagNode* dst,
93                  SkArenaAlloc* arena,
94                  SkTHashMap<int, SkPDFTagNode*>* nodeMap) {
95     nodeMap->set(node.fNodeId, dst);
96     size_t childCount = node.fChildCount;
97     SkPDFTagNode* children = arena->makeArray<SkPDFTagNode>(childCount);
98     dst->fChildCount = childCount;
99     dst->fNodeId = node.fNodeId;
100     dst->fType = node.fType;
101     dst->fChildren = children;
102     for (size_t i = 0; i < childCount; ++i) {
103         copy(node.fChildren[i], &children[i], arena, nodeMap);
104     }
105 }
106 
init(const SkPDF::StructureElementNode * node)107 void SkPDFTagTree::init(const SkPDF::StructureElementNode* node) {
108     if (node) {
109         fRoot = fArena.make<SkPDFTagNode>();
110         copy(*node, fRoot, &fArena, &fNodeMap);
111     }
112 }
113 
reset()114 void SkPDFTagTree::reset() {
115     fArena.reset();
116     fNodeMap.reset();
117     fMarksPerPage.reset();
118     fRoot = nullptr;
119 }
120 
getMarkIdForNodeId(int nodeId,unsigned pageIndex)121 int SkPDFTagTree::getMarkIdForNodeId(int nodeId, unsigned pageIndex) {
122     if (!fRoot) {
123         return -1;
124     }
125     SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
126     if (!tagPtr) {
127         return -1;
128     }
129     SkPDFTagNode* tag = *tagPtr;
130     SkASSERT(tag);
131     while (fMarksPerPage.size() < pageIndex + 1) {
132         fMarksPerPage.push_back();
133     }
134     SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[pageIndex];
135     int markId = pageMarks.count();
136     tag->fMarkedContent.push_back({pageIndex, markId});
137     pageMarks.push_back(tag);
138     return markId;
139 }
140 
can_discard(SkPDFTagNode * node)141 static bool can_discard(SkPDFTagNode* node) {
142     if (node->fCanDiscard == SkPDFTagNode::kYes) {
143         return true;
144     }
145     if (node->fCanDiscard == SkPDFTagNode::kNo) {
146         return false;
147     }
148     if (!node->fMarkedContent.empty()) {
149         node->fCanDiscard = SkPDFTagNode::kNo;
150         return false;
151     }
152     for (size_t i = 0; i < node->fChildCount; ++i) {
153         if (!can_discard(&node->fChildren[i])) {
154             node->fCanDiscard = SkPDFTagNode::kNo;
155             return false;
156         }
157     }
158     node->fCanDiscard = SkPDFTagNode::kYes;
159     return true;
160 }
161 
162 
prepare_tag_tree_to_emit(SkPDFIndirectReference parent,SkPDFTagNode * node,SkPDFDocument * doc)163 SkPDFIndirectReference prepare_tag_tree_to_emit(SkPDFIndirectReference parent,
164                                                 SkPDFTagNode* node,
165                                                 SkPDFDocument* doc) {
166     SkPDFIndirectReference ref = doc->reserveRef();
167     std::unique_ptr<SkPDFArray> kids = SkPDFMakeArray();
168     SkPDFTagNode* children = node->fChildren;
169     size_t childCount = node->fChildCount;
170     for (size_t i = 0; i < childCount; ++i) {
171         SkPDFTagNode* child = &children[i];
172         if (!(can_discard(child))) {
173             kids->appendRef(prepare_tag_tree_to_emit(ref, child, doc));
174         }
175     }
176     for (const SkPDFTagNode::MarkedContentInfo& info : node->fMarkedContent) {
177         std::unique_ptr<SkPDFDict> mcr = SkPDFMakeDict("MCR");
178         mcr->insertRef("Pg", doc->getPage(info.fPageIndex));
179         mcr->insertInt("MCID", info.fMarkId);
180         kids->appendObject(std::move(mcr));
181     }
182     node->fRef = ref;
183     SkPDFDict dict("StructElem");
184     dict.insertName("S", tag_name_from_type(node->fType));
185     dict.insertRef("P", parent);
186     dict.insertObject("K", std::move(kids));
187     return doc->emit(dict, ref);
188 }
189 
makeStructTreeRoot(SkPDFDocument * doc)190 SkPDFIndirectReference SkPDFTagTree::makeStructTreeRoot(SkPDFDocument* doc) {
191     if (!fRoot) {
192         return SkPDFIndirectReference();
193     }
194     if (can_discard(fRoot)) {
195         SkDEBUGFAIL("PDF has tag tree but no marked content.");
196     }
197     SkPDFIndirectReference ref = doc->reserveRef();
198 
199     unsigned pageCount = SkToUInt(doc->pageCount());
200 
201     // Build the StructTreeRoot.
202     SkPDFDict structTreeRoot("StructTreeRoot");
203     structTreeRoot.insertRef("K", prepare_tag_tree_to_emit(ref, fRoot, doc));
204     structTreeRoot.insertInt("ParentTreeNextKey", SkToInt(pageCount));
205 
206     // Build the parent tree, which is a mapping from the marked
207     // content IDs on each page to their corressponding tags.
208     SkPDFDict parentTree("ParentTree");
209     auto parentTreeNums = SkPDFMakeArray();
210 
211     SkASSERT(fMarksPerPage.size() <= pageCount);
212     for (size_t j = 0; j < fMarksPerPage.size(); ++j) {
213         const SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[j];
214         SkPDFArray markToTagArray;
215         for (SkPDFTagNode* mark : pageMarks) {
216             SkASSERT(mark->fRef);
217             markToTagArray.appendRef(mark->fRef);
218         }
219         parentTreeNums->appendInt(j);
220         parentTreeNums->appendRef(doc->emit(markToTagArray));
221     }
222     parentTree.insertObject("Nums", std::move(parentTreeNums));
223     structTreeRoot.insertRef("ParentTree", doc->emit(parentTree));
224     return doc->emit(structTreeRoot, ref);
225 }
226 
227