1 /*
2  * nghttp2 - HTTP/2 C Library
3  *
4  * Copyright (c) 2012 Tatsuhiro Tsujikawa
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be
15  * included in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24  */
25 #include "HtmlParser.h"
26 
27 #include <libxml/uri.h>
28 
29 #include "util.h"
30 
31 namespace nghttp2 {
32 
ParserData(const std::string & base_uri)33 ParserData::ParserData(const std::string &base_uri)
34     : base_uri(base_uri), inside_head(0) {}
35 
HtmlParser(const std::string & base_uri)36 HtmlParser::HtmlParser(const std::string &base_uri)
37     : base_uri_(base_uri), parser_ctx_(nullptr), parser_data_(base_uri) {}
38 
~HtmlParser()39 HtmlParser::~HtmlParser() { htmlFreeParserCtxt(parser_ctx_); }
40 
41 namespace {
// Looks up the attribute |name| (case-insensitively) in |attrs| and
// returns its value.
//
// |attrs| is the array handed to a SAX startElement callback: a
// sequence of name/value pointer pairs terminated by a null name, or
// nullptr when the element carries no attributes at all.
//
// Returns an empty StringRef when |attrs| is nullptr, when no
// attribute matches, or when the matching attribute has no value.
StringRef get_attr(const xmlChar **attrs, const StringRef &name) {
  if (attrs == nullptr) {
    return StringRef{};
  }
  for (; *attrs; attrs += 2) {
    if (!util::strieq(
            StringRef{attrs[0],
                      strlen(reinterpret_cast<const char *>(attrs[0]))},
            name)) {
      continue;
    }
    // The HTML parser reports attributes written without "=value"
    // (e.g. <img src>) with a null value pointer.  Calling strlen()
    // on it would be undefined behaviour, so report it as empty.
    if (attrs[1] == nullptr) {
      return StringRef{};
    }
    return StringRef{attrs[1],
                     strlen(reinterpret_cast<const char *>(attrs[1]))};
  }
  return StringRef{};
}
56 } // namespace
57 
58 namespace {
59 ResourceType
get_resource_type_for_preload_as(const StringRef & attribute_value)60 get_resource_type_for_preload_as(const StringRef &attribute_value) {
61   if (util::strieq_l("image", attribute_value)) {
62     return REQ_IMG;
63   } else if (util::strieq_l("style", attribute_value)) {
64     return REQ_CSS;
65   } else if (util::strieq_l("script", attribute_value)) {
66     return REQ_UNBLOCK_JS;
67   } else {
68     return REQ_OTHERS;
69   }
70 }
71 } // namespace
72 
73 namespace {
add_link(ParserData * parser_data,const StringRef & uri,ResourceType res_type)74 void add_link(ParserData *parser_data, const StringRef &uri,
75               ResourceType res_type) {
76   auto u = xmlBuildURI(
77       reinterpret_cast<const xmlChar *>(uri.c_str()),
78       reinterpret_cast<const xmlChar *>(parser_data->base_uri.c_str()));
79   if (u) {
80     parser_data->links.push_back(
81         std::make_pair(reinterpret_cast<char *>(u), res_type));
82     free(u);
83   }
84 }
85 } // namespace
86 
87 namespace {
start_element_func(void * user_data,const xmlChar * src_name,const xmlChar ** attrs)88 void start_element_func(void *user_data, const xmlChar *src_name,
89                         const xmlChar **attrs) {
90   auto parser_data = static_cast<ParserData *>(user_data);
91   auto name =
92       StringRef{src_name, strlen(reinterpret_cast<const char *>(src_name))};
93   if (util::strieq_l("head", name)) {
94     ++parser_data->inside_head;
95   }
96   if (util::strieq_l("link", name)) {
97     auto rel_attr = get_attr(attrs, StringRef::from_lit("rel"));
98     auto href_attr = get_attr(attrs, StringRef::from_lit("href"));
99     if (rel_attr.empty() || href_attr.empty()) {
100       return;
101     }
102     if (util::strieq_l("shortcut icon", rel_attr)) {
103       add_link(parser_data, href_attr, REQ_OTHERS);
104     } else if (util::strieq_l("stylesheet", rel_attr)) {
105       add_link(parser_data, href_attr, REQ_CSS);
106     } else if (util::strieq_l("preload", rel_attr)) {
107       auto as_attr = get_attr(attrs, StringRef::from_lit("as"));
108       if (as_attr.empty()) {
109         return;
110       }
111       add_link(parser_data, href_attr,
112                get_resource_type_for_preload_as(as_attr));
113     }
114   } else if (util::strieq_l("img", name)) {
115     auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
116     if (src_attr.empty()) {
117       return;
118     }
119     add_link(parser_data, src_attr, REQ_IMG);
120   } else if (util::strieq_l("script", name)) {
121     auto src_attr = get_attr(attrs, StringRef::from_lit("src"));
122     if (src_attr.empty()) {
123       return;
124     }
125     if (parser_data->inside_head) {
126       add_link(parser_data, src_attr, REQ_JS);
127     } else {
128       add_link(parser_data, src_attr, REQ_UNBLOCK_JS);
129     }
130   }
131 }
132 } // namespace
133 
134 namespace {
end_element_func(void * user_data,const xmlChar * name)135 void end_element_func(void *user_data, const xmlChar *name) {
136   auto parser_data = static_cast<ParserData *>(user_data);
137   if (util::strieq_l(
138           "head",
139           StringRef{name, strlen(reinterpret_cast<const char *>(name))})) {
140     --parser_data->inside_head;
141   }
142 }
143 } // namespace
144 
145 namespace {
// SAX callback table passed to htmlCreatePushParserCtxt() in
// HtmlParser::parse_chunk().  The initializer is positional, so each
// entry is labelled with the xmlSAXHandler member it is meant to
// fill.  Only the element start/end callbacks are installed; every
// other slot is left null.
xmlSAXHandler saxHandler = {
    nullptr,             // internalSubsetSAXFunc
    nullptr,             // isStandaloneSAXFunc
    nullptr,             // hasInternalSubsetSAXFunc
    nullptr,             // hasExternalSubsetSAXFunc
    nullptr,             // resolveEntitySAXFunc
    nullptr,             // getEntitySAXFunc
    nullptr,             // entityDeclSAXFunc
    nullptr,             // notationDeclSAXFunc
    nullptr,             // attributeDeclSAXFunc
    nullptr,             // elementDeclSAXFunc
    nullptr,             // unparsedEntityDeclSAXFunc
    nullptr,             // setDocumentLocatorSAXFunc
    nullptr,             // startDocumentSAXFunc
    nullptr,             // endDocumentSAXFunc
    &start_element_func, // startElementSAXFunc
    &end_element_func,   // endElementSAXFunc
    nullptr,             // referenceSAXFunc
    nullptr,             // charactersSAXFunc
    nullptr,             // ignorableWhitespaceSAXFunc
    nullptr,             // processingInstructionSAXFunc
    nullptr,             // commentSAXFunc
    nullptr,             // warningSAXFunc
    nullptr,             // errorSAXFunc
    nullptr,             // fatalErrorSAXFunc
    nullptr,             // getParameterEntitySAXFunc
    nullptr,             // cdataBlockSAXFunc
    nullptr,             // externalSubsetSAXFunc
    0,                   // unsigned int initialized
    nullptr,             // void * _private
    nullptr,             // startElementNsSAX2Func
    nullptr,             // endElementNsSAX2Func
    nullptr,             // xmlStructuredErrorFunc
};
180 } // namespace
181 
parse_chunk(const char * chunk,size_t size,int fin)182 int HtmlParser::parse_chunk(const char *chunk, size_t size, int fin) {
183   if (!parser_ctx_) {
184     parser_ctx_ =
185         htmlCreatePushParserCtxt(&saxHandler, &parser_data_, chunk, size,
186                                  base_uri_.c_str(), XML_CHAR_ENCODING_NONE);
187     if (!parser_ctx_) {
188       return -1;
189     } else {
190       if (fin) {
191         return parse_chunk_internal(nullptr, 0, fin);
192       } else {
193         return 0;
194       }
195     }
196   } else {
197     return parse_chunk_internal(chunk, size, fin);
198   }
199 }
200 
parse_chunk_internal(const char * chunk,size_t size,int fin)201 int HtmlParser::parse_chunk_internal(const char *chunk, size_t size, int fin) {
202   int rv = htmlParseChunk(parser_ctx_, chunk, size, fin);
203   if (rv == 0) {
204     return 0;
205   } else {
206     return -1;
207   }
208 }
209 
210 const std::vector<std::pair<std::string, ResourceType>> &
get_links() const211 HtmlParser::get_links() const {
212   return parser_data_.links;
213 }
214 
clear_links()215 void HtmlParser::clear_links() { parser_data_.links.clear(); }
216 
217 } // namespace nghttp2
218