1 /*
2  * Copyright (c) 2012 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * css parsing routines
22  *
23  * Changelog
24  * 03.07.2012  Tim Ruehsen  created
25  *
26  * A parser using the flex tokenizer, created with flex tokens from
27  *   https://www.w3.org/TR/css3-syntax/
28  *
29  * TODO:
30  *  - since we are just interested in @import ... and url(...), we could use
31  *    a simplistic hand-written parser which might be much smaller and faster
32  */
33 
34 #include <config.h>
35 
36 #include <stddef.h>
37 #include <unistd.h>
38 #include <string.h>
39 #include <c-ctype.h>
40 #include <fcntl.h>
41 #include <sys/stat.h>
42 #ifdef HAVE_MMAP
43 #include <sys/mman.h>
44 #endif
45 
46 #include <wget.h>
47 #include "private.h"
48 
49 #include "css_tokenizer.h"
50 
// Interface of the flex tokenizer, see css_tokenizer.c

// handle types used by the tokenizer functions below
typedef void *yyscan_t;
typedef struct yy_buffer_state *YY_BUFFER_STATE;

// scanner setup and teardown
int yylex_init(yyscan_t *scanner);
int yylex_destroy(yyscan_t yyscanner);

// input selection
YY_BUFFER_STATE yy_scan_string(const char *yystr, yyscan_t yyscanner);
YY_BUFFER_STATE yy_scan_bytes(const char *yystr, int len, yyscan_t yyscanner);

// token retrieval: token id, token text and token length
int yylex(yyscan_t yyscanner);
char *yyget_text(yyscan_t yyscanner);
int yyget_leng(yyscan_t yyscanner);

// memory allocation functions, implemented in this file
void *yyalloc(size_t size);
void *yyrealloc(void *p, size_t size);
63 
// Memory allocation for the tokenizer (prototype shared with css_tokenizer.c):
// simply forwards the request to wget_malloc().
void *yyalloc(size_t size)
{
	void *mem = wget_malloc(size);

	return mem;
}
// Memory re-allocation for the tokenizer (prototype shared with css_tokenizer.c):
// simply forwards the request to wget_realloc().
void *yyrealloc(void *p, size_t size)
{
	void *mem = wget_realloc(p, size);

	return mem;
}
70 
/**
 * Tokenize a CSS buffer and report the URIs and the @charset name found in it.
 *
 * buf               CSS data to scan
 * len               number of bytes in buf (narrowed to int for yy_scan_bytes())
 * callback_uri      called for every URI found, may be NULL
 * callback_encoding called for the name following @charset, may be NULL
 * user_ctx          passed unchanged as first argument to both callbacks
 *
 * URIs are taken from url(...) tokens and from the string following @import.
 * Surrounding quotes, the url( prefix, the closing ) and adjacent whitespace
 * are not part of the reported URI. The last argument of callback_uri is the
 * summed length of all tokens scanned before, plus the offset of the URI
 * within the current token.
 *
 * The function returns without calling any callback if the scanner can't be
 * initialized.
 */
void wget_css_parse_buffer(
	const char *buf,
	size_t len,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	int token;
	size_t length, pos = 0;
	char *text;
	yyscan_t scanner;

	// a non-zero return value means failure, the scanner handle is unusable then
	if (yylex_init(&scanner))
		return;

	// NOTE: yy_scan_bytes() takes an int, so len is narrowed here
	yy_scan_bytes(buf, (int) len, scanner);

	while ((token = yylex(scanner)) != CSSEOF) {
		if (token == IMPORT_SYM) {
			// e.g. @import "https://example.com/index.html"
			pos += yyget_leng(scanner);

			// skip whitespace before URI/STRING
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING or URI
			if (token == STRING)
				token = URI;
		}

		if (token == URI && callback_uri) {
			// e.g. url(https://example.com/index.html)
			text = yyget_text(scanner);
			length = yyget_leng(scanner);

			if (*text == '\'' || *text == '\"') {
				// a string - remove the quotes
				callback_uri(user_ctx, text + 1, length - 2, pos + 1);
			} else {
				// extract URI from url(...)
				if (!wget_strncasecmp_ascii(text, "url(", 4)) {
					char *otext = text;

					// remove trailing ) and any spaces before
					// (the loop stops at the '(' of "url(" at the latest)
					for (length--; c_isspace(text[length - 1]); length--);

					// remove leading url( and any spaces after
					for (length -= 4, text += 4; length && c_isspace(*text); text++, length--);

					// remove quotes
					if (length && (*text == '\'' || *text == '\"')) {
						text++;
						length--;
					}

					if (length && (text[length - 1] == '\'' || text[length - 1] == '\"'))
						length--;

					callback_uri(user_ctx, text, length, pos + (text - otext));
				}
			}
		} else if (token == CHARSET_SYM && callback_encoding) {
			// e.g. @charset "UTF-8"
			pos += yyget_leng(scanner);

			// skip whitespace before charset name
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING
			if (token == STRING) {
				text = yyget_text(scanner);
				length = yyget_leng(scanner);

				if (*text == '\'' || *text == '\"') {
					// a string - remove the quotes
					callback_encoding(user_ctx, text + 1, length - 2);
				} else {
					// a string without quotes
					callback_encoding(user_ctx, text, length);
				}
			} else {
				error_printf(_("Unknown token after @charset: %d\n"), token);
			}
		}

		// The input may have ended while skipping whitespace after @import or @charset.
		// Stop here, calling yylex() again after it reported the end of input is not defined.
		if (token == CSSEOF)
			break;

		pos += yyget_leng(scanner);
	}

	yylex_destroy(scanner);
}
160 
/**
 * Read CSS data from a file (or from STDIN) and hand it to wget_css_parse_buffer().
 *
 * fname             name of the file to parse, "-" means: read from STDIN
 * callback_uri      passed through to wget_css_parse_buffer()
 * callback_encoding passed through to wget_css_parse_buffer()
 * user_ctx          passed through to wget_css_parse_buffer()
 *
 * Empty input is not parsed. An error message is printed if the file
 * can't be opened or its content can't be read.
 */
void wget_css_parse_file(
	const char *fname,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	if (strcmp(fname,"-")) {
		int fd;

		if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
			struct stat st;

			// empty files have nothing to parse (and a zero-length mapping is invalid)
			if (fstat(fd, &st) == 0 && st.st_size > 0) {
				size_t size = (size_t) st.st_size;
#ifdef HAVE_MMAP
				// Map exactly the file size, read-only. The parser gets an explicit length,
				// so no trailing 0 byte is needed. Mapping and writing one byte more would
				// touch a page beyond the end of the file if the size is a multiple of the
				// page size.
				char *buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

				if (buf != MAP_FAILED) {
					wget_css_parse_buffer(buf, size, callback_uri, callback_encoding, user_ctx);
					munmap(buf, size); // same length as used for mmap()
				} else
					error_printf(_("Failed to read %s\n"), fname);
#else
				char *buf = wget_malloc(size + 1);

				if (buf) {
					size_t nread = 0;
					ssize_t rc;

					// read() may return less than requested, or -1 on error
					while (nread < size && (rc = read(fd, buf + nread, size - nread)) > 0)
						nread += (size_t) rc;

					if (nread > 0) {
						buf[nread] = 0;
						// only parse what has really been read
						wget_css_parse_buffer(buf, nread, callback_uri, callback_encoding, user_ctx);
					} else
						error_printf(_("Failed to read %s\n"), fname);

					xfree(buf);
				}
#endif
			}
			close(fd);
		} else
			error_printf(_("Failed to open %s\n"), fname);
	} else {
		// read data from STDIN, buffering everything in memory before parsing
		char tmp[4096];
		ssize_t nbytes;
		wget_buffer buf;

		wget_buffer_init(&buf, NULL, 4096);

		while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
			wget_buffer_memcat(&buf, tmp, (size_t) nbytes);
		}

		if (buf.length)
			wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx);

		wget_buffer_deinit(&buf);
	}
}
214