1 /*
2  * Copyright (c) 2012 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * routines to parse robots.txt
22  *
23  * Changelog
24  * 28.09.2013  Tim Ruehsen  created
25  *
26  */
27 
28 #include <config.h>
29 
30 #include <string.h>
31 #include <ctype.h>
32 
33 #include <wget.h>
34 #include "private.h"
35 
36 /**
37  * \file
38  * \brief Robots Exclusion file parser
39  * \defgroup libwget-robots Robots Exclusion file parser
40  * @{
41  *
42  * The purpose of this set of functions is to parse a
43  * Robots Exclusion Standard file into a data structure
44  * for easy access.
45  */
46 
47 struct wget_robots_st {
48 	wget_vector
49 		*paths;    //!< paths found in robots.txt (element: wget_string)
50 	wget_vector
51 		*sitemaps; //!< sitemaps found in robots.txt (element: char *)
52 };
53 
path_free(void * path)54 static void path_free(void *path)
55 {
56 	wget_string *p = path;
57 
58 	xfree(p->p);
59 	xfree(p);
60 }
61 
62 /**
63  * \param[in] data Memory with robots.txt content (with trailing 0-byte)
64  * \param[in] client Name of the client / user-agent
65  * \return Return an allocated wget_robots structure or NULL on error
66  *
67  * The function parses the robots.txt \p data and returns a ROBOTS structure
68  * including a list of the disallowed paths and including a list of the sitemap
69  * files.
70  *
71  * The ROBOTS structure has to be freed by calling wget_robots_free().
72  */
wget_robots_parse(wget_robots ** _robots,const char * data,const char * client)73 int wget_robots_parse(wget_robots **_robots, const char *data, const char *client)
74 {
75 	wget_robots *robots;
76 	wget_string path;
77 	size_t client_length = client ? strlen(client) : 0;
78 	int collect = 0;
79 	const char *p;
80 
81 	if (!data || !*data || !_robots)
82 		return WGET_E_INVALID;
83 
84 	if (!(robots = wget_calloc(1, sizeof(wget_robots))))
85 		return WGET_E_MEMORY;
86 
87 	do {
88 		if (collect < 2 && !wget_strncasecmp_ascii(data, "User-agent:", 11)) {
89 			if (!collect) {
90 				for (data += 11; *data == ' ' || *data == '\t'; data++);
91 				if (client && !wget_strncasecmp_ascii(data, client, client_length)) {
92 					collect = 1;
93 				}
94 				else if (*data == '*') {
95 					collect = 1;
96 				}
97 			} else
98 				collect = 2;
99 		}
100 		else if (collect == 1 && !wget_strncasecmp_ascii(data, "Disallow:", 9)) {
101 			for (data += 9; *data == ' ' || *data == '\t'; data++);
102 			if (*data == '\r' || *data == '\n' || !*data) {
103 				// all allowed
104 				wget_vector_free(&robots->paths);
105 				collect = 2;
106 			} else {
107 				if (!robots->paths) {
108 					if (!(robots->paths = wget_vector_create(32, NULL)))
109 						goto oom;
110 					wget_vector_set_destructor(robots->paths, path_free);
111 				}
112 				for (p = data; *p && !isspace(*p); p++);
113 				path.len = p - data;
114 				if (!(path.p = wget_strmemdup(data, path.len)))
115 					goto oom;
116 				if (wget_vector_add_memdup(robots->paths, &path, sizeof(path)) < 0) {
117 					xfree(path.p);
118 					goto oom;
119 				}
120 			}
121 		}
122 		else if (!wget_strncasecmp_ascii(data, "Sitemap:", 8)) {
123 			for (data += 8; *data==' ' || *data == '\t'; data++);
124 			for (p = data; *p && !isspace(*p); p++);
125 
126 			if (!robots->sitemaps)
127 				if (!(robots->sitemaps = wget_vector_create(4, NULL)))
128 					goto oom;
129 
130 			char *sitemap = wget_strmemdup(data, p - data);
131 			if (!sitemap)
132 				goto oom;
133 			if (wget_vector_add(robots->sitemaps, sitemap) < 0)
134 				goto oom;
135 		}
136 
137 		if ((data = strchr(data, '\n')))
138 			data++; // point to next line
139 	} while (data && *data);
140 
141 /*
142 	for (int it = 0; it < wget_vector_size(robots->paths); it++) {
143 		ROBOTS_PATH *path = wget_vector_get(robots->paths, it);
144 		debug_printf("path '%s'\n", path->path);
145 	}
146 	for (int it = 0; it < wget_vector_size(robots->sitemaps); it++) {
147 		const char *sitemap = wget_vector_get(robots->sitemaps, it);
148 		debug_printf("sitemap '%s'\n", sitemap);
149 	}
150 */
151 
152 	*(_robots) = robots;
153 	return WGET_E_SUCCESS;
154 
155 oom:
156 	wget_robots_free(&robots);
157 	return WGET_E_MEMORY;
158 }
159 
160 /**
161  * \param[in,out] robots Pointer to Pointer to wget_robots structure
162  *
163  * wget_robots_free() free's the formerly allocated wget_robots structure.
164  */
wget_robots_free(wget_robots ** robots)165 void wget_robots_free(wget_robots **robots)
166 {
167 	if (robots && *robots) {
168 		wget_vector_free(&(*robots)->paths);
169 		wget_vector_free(&(*robots)->sitemaps);
170 		xfree(*robots);
171 		*robots = NULL;
172 	}
173 }
174 
175 /**
176  * @param robots Pointer to instance of wget_robots
177  * @return Returns the number of paths listed in \p robots
178  */
wget_robots_get_path_count(wget_robots * robots)179 int wget_robots_get_path_count(wget_robots *robots)
180 {
181 	if (robots)
182 		return wget_vector_size(robots->paths);
183 
184 	return 0;
185 }
186 
187 /**
188  * @param robots Pointer to instance of wget_robots
189  * @param index Index of the wanted path
190  * @return Returns the path at \p index or NULL
191  */
wget_robots_get_path(wget_robots * robots,int index)192 wget_string *wget_robots_get_path(wget_robots *robots, int index)
193 {
194 	if (robots && robots->paths)
195 		return wget_vector_get(robots->paths, index);
196 
197 	return NULL;
198 }
199 
200 /**
201  * @param robots Pointer to instance of wget_robots
202  * @return Returns the number of sitemaps listed in \p robots
203  */
wget_robots_get_sitemap_count(wget_robots * robots)204 int wget_robots_get_sitemap_count(wget_robots *robots)
205 {
206 	if (robots)
207 		return wget_vector_size(robots->sitemaps);
208 
209 	return 0;
210 }
211 
212 /**
213  * @param robots Pointer to instance of wget_robots
214  * @param index Index of the wanted sitemap URL
215  * @return Returns the sitemap URL at \p index or NULL
216  */
wget_robots_get_sitemap(wget_robots * robots,int index)217 const char *wget_robots_get_sitemap(wget_robots *robots, int index)
218 {
219 	if (robots && robots->sitemaps)
220 		return wget_vector_get(robots->sitemaps, index);
221 
222 	return NULL;
223 }
224 
225 /**@}*/
226