1 /*
2 * Copyright (c) 2012 Tim Ruehsen
3 * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4 *
5 * This file is part of libwget.
6 *
7 * Libwget is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * Libwget is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libwget. If not, see <https://www.gnu.org/licenses/>.
19 *
20 *
21 * routines to parse robots.txt
22 *
23 * Changelog
24 * 28.09.2013 Tim Ruehsen created
25 *
26 */
27
28 #include <config.h>
29
30 #include <string.h>
31 #include <ctype.h>
32
33 #include <wget.h>
34 #include "private.h"
35
36 /**
37 * \file
38 * \brief Robots Exclusion file parser
39 * \defgroup libwget-robots Robots Exclusion file parser
40 * @{
41 *
42 * The purpose of this set of functions is to parse a
43 * Robots Exclusion Standard file into a data structure
44 * for easy access.
45 */
46
47 struct wget_robots_st {
48 wget_vector
49 *paths; //!< paths found in robots.txt (element: wget_string)
50 wget_vector
51 *sitemaps; //!< sitemaps found in robots.txt (element: char *)
52 };
53
path_free(void * path)54 static void path_free(void *path)
55 {
56 wget_string *p = path;
57
58 xfree(p->p);
59 xfree(p);
60 }
61
62 /**
63 * \param[in] data Memory with robots.txt content (with trailing 0-byte)
64 * \param[in] client Name of the client / user-agent
65 * \return Return an allocated wget_robots structure or NULL on error
66 *
67 * The function parses the robots.txt \p data and returns a ROBOTS structure
68 * including a list of the disallowed paths and including a list of the sitemap
69 * files.
70 *
71 * The ROBOTS structure has to be freed by calling wget_robots_free().
72 */
wget_robots_parse(wget_robots ** _robots,const char * data,const char * client)73 int wget_robots_parse(wget_robots **_robots, const char *data, const char *client)
74 {
75 wget_robots *robots;
76 wget_string path;
77 size_t client_length = client ? strlen(client) : 0;
78 int collect = 0;
79 const char *p;
80
81 if (!data || !*data || !_robots)
82 return WGET_E_INVALID;
83
84 if (!(robots = wget_calloc(1, sizeof(wget_robots))))
85 return WGET_E_MEMORY;
86
87 do {
88 if (collect < 2 && !wget_strncasecmp_ascii(data, "User-agent:", 11)) {
89 if (!collect) {
90 for (data += 11; *data == ' ' || *data == '\t'; data++);
91 if (client && !wget_strncasecmp_ascii(data, client, client_length)) {
92 collect = 1;
93 }
94 else if (*data == '*') {
95 collect = 1;
96 }
97 } else
98 collect = 2;
99 }
100 else if (collect == 1 && !wget_strncasecmp_ascii(data, "Disallow:", 9)) {
101 for (data += 9; *data == ' ' || *data == '\t'; data++);
102 if (*data == '\r' || *data == '\n' || !*data) {
103 // all allowed
104 wget_vector_free(&robots->paths);
105 collect = 2;
106 } else {
107 if (!robots->paths) {
108 if (!(robots->paths = wget_vector_create(32, NULL)))
109 goto oom;
110 wget_vector_set_destructor(robots->paths, path_free);
111 }
112 for (p = data; *p && !isspace(*p); p++);
113 path.len = p - data;
114 if (!(path.p = wget_strmemdup(data, path.len)))
115 goto oom;
116 if (wget_vector_add_memdup(robots->paths, &path, sizeof(path)) < 0) {
117 xfree(path.p);
118 goto oom;
119 }
120 }
121 }
122 else if (!wget_strncasecmp_ascii(data, "Sitemap:", 8)) {
123 for (data += 8; *data==' ' || *data == '\t'; data++);
124 for (p = data; *p && !isspace(*p); p++);
125
126 if (!robots->sitemaps)
127 if (!(robots->sitemaps = wget_vector_create(4, NULL)))
128 goto oom;
129
130 char *sitemap = wget_strmemdup(data, p - data);
131 if (!sitemap)
132 goto oom;
133 if (wget_vector_add(robots->sitemaps, sitemap) < 0)
134 goto oom;
135 }
136
137 if ((data = strchr(data, '\n')))
138 data++; // point to next line
139 } while (data && *data);
140
141 /*
142 for (int it = 0; it < wget_vector_size(robots->paths); it++) {
143 ROBOTS_PATH *path = wget_vector_get(robots->paths, it);
144 debug_printf("path '%s'\n", path->path);
145 }
146 for (int it = 0; it < wget_vector_size(robots->sitemaps); it++) {
147 const char *sitemap = wget_vector_get(robots->sitemaps, it);
148 debug_printf("sitemap '%s'\n", sitemap);
149 }
150 */
151
152 *(_robots) = robots;
153 return WGET_E_SUCCESS;
154
155 oom:
156 wget_robots_free(&robots);
157 return WGET_E_MEMORY;
158 }
159
160 /**
161 * \param[in,out] robots Pointer to Pointer to wget_robots structure
162 *
163 * wget_robots_free() free's the formerly allocated wget_robots structure.
164 */
wget_robots_free(wget_robots ** robots)165 void wget_robots_free(wget_robots **robots)
166 {
167 if (robots && *robots) {
168 wget_vector_free(&(*robots)->paths);
169 wget_vector_free(&(*robots)->sitemaps);
170 xfree(*robots);
171 *robots = NULL;
172 }
173 }
174
175 /**
176 * @param robots Pointer to instance of wget_robots
177 * @return Returns the number of paths listed in \p robots
178 */
wget_robots_get_path_count(wget_robots * robots)179 int wget_robots_get_path_count(wget_robots *robots)
180 {
181 if (robots)
182 return wget_vector_size(robots->paths);
183
184 return 0;
185 }
186
187 /**
188 * @param robots Pointer to instance of wget_robots
189 * @param index Index of the wanted path
190 * @return Returns the path at \p index or NULL
191 */
wget_robots_get_path(wget_robots * robots,int index)192 wget_string *wget_robots_get_path(wget_robots *robots, int index)
193 {
194 if (robots && robots->paths)
195 return wget_vector_get(robots->paths, index);
196
197 return NULL;
198 }
199
200 /**
201 * @param robots Pointer to instance of wget_robots
202 * @return Returns the number of sitemaps listed in \p robots
203 */
wget_robots_get_sitemap_count(wget_robots * robots)204 int wget_robots_get_sitemap_count(wget_robots *robots)
205 {
206 if (robots)
207 return wget_vector_size(robots->sitemaps);
208
209 return 0;
210 }
211
212 /**
213 * @param robots Pointer to instance of wget_robots
214 * @param index Index of the wanted sitemap URL
215 * @return Returns the sitemap URL at \p index or NULL
216 */
wget_robots_get_sitemap(wget_robots * robots,int index)217 const char *wget_robots_get_sitemap(wget_robots *robots, int index)
218 {
219 if (robots && robots->sitemaps)
220 return wget_vector_get(robots->sitemaps, index);
221
222 return NULL;
223 }
224
225 /**@}*/
226