/*
** @(#) $Id$
**
** W3C Webbot can be found at "http://www.w3.org/Robot/"
**
** Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
** Institute of Technology, Institut National de Recherche en
** Informatique et en Automatique, Keio University). All Rights
** Reserved. This program is distributed under the W3C's Software
** Intellectual Property License. This program is distributed in the hope
** that it will be useful, but WITHOUT ANY WARRANTY; without even the
** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
** details.
**
** Authors:
** JP John Punin
**
** History:
** Oct 1998 Written
*/
22
23 #include "HTRobMan.h"
24 #include "RobotTxt.h"
25
new_user_agent(void)26 PUBLIC UserAgent * new_user_agent (void)
27 {
28 UserAgent * ua;
29 if ((ua = (UserAgent *) HT_CALLOC(1, sizeof(UserAgent))) == NULL)
30 HT_OUTOFMEM("new_user_agent");
31 ua->disallow = HTList_new();
32 return ua;
33 }
34
get_name_user_agent(UserAgent * ua)35 PUBLIC char * get_name_user_agent (UserAgent * ua)
36 {
37 return ua ? ua->name : NULL;
38 }
39
get_disallow_user_agent(UserAgent * ua)40 PUBLIC HTList * get_disallow_user_agent (UserAgent * ua)
41 {
42 return ua ? ua->disallow : NULL;
43 }
44
set_name_user_agent(UserAgent * ua,char * name)45 PUBLIC BOOL set_name_user_agent (UserAgent * ua, char * name)
46 {
47 if (ua && name) {
48 StrAllocCopy(ua->name, name);
49 return YES;
50 }
51 return NO;
52 }
53
add_disallow_user_agent(UserAgent * ua,char * disallow)54 PUBLIC BOOL add_disallow_user_agent (UserAgent * ua, char * disallow)
55 {
56 if (ua && disallow) {
57 char * da = NULL;
58 StrAllocCopy(da, disallow);
59
60 /* @@@ Should be an association list (HTAssocList) @@@ */
61 return HTList_addObject(ua->disallow, da);
62 }
63 return NO;
64 }
65
delete_user_agent(UserAgent * ua)66 PUBLIC BOOL delete_user_agent (UserAgent * ua)
67 {
68 if (ua) {
69 HT_FREE(ua->name);
70 if (ua->disallow) {
71 HTList *cur = ua->disallow;
72 char *pres;
73 while ((pres = (char *) HTList_nextObject(cur)))
74 HT_FREE(pres);
75 HTList_delete(ua->disallow);
76 }
77 return YES;
78 }
79 return NO;
80 }
81
delete_all_user_agents(HTList * user_agents)82 PUBLIC BOOL delete_all_user_agents(HTList *user_agents)
83 {
84 if (user_agents) {
85 HTList *cur = user_agents;
86 UserAgent *pres;
87 while ((pres = (UserAgent *) HTList_nextObject(cur)))
88 delete_user_agent(pres);
89 return HTList_delete(user_agents);
90 }
91 return NO;
92 }
93
get_regular_expression(HTList * user_agents,char * name_robot)94 PUBLIC char * get_regular_expression (HTList * user_agents, char * name_robot)
95 {
96 if (user_agents && name_robot) {
97 HTChunk *ch = HTChunk_new (1024);
98 HTList *cur = user_agents;
99 UserAgent *pres;
100 UserAgent *ua_gen=NULL;
101 int found=0;
102
103 while ((pres = (UserAgent *) HTList_nextObject(cur))) {
104 char *name = get_name_user_agent(pres);
105
106 if(!strcmp(name,"*"))
107 ua_gen = pres;
108
109 if(!strcmp(name,name_robot)) {
110 put_string_disallow(ch,pres);
111 found = 1;
112 }
113 }
114 if(!found && ua_gen) put_string_disallow(ch,ua_gen);
115
116 return (HTChunk_toCString (ch));
117 }
118 return NULL;
119 }
120
put_string_disallow(HTChunk * ch,UserAgent * ua)121 PUBLIC BOOL put_string_disallow (HTChunk * ch, UserAgent * ua)
122 {
123 if (ch && ua) {
124 HTList *cur = get_disallow_user_agent(ua);
125 char *pres;
126 int first = 1;
127
128 while ((pres = (char *) HTList_nextObject(cur))) {
129 if(!first)
130 HTChunk_puts (ch,"|");
131 else
132 first = 0;
133 HTChunk_puts (ch,pres);
134 }
135 return YES;
136 }
137 return NO;
138 }
139
print_user_agent(UserAgent * ua)140 PUBLIC void print_user_agent(UserAgent *ua)
141 {
142 HTList *cur = ua->disallow;
143 char *pres;
144 HTTRACE(APP_TRACE, "User Agent : %s \n" _ ua->name);
145 while ((pres = (char*) HTList_nextObject(cur)))
146 HTTRACE(APP_TRACE, "Disallow : %s \n" _ pres);
147 }
148
print_all_user_agents(HTList * user_agents)149 PUBLIC void print_all_user_agents(HTList * user_agents)
150 {
151 HTList *cur = user_agents;
152 UserAgent *pres;
153 while ((pres = (UserAgent *) HTList_nextObject(cur)))
154 {
155 HTTRACE(APP_TRACE, "\nNew User Agent\n");
156 print_user_agent(pres);
157 }
158 }
159
get_all_user_agents(char * rob_str)160 PUBLIC HTList * get_all_user_agents(char * rob_str)
161 {
162 if (rob_str) {
163 char * ptr = rob_str;
164 HTList * user_agents = HTList_new();
165
166 /* skip blank spaces */
167 while(isspace((int)*ptr))
168 ptr++;
169
170 /* skip comments */
171 ptr = skip_comments(ptr);
172
173 if(!get_user_agents(ptr,user_agents))
174 HTTRACE(APP_TRACE, "Something is wrong in robots.txt\n");
175
176 return user_agents;
177 }
178 return NULL;
179 }
180
/*
**  Skip consecutive comment lines.
**
**  When `ptr` points at '#', the rest of that line and any whitespace that
**  follows are skipped; this repeats while the next character is again
**  '#'. A comment that ends at the terminating NUL (no final newline) stops
**  the scan at that NUL.
**
**  Returns the first position that does not start a comment; `ptr` itself
**  when it is NULL or does not point at '#'.
*/
PUBLIC char * skip_comments(char *ptr)
{
    if (!ptr)
        return ptr;

    while (*ptr == '#') {
        /* Stop at end of text as well as at end of line. */
        while (*ptr != '\0' && *ptr != '\n')
            ptr++;
        while (isspace((unsigned char) *ptr))
            ptr++;
    }
    return ptr;
}
193
/*
**  Copy characters from `robot_str` into `name` up to, but not including,
**  the first newline, '#' or terminating NUL, then NUL-terminate `name`.
**  An empty `robot_str` yields an empty `name`.
**
**  No length is passed in, so the caller must supply a `name` buffer that
**  can hold the whole line plus the terminator.
*/
PUBLIC void scan_name_until_eoline(char *robot_str, char *name)
{
    char *ptr = robot_str;
    char *ntr = name;

    /* Testing for NUL first keeps the scan inside the source string. */
    while (*ptr != '\0' && *ptr != '\n' && *ptr != '#')
        *ntr++ = *ptr++;
    *ntr = '\0';
}
207
/*
**  Copy characters from `robot_str` into `name` up to, but not including,
**  the first whitespace character, '#' or terminating NUL, then
**  NUL-terminate `name`. An empty `robot_str` yields an empty `name`.
**
**  No length is passed in, so the caller must supply a `name` buffer that
**  can hold the whole token plus the terminator.
*/
PUBLIC void scan_name_until_space(char *robot_str, char *name)
{
    char *ptr = robot_str;
    char *ntr = name;

    /*
    ** Testing for NUL first keeps the scan inside the source string; the
    ** unsigned char cast keeps bytes above 0x7F valid for isspace().
    */
    while (*ptr != '\0' && *ptr != '#' && !isspace((unsigned char) *ptr))
        *ntr++ = *ptr++;
    *ntr = '\0';
}
221
get_user_agents(char * ptr,HTList * user_agents)222 PUBLIC BOOL get_user_agents(char * ptr, HTList *user_agents)
223 {
224 char *uastr = "user-agent:";
225 char *disstr = "disallow:";
226 int luastr = 10;
227 int ldisstr = 9;
228 char name[2000];
229 int indices[200];
230 int i = 0;
231 if (ptr && !strncasecomp(ptr,uastr,luastr)) {
232 UserAgent *ua = NULL;
233 do {
234 i=0;
235 do {
236 ua = new_user_agent();
237 HTList_appendObject(user_agents,(void *)ua);
238 indices[i++] = HTList_indexOf(user_agents, (void *)ua);
239 ptr += luastr + 1;
240 while(isspace((int)*ptr))
241 ptr++;
242 scan_name_until_eoline(ptr,name);
243 ptr += strlen(name) + 1;
244 while(isspace((int)*ptr))
245 ptr++;
246 ptr = skip_comments(ptr);
247 set_name_user_agent(ua,name);
248 } while(!strncasecomp(ptr,uastr,luastr));
249
250 if(!strncasecomp(ptr, disstr,ldisstr))
251 {
252 do {
253 ptr += ldisstr + 1;
254 scan_name_until_space(ptr,name);
255 ptr += strlen(name) + 1;
256 while(isspace((int)*ptr))
257 ptr++;
258 ptr = skip_comments(ptr);
259 if(i==1)
260 add_disallow_user_agent(ua,name);
261 else
262 {
263 int j;
264 for(j = 0 ; j < i ; j++)
265 {
266 ua = HTList_objectAt(user_agents, indices[j]);
267 add_disallow_user_agent(ua,name);
268 }
269 }
270 } while(!strncasecomp(ptr,disstr,ldisstr));
271 }
272 else
273 return NO;
274
275 } while(!strncasecomp(ptr,uastr,luastr));
276 return YES;
277 }
278 else
279 return NO;
280 }
281
scan_robots_txt(char * rob_str,char * name_robot)282 PUBLIC char * scan_robots_txt(char *rob_str, char *name_robot)
283 {
284 char *reg_exp_exclude = NULL;
285 HTList * user_agents = get_all_user_agents(rob_str);
286 /*print_all_user_agents(user_agents);*/
287
288 reg_exp_exclude = get_regular_expression(user_agents, name_robot);
289 delete_all_user_agents(user_agents);
290
291 return reg_exp_exclude;
292 }
293
294 #ifdef ROBOTS_TXT_STANDALONE
295
296 int
main(int argc,char * argv[])297 main(int argc, char *argv[])
298 {
299 char *text;
300 char *reg_exp;
301 char *filename= argc > 1 ? argv[1] : "robots.txt";
302 FILE *fp;
303 struct stat statb;
304 /* make sure the file is a regular text file and open it */
305 if(stat(filename, &statb) == -1 ||
306 (statb.st_mode & S_IFMT ) != S_IFREG ||
307 !(fp = fopen(filename, "r")))
308 {
309 if((statb.st_mode & S_IFMT) == S_IFREG)
310 perror(filename);
311 else
312 HTTRACE(ALL_TRACE, "%s : not a regular file \n" _ filename);
313 return 1;
314 }
315
316 if(!(text = malloc((unsigned)(statb.st_size +1))))
317 {
318 HTTRACE(ALL_TRACE, "Can't alloc enough space for %s" _ filename);
319 fclose(fp);
320 return;
321 }
322 if(!fread(text,sizeof(char), statb.st_size + 1, fp))
323 HTTRACE(APP_TRACE, "Warning: may not have read entire file!\n");
324 text[statb.st_size] = 0; /* be sure to NULL-terminate */
325 fclose(fp);
326 if(argc > 2)
327 {
328 reg_exp = scan_robots_txt(text,argv[2]);
329 if(reg_exp)
330 {
331 HTTRACE(ALL_TRACE, "REG EXP : %s \n" _ reg_exp);
332 free(reg_exp);
333 }
334 }
335 free(text);
336
337 return 0;
338 }
339
340
341 #endif
342