1 /*
2 **	@(#) $Id$
3 **
4 **	W3C Webbot can be found at "http://www.w3.org/Robot/"
5 **
6 **	Copyright © 1995-1998 World Wide Web Consortium, (Massachusetts
7 **	Institute of Technology, Institut National de Recherche en
8 **	Informatique et en Automatique, Keio University). All Rights
9 **	Reserved. This program is distributed under the W3C's Software
10 **	Intellectual Property License. This program is distributed in the hope
11 **	that it will be useful, but WITHOUT ANY WARRANTY; without even the
12 **	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13 **	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14 **	details.
15 **
16 **  Authors:
17 **	JP		John Punin
18 **
19 **  History:
20 **	Oct 1998	Written
21 */
22 
23 #include "HTRobMan.h"
24 #include "RobotTxt.h"
25 
new_user_agent(void)26 PUBLIC UserAgent * new_user_agent (void)
27 {
28     UserAgent * ua;
29     if ((ua = (UserAgent *) HT_CALLOC(1, sizeof(UserAgent))) == NULL)
30         HT_OUTOFMEM("new_user_agent");
31     ua->disallow = HTList_new();
32     return ua;
33 }
34 
get_name_user_agent(UserAgent * ua)35 PUBLIC char * get_name_user_agent (UserAgent * ua)
36 {
37     return ua ? ua->name : NULL;
38 }
39 
get_disallow_user_agent(UserAgent * ua)40 PUBLIC HTList * get_disallow_user_agent (UserAgent * ua)
41 {
42     return ua ? ua->disallow : NULL;
43 }
44 
set_name_user_agent(UserAgent * ua,char * name)45 PUBLIC BOOL set_name_user_agent (UserAgent * ua, char * name)
46 {
47     if (ua && name) {
48 	StrAllocCopy(ua->name, name);
49 	return YES;
50     }
51     return NO;
52 }
53 
add_disallow_user_agent(UserAgent * ua,char * disallow)54 PUBLIC BOOL add_disallow_user_agent (UserAgent * ua, char * disallow)
55 {
56     if (ua && disallow) {
57 	char * da = NULL;
58 	StrAllocCopy(da, disallow);
59 
60 	/* @@@ Should be an association list (HTAssocList) @@@ */
61 	return HTList_addObject(ua->disallow, da);
62     }
63     return NO;
64 }
65 
delete_user_agent(UserAgent * ua)66 PUBLIC BOOL delete_user_agent (UserAgent * ua)
67 {
68     if (ua) {
69 	HT_FREE(ua->name);
70 	if (ua->disallow) {
71 	    HTList *cur = ua->disallow;
72 	    char *pres;
73 	    while ((pres = (char *) HTList_nextObject(cur)))
74 		HT_FREE(pres);
75 	    HTList_delete(ua->disallow);
76 	}
77 	return YES;
78     }
79     return NO;
80 }
81 
delete_all_user_agents(HTList * user_agents)82 PUBLIC BOOL delete_all_user_agents(HTList *user_agents)
83 {
84     if (user_agents) {
85 	HTList *cur = user_agents;
86 	UserAgent *pres;
87 	while ((pres = (UserAgent *) HTList_nextObject(cur)))
88 	    delete_user_agent(pres);
89 	return HTList_delete(user_agents);
90     }
91     return NO;
92 }
93 
get_regular_expression(HTList * user_agents,char * name_robot)94 PUBLIC char * get_regular_expression (HTList * user_agents, char * name_robot)
95 {
96     if (user_agents && name_robot) {
97 	HTChunk *ch = HTChunk_new (1024);
98 	HTList *cur = user_agents;
99 	UserAgent *pres;
100 	UserAgent *ua_gen=NULL;
101 	int found=0;
102 
103 	while ((pres = (UserAgent *) HTList_nextObject(cur))) {
104 	    char *name = get_name_user_agent(pres);
105 
106 	    if(!strcmp(name,"*"))
107 		ua_gen = pres;
108 
109 	    if(!strcmp(name,name_robot)) {
110 		put_string_disallow(ch,pres);
111 		found = 1;
112 	    }
113 	}
114 	if(!found && ua_gen) put_string_disallow(ch,ua_gen);
115 
116 	return (HTChunk_toCString (ch));
117     }
118     return NULL;
119 }
120 
put_string_disallow(HTChunk * ch,UserAgent * ua)121 PUBLIC BOOL put_string_disallow (HTChunk * ch, UserAgent * ua)
122 {
123     if (ch && ua) {
124 	HTList *cur = get_disallow_user_agent(ua);
125 	char *pres;
126 	int first = 1;
127 
128 	while ((pres = (char *) HTList_nextObject(cur))) {
129 	    if(!first)
130 		HTChunk_puts (ch,"|");
131 	    else
132 		first = 0;
133 	    HTChunk_puts (ch,pres);
134 	}
135 	return YES;
136     }
137     return NO;
138 }
139 
print_user_agent(UserAgent * ua)140 PUBLIC void print_user_agent(UserAgent *ua)
141 {
142   HTList *cur = ua->disallow;
143   char *pres;
144   HTTRACE(APP_TRACE, "User Agent : %s \n" _ ua->name);
145   while ((pres = (char*) HTList_nextObject(cur)))
146       HTTRACE(APP_TRACE, "Disallow : %s \n" _ pres);
147 }
148 
print_all_user_agents(HTList * user_agents)149 PUBLIC void print_all_user_agents(HTList * user_agents)
150 {
151     HTList *cur = user_agents;
152     UserAgent *pres;
153     while ((pres = (UserAgent *) HTList_nextObject(cur)))
154     {
155 	HTTRACE(APP_TRACE, "\nNew User Agent\n");
156 	print_user_agent(pres);
157     }
158 }
159 
get_all_user_agents(char * rob_str)160 PUBLIC HTList * get_all_user_agents(char * rob_str)
161 {
162     if (rob_str) {
163 	char * ptr = rob_str;
164 	HTList * user_agents = HTList_new();
165 
166 	/* skip blank spaces */
167 	while(isspace((int)*ptr))
168 	    ptr++;
169 
170 	/* skip comments */
171 	ptr = skip_comments(ptr);
172 
173 	if(!get_user_agents(ptr,user_agents))
174 	    HTTRACE(APP_TRACE, "Something is wrong in robots.txt\n");
175 
176 	return user_agents;
177     }
178     return NULL;
179 }
180 
/*
**  Skip consecutive '#' comment lines together with the white space that
**  follows each of them. Returns a pointer to the first character that is
**  neither part of such a comment nor white space after it; `ptr' itself is
**  returned when it is NULL or does not point at a '#'.
**  The scan never moves past the terminating NUL, so a final comment line
**  without a newline is handled safely.
*/
PUBLIC char * skip_comments (char * ptr)
{
    if (!ptr)
        return ptr;

    while (*ptr == '#') {
        /* Rest of the comment line: stop at the newline or at end of text */
        while (*ptr != '\0' && *ptr != '\n')
            ptr++;
        /* unsigned char: isspace() is undefined for negative values */
        while (isspace((unsigned char) *ptr))
            ptr++;
    }
    return ptr;
}
193 
/*
**  Copy the text at `robot_str' into `name' up to, but not including, the
**  first newline, '#' or end of string, then drop trailing white space
**  (blanks before a comment, or the CR of a CRLF line ending) and terminate
**  `name' with NUL. An empty or NULL input yields an empty name.
**  No length is passed in: `name' must be large enough to hold the rest of
**  the line plus the terminator.
*/
PUBLIC void scan_name_until_eoline (char * robot_str, char * name)
{
    char * ptr = robot_str;
    char * ntr = name;

    if (!name)
        return;

    if (ptr) {
        /* Test for NUL before copying so we never read past the text */
        while (*ptr != '\0' && *ptr != '\n' && *ptr != '#')
            *ntr++ = *ptr++;

        /* unsigned char: isspace() is undefined for negative values */
        while (ntr > name && isspace((unsigned char) ntr[-1]))
            ntr--;
    }
    *ntr = '\0';
}
207 
/*
**  Copy the text at `robot_str' into `name' up to, but not including, the
**  first white space character, '#' or end of string, and terminate `name'
**  with NUL. An empty or NULL input yields an empty name.
**  No length is passed in: `name' must be large enough to hold the token
**  plus the terminator.
*/
PUBLIC void scan_name_until_space (char * robot_str, char * name)
{
    char * ptr = robot_str;
    char * ntr = name;

    if (!name)
        return;

    if (ptr) {
        /*
        ** Test for NUL before copying so we never read past the text.
        ** unsigned char: isspace() is undefined for negative values.
        */
        while (*ptr != '\0' && *ptr != '#' && !isspace((unsigned char) *ptr))
            *ntr++ = *ptr++;
    }
    *ntr = '\0';
}
221 
get_user_agents(char * ptr,HTList * user_agents)222 PUBLIC BOOL get_user_agents(char * ptr, HTList *user_agents)
223 {
224   char *uastr = "user-agent:";
225   char *disstr = "disallow:";
226   int luastr = 10;
227   int ldisstr = 9;
228   char name[2000];
229   int indices[200];
230   int i = 0;
231   if (ptr && !strncasecomp(ptr,uastr,luastr)) {
232       UserAgent *ua = NULL;
233       do {
234 	i=0;
235 	do {
236 	  ua = new_user_agent();
237 	  HTList_appendObject(user_agents,(void *)ua);
238 	  indices[i++] = HTList_indexOf(user_agents, (void *)ua);
239 	  ptr += luastr + 1;
240 	  while(isspace((int)*ptr))
241 	    ptr++;
242 	  scan_name_until_eoline(ptr,name);
243 	  ptr += strlen(name) + 1;
244 	  while(isspace((int)*ptr))
245 	    ptr++;
246 	  ptr = skip_comments(ptr);
247 	  set_name_user_agent(ua,name);
248 	} while(!strncasecomp(ptr,uastr,luastr));
249 
250 	if(!strncasecomp(ptr, disstr,ldisstr))
251 	  {
252 	    do {
253 	      ptr += ldisstr + 1;
254 	      scan_name_until_space(ptr,name);
255 	      ptr += strlen(name) + 1;
256 	      while(isspace((int)*ptr))
257 		ptr++;
258 	      ptr = skip_comments(ptr);
259 	      if(i==1)
260 		add_disallow_user_agent(ua,name);
261 	      else
262 		{
263 		  int j;
264 		  for(j = 0 ; j < i ; j++)
265 		    {
266 		      ua = HTList_objectAt(user_agents, indices[j]);
267 		      add_disallow_user_agent(ua,name);
268 		    }
269 		}
270 	    } while(!strncasecomp(ptr,disstr,ldisstr));
271 	  }
272 	else
273 	  return NO;
274 
275       } while(!strncasecomp(ptr,uastr,luastr));
276       return YES;
277     }
278   else
279     return NO;
280 }
281 
scan_robots_txt(char * rob_str,char * name_robot)282 PUBLIC char * scan_robots_txt(char *rob_str, char *name_robot)
283 {
284   char *reg_exp_exclude = NULL;
285   HTList * user_agents = get_all_user_agents(rob_str);
286   /*print_all_user_agents(user_agents);*/
287 
288   reg_exp_exclude = get_regular_expression(user_agents, name_robot);
289   delete_all_user_agents(user_agents);
290 
291   return reg_exp_exclude;
292 }
293 
294 #ifdef ROBOTS_TXT_STANDALONE
295 
296 int
main(int argc,char * argv[])297 main(int argc, char *argv[])
298 {
299   char *text;
300   char *reg_exp;
301   char *filename= argc > 1 ? argv[1] : "robots.txt";
302   FILE *fp;
303   struct stat statb;
304   /* make sure the file is a regular text file and open it */
305   if(stat(filename, &statb) == -1 ||
306      (statb.st_mode & S_IFMT ) != S_IFREG ||
307      !(fp = fopen(filename, "r")))
308     {
309       if((statb.st_mode & S_IFMT) == S_IFREG)
310 	perror(filename);
311       else
312 	HTTRACE(ALL_TRACE, "%s : not a regular file \n" _ filename);
313       return 1;
314     }
315 
316   if(!(text = malloc((unsigned)(statb.st_size +1))))
317     {
318       HTTRACE(ALL_TRACE, "Can't alloc enough space for %s" _ filename);
319       fclose(fp);
320       return;
321     }
322   if(!fread(text,sizeof(char), statb.st_size + 1, fp))
323     HTTRACE(APP_TRACE, "Warning: may not have read entire file!\n");
324   text[statb.st_size] = 0; /* be sure to NULL-terminate */
325   fclose(fp);
326   if(argc > 2)
327     {
328       reg_exp = scan_robots_txt(text,argv[2]);
329       if(reg_exp)
330 	{
331 	  HTTRACE(ALL_TRACE, "REG EXP : %s \n" _ reg_exp);
332 	  free(reg_exp);
333 	}
334     }
335   free(text);
336 
337   return 0;
338 }
339 
340 
341 #endif
342